12 306 305 306 296 296 2 2 294 255 294 9 10 9 5 7 4 3 3 1 1 10 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 // SPDX-License-Identifier: GPL-2.0-only /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> * (C) 2006-2012 Patrick McHardy <kaber@trash.net> */ #include <linux/types.h> #include <linux/timer.h> #include <linux/module.h> #include <linux/udp.h> #include <linux/seq_file.h> #include <linux/skbuff.h> #include <linux/ipv6.h> #include <net/ip6_checksum.h> #include <net/checksum.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_log.h> #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> static const unsigned int udp_timeouts[UDP_CT_MAX] = { [UDP_CT_UNREPLIED] = 30*HZ, [UDP_CT_REPLIED] = 120*HZ, }; static unsigned int *udp_get_timeouts(struct net *net) { return nf_udp_pernet(net)->timeouts; } static void udp_error_log(const struct sk_buff *skb, const struct nf_hook_state *state, const char *msg) { nf_l4proto_log_invalid(skb, state, IPPROTO_UDP, "%s", msg); } static bool udp_error(struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state) { unsigned int udplen = skb->len - dataoff; const struct udphdr *hdr; struct udphdr _hdr; /* Header is too small? */ hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (!hdr) { udp_error_log(skb, state, "short packet"); return true; } /* Truncated/malformed packets */ if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { udp_error_log(skb, state, "truncated/malformed packet"); return true; } /* Packet with no checksum */ if (!hdr->check) return false; /* Checksum invalid? Ignore. * We skip checking packets on the outgoing path * because the checksum is assumed to be correct. * FIXME: Source route IP option packets --RR */ if (state->hook == NF_INET_PRE_ROUTING && state->net->ct.sysctl_checksum && nf_checksum(skb, state->hook, dataoff, IPPROTO_UDP, state->pf)) { udp_error_log(skb, state, "bad checksum"); return true; } return false; } /* Returns verdict for packet, and may modify conntracktype */ int nf_conntrack_udp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { unsigned int *timeouts; unsigned long status; if (udp_error(skb, dataoff, state)) return -NF_ACCEPT; timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) timeouts = udp_get_timeouts(nf_ct_net(ct)); status = READ_ONCE(ct->status); if ((status & IPS_CONFIRMED) == 0) ct->proto.udp.stream_ts = 2 * HZ + jiffies; /* If we've seen traffic both ways, this is some kind of UDP * stream. Set Assured. */ if (status & IPS_SEEN_REPLY) { unsigned long extra = timeouts[UDP_CT_UNREPLIED]; bool stream = false; /* Still active after two seconds? Extend timeout. */ if (time_after(jiffies, ct->proto.udp.stream_ts)) { extra = timeouts[UDP_CT_REPLIED]; stream = (status & IPS_ASSURED) == 0; } nf_ct_refresh_acct(ct, ctinfo, skb, extra); /* never set ASSURED for IPS_NAT_CLASH, they time out soon */ if (unlikely((status & IPS_NAT_CLASH))) return NF_ACCEPT; /* Also, more likely to be important, and not a probe */ if (stream && !test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); } else { nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_UNREPLIED]); } return NF_ACCEPT; } #ifdef CONFIG_NF_CT_PROTO_UDPLITE static void udplite_error_log(const struct sk_buff *skb, const struct nf_hook_state *state, const char *msg) { nf_l4proto_log_invalid(skb, state, IPPROTO_UDPLITE, "%s", msg); } static bool udplite_error(struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state) { unsigned int udplen = skb->len - dataoff; const struct udphdr *hdr; struct udphdr _hdr; unsigned int cscov; /* Header is too small? */ hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (!hdr) { udplite_error_log(skb, state, "short packet"); return true; } cscov = ntohs(hdr->len); if (cscov == 0) { cscov = udplen; } else if (cscov < sizeof(*hdr) || cscov > udplen) { udplite_error_log(skb, state, "invalid checksum coverage"); return true; } /* UDPLITE mandates checksums */ if (!hdr->check) { udplite_error_log(skb, state, "checksum missing"); return true; } /* Checksum invalid? Ignore. */ if (state->hook == NF_INET_PRE_ROUTING && state->net->ct.sysctl_checksum && nf_checksum_partial(skb, state->hook, dataoff, cscov, IPPROTO_UDP, state->pf)) { udplite_error_log(skb, state, "bad checksum"); return true; } return false; } /* Returns verdict for packet, and may modify conntracktype */ int nf_conntrack_udplite_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { unsigned int *timeouts; if (udplite_error(skb, dataoff, state)) return -NF_ACCEPT; timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) timeouts = udp_get_timeouts(nf_ct_net(ct)); /* If we've seen traffic both ways, this is some kind of UDP stream. Extend timeout. */ if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_REPLIED]); if (unlikely((ct->status & IPS_NAT_CLASH))) return NF_ACCEPT; /* Also, more likely to be important, and not a probe */ if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); } else { nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_UNREPLIED]); } return NF_ACCEPT; } #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> static int udp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeouts = data; struct nf_udp_net *un = nf_udp_pernet(net); if (!timeouts) timeouts = un->timeouts; /* set default timeouts for UDP. */ timeouts[UDP_CT_UNREPLIED] = un->timeouts[UDP_CT_UNREPLIED]; timeouts[UDP_CT_REPLIED] = un->timeouts[UDP_CT_REPLIED]; if (tb[CTA_TIMEOUT_UDP_UNREPLIED]) { timeouts[UDP_CT_UNREPLIED] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDP_UNREPLIED])) * HZ; } if (tb[CTA_TIMEOUT_UDP_REPLIED]) { timeouts[UDP_CT_REPLIED] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDP_REPLIED])) * HZ; } return 0; } static int udp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) { const unsigned int *timeouts = data; if (nla_put_be32(skb, CTA_TIMEOUT_UDP_UNREPLIED, htonl(timeouts[UDP_CT_UNREPLIED] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_UDP_REPLIED, htonl(timeouts[UDP_CT_REPLIED] / HZ))) goto nla_put_failure; return 0; nla_put_failure: return -ENOSPC; } static const struct nla_policy udp_timeout_nla_policy[CTA_TIMEOUT_UDP_MAX+1] = { [CTA_TIMEOUT_UDP_UNREPLIED] = { .type = NLA_U32 }, [CTA_TIMEOUT_UDP_REPLIED] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ void nf_conntrack_udp_init_net(struct net *net) { struct nf_udp_net *un = nf_udp_pernet(net); int i; for (i = 0; i < UDP_CT_MAX; i++) un->timeouts[i] = udp_timeouts[i]; #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) un->offload_timeout = 30 * HZ; #endif } const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp = { .l4proto = IPPROTO_UDP, .allow_clash = true, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, .nla_policy = nf_ct_port_nla_policy, #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = udp_timeout_nlattr_to_obj, .obj_to_nlattr = udp_timeout_obj_to_nlattr, .nlattr_max = CTA_TIMEOUT_UDP_MAX, .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX, .nla_policy = udp_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ }; #ifdef CONFIG_NF_CT_PROTO_UDPLITE const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite = { .l4proto = IPPROTO_UDPLITE, .allow_clash = true, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, .nla_policy = nf_ct_port_nla_policy, #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = udp_timeout_nlattr_to_obj, .obj_to_nlattr = udp_timeout_obj_to_nlattr, .nlattr_max = CTA_TIMEOUT_UDP_MAX, .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX, .nla_policy = udp_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ }; #endif
27 20 20 20 33 33 20 27 7 20 18 18 32 1 3 3 3 2 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 // SPDX-License-Identifier: GPL-2.0-only /* * File: af_phonet.c * * Phonet protocols family * * Copyright (C) 2008 Nokia Corporation. * * Authors: Sakari Ailus <sakari.ailus@nokia.com> * Rémi Denis-Courmont */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> #include <asm/unaligned.h> #include <net/sock.h> #include <linux/if_phonet.h> #include <linux/phonet.h> #include <net/phonet/phonet.h> #include <net/phonet/pn_dev.h> /* Transport protocol registration */ static const struct phonet_protocol *proto_tab[PHONET_NPROTO] __read_mostly; static const struct phonet_protocol *phonet_proto_get(unsigned int protocol) { const struct phonet_protocol *pp; if (protocol >= PHONET_NPROTO) return NULL; rcu_read_lock(); pp = rcu_dereference(proto_tab[protocol]); if (pp && !try_module_get(pp->prot->owner)) pp = NULL; rcu_read_unlock(); return pp; } static inline void phonet_proto_put(const struct phonet_protocol *pp) { module_put(pp->prot->owner); } /* protocol family functions */ static int pn_socket_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; struct pn_sock *pn; const struct phonet_protocol *pnp; int err; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (protocol == 0) { /* Default protocol selection */ switch (sock->type) { case SOCK_DGRAM: protocol = PN_PROTO_PHONET; break; case SOCK_SEQPACKET: protocol = PN_PROTO_PIPE; break; default: return -EPROTONOSUPPORT; } } pnp = phonet_proto_get(protocol); if (pnp == NULL && request_module("net-pf-%d-proto-%d", PF_PHONET, protocol) == 0) pnp = phonet_proto_get(protocol); if (pnp == NULL) return -EPROTONOSUPPORT; if (sock->type != pnp->sock_type) { err = -EPROTONOSUPPORT; goto out; } sk = sk_alloc(net, PF_PHONET, GFP_KERNEL, pnp->prot, kern); if (sk == NULL) { err = -ENOMEM; goto out; } sock_init_data(sock, sk); sock->state = SS_UNCONNECTED; sock->ops = pnp->ops; sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; sk->sk_protocol = protocol; pn = pn_sk(sk); pn->sobject = 0; pn->dobject = 0; pn->resource = 0; sk->sk_prot->init(sk); err = 0; out: phonet_proto_put(pnp); return err; } static const struct net_proto_family phonet_proto_family = { .family = PF_PHONET, .create = pn_socket_create, .owner = THIS_MODULE, }; /* Phonet device header operations */ static int pn_header_create(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) { u8 *media = skb_push(skb, 1); if (type != ETH_P_PHONET) return -1; if (!saddr) saddr = dev->dev_addr; *media = *(const u8 *)saddr; return 1; } static int pn_header_parse(const struct sk_buff *skb, unsigned char *haddr) { const u8 *media = skb_mac_header(skb); *haddr = *media; return 1; } const struct header_ops phonet_header_ops = { .create = pn_header_create, .parse = pn_header_parse, }; EXPORT_SYMBOL(phonet_header_ops); /* * Prepends an ISI header and sends a datagram. */ static int pn_send(struct sk_buff *skb, struct net_device *dev, u16 dst, u16 src, u8 res) { struct phonethdr *ph; int err; if (skb->len + 2 > 0xffff /* Phonet length field limit */ || skb->len + sizeof(struct phonethdr) > dev->mtu) { err = -EMSGSIZE; goto drop; } /* Broadcast sending is not implemented */ if (pn_addr(dst) == PNADDR_BROADCAST) { err = -EOPNOTSUPP; goto drop; } skb_reset_transport_header(skb); WARN_ON(skb_headroom(skb) & 1); /* HW assumes word alignment */ skb_push(skb, sizeof(struct phonethdr)); skb_reset_network_header(skb); ph = pn_hdr(skb); ph->pn_rdev = pn_dev(dst); ph->pn_sdev = pn_dev(src); ph->pn_res = res; ph->pn_length = __cpu_to_be16(skb->len + 2 - sizeof(*ph)); ph->pn_robj = pn_obj(dst); ph->pn_sobj = pn_obj(src); skb->protocol = htons(ETH_P_PHONET); skb->priority = 0; skb->dev = dev; if (skb->pkt_type == PACKET_LOOPBACK) { skb_reset_mac_header(skb); skb_orphan(skb); err = netif_rx(skb) ? -ENOBUFS : 0; } else { err = dev_hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL, skb->len); if (err < 0) { err = -EHOSTUNREACH; goto drop; } err = dev_queue_xmit(skb); if (unlikely(err > 0)) err = net_xmit_errno(err); } return err; drop: kfree_skb(skb); return err; } static int pn_raw_send(const void *data, int len, struct net_device *dev, u16 dst, u16 src, u8 res) { struct sk_buff *skb = alloc_skb(MAX_PHONET_HEADER + len, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; if (phonet_address_lookup(dev_net(dev), pn_addr(dst)) == 0) skb->pkt_type = PACKET_LOOPBACK; skb_reserve(skb, MAX_PHONET_HEADER); __skb_put(skb, len); skb_copy_to_linear_data(skb, data, len); return pn_send(skb, dev, dst, src, res); } /* * Create a Phonet header for the skb and send it out. Returns * non-zero error code if failed. The skb is freed then. */ int pn_skb_send(struct sock *sk, struct sk_buff *skb, const struct sockaddr_pn *target) { struct net *net = sock_net(sk); struct net_device *dev; struct pn_sock *pn = pn_sk(sk); int err; u16 src, dst; u8 daddr, saddr, res; src = pn->sobject; if (target != NULL) { dst = pn_sockaddr_get_object(target); res = pn_sockaddr_get_resource(target); } else { dst = pn->dobject; res = pn->resource; } daddr = pn_addr(dst); err = -EHOSTUNREACH; if (sk->sk_bound_dev_if) dev = dev_get_by_index(net, sk->sk_bound_dev_if); else if (phonet_address_lookup(net, daddr) == 0) { dev = phonet_device_get(net); skb->pkt_type = PACKET_LOOPBACK; } else if (dst == 0) { /* Resource routing (small race until phonet_rcv()) */ struct sock *sk = pn_find_sock_by_res(net, res); if (sk) { sock_put(sk); dev = phonet_device_get(net); skb->pkt_type = PACKET_LOOPBACK; } else dev = phonet_route_output(net, daddr); } else dev = phonet_route_output(net, daddr); if (!dev || !(dev->flags & IFF_UP)) goto drop; saddr = phonet_address_get(dev, daddr); if (saddr == PN_NO_ADDR) goto drop; if (!pn_addr(src)) src = pn_object(saddr, pn_obj(src)); err = pn_send(skb, dev, dst, src, res); dev_put(dev); return err; drop: kfree_skb(skb); dev_put(dev); return err; } EXPORT_SYMBOL(pn_skb_send); /* Do not send an error message in response to an error message */ static inline int can_respond(struct sk_buff *skb) { const struct phonethdr *ph; const struct phonetmsg *pm; u8 submsg_id; if (!pskb_may_pull(skb, 3)) return 0; ph = pn_hdr(skb); if (ph->pn_res == PN_PREFIX && !pskb_may_pull(skb, 5)) return 0; if (ph->pn_res == PN_COMMGR) /* indications */ return 0; ph = pn_hdr(skb); /* re-acquires the pointer */ pm = pn_msg(skb); if (pm->pn_msg_id != PN_COMMON_MESSAGE) return 1; submsg_id = (ph->pn_res == PN_PREFIX) ? pm->pn_e_submsg_id : pm->pn_submsg_id; if (submsg_id != PN_COMM_ISA_ENTITY_NOT_REACHABLE_RESP && pm->pn_e_submsg_id != PN_COMM_SERVICE_NOT_IDENTIFIED_RESP) return 1; return 0; } static int send_obj_unreachable(struct sk_buff *rskb) { const struct phonethdr *oph = pn_hdr(rskb); const struct phonetmsg *opm = pn_msg(rskb); struct phonetmsg resp; memset(&resp, 0, sizeof(resp)); resp.pn_trans_id = opm->pn_trans_id; resp.pn_msg_id = PN_COMMON_MESSAGE; if (oph->pn_res == PN_PREFIX) { resp.pn_e_res_id = opm->pn_e_res_id; resp.pn_e_submsg_id = PN_COMM_ISA_ENTITY_NOT_REACHABLE_RESP; resp.pn_e_orig_msg_id = opm->pn_msg_id; resp.pn_e_status = 0; } else { resp.pn_submsg_id = PN_COMM_ISA_ENTITY_NOT_REACHABLE_RESP; resp.pn_orig_msg_id = opm->pn_msg_id; resp.pn_status = 0; } return pn_raw_send(&resp, sizeof(resp), rskb->dev, pn_object(oph->pn_sdev, oph->pn_sobj), pn_object(oph->pn_rdev, oph->pn_robj), oph->pn_res); } static int send_reset_indications(struct sk_buff *rskb) { struct phonethdr *oph = pn_hdr(rskb); static const u8 data[4] = { 0x00 /* trans ID */, 0x10 /* subscribe msg */, 0x00 /* subscription count */, 0x00 /* dummy */ }; return pn_raw_send(data, sizeof(data), rskb->dev, pn_object(oph->pn_sdev, 0x00), pn_object(oph->pn_rdev, oph->pn_robj), PN_COMMGR); } /* packet type functions */ /* * Stuff received packets to associated sockets. * On error, returns non-zero and releases the skb. */ static int phonet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pkttype, struct net_device *orig_dev) { struct net *net = dev_net(dev); struct phonethdr *ph; struct sockaddr_pn sa; u16 len; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) return NET_RX_DROP; /* check we have at least a full Phonet header */ if (!pskb_pull(skb, sizeof(struct phonethdr))) goto out; /* check that the advertised length is correct */ ph = pn_hdr(skb); len = get_unaligned_be16(&ph->pn_length); if (len < 2) goto out; len -= 2; if ((len > skb->len) || pskb_trim(skb, len)) goto out; skb_reset_transport_header(skb); pn_skb_get_dst_sockaddr(skb, &sa); /* check if this is broadcasted */ if (pn_sockaddr_get_addr(&sa) == PNADDR_BROADCAST) { pn_deliver_sock_broadcast(net, skb); goto out; } /* resource routing */ if (pn_sockaddr_get_object(&sa) == 0) { struct sock *sk = pn_find_sock_by_res(net, sa.spn_resource); if (sk) return sk_receive_skb(sk, skb, 0); } /* check if we are the destination */ if (phonet_address_lookup(net, pn_sockaddr_get_addr(&sa)) == 0) { /* Phonet packet input */ struct sock *sk = pn_find_sock_by_sa(net, &sa); if (sk) return sk_receive_skb(sk, skb, 0); if (can_respond(skb)) { send_obj_unreachable(skb); send_reset_indications(skb); } } else if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) goto out; /* Race between address deletion and loopback */ else { /* Phonet packet routing */ struct net_device *out_dev; out_dev = phonet_route_output(net, pn_sockaddr_get_addr(&sa)); if (!out_dev) { net_dbg_ratelimited("No Phonet route to %02X\n", pn_sockaddr_get_addr(&sa)); goto out; } __skb_push(skb, sizeof(struct phonethdr)); skb->dev = out_dev; if (out_dev == dev) { net_dbg_ratelimited("Phonet loop to %02X on %s\n", pn_sockaddr_get_addr(&sa), dev->name); goto out_dev; } /* Some drivers (e.g. TUN) do not allocate HW header space */ if (skb_cow_head(skb, out_dev->hard_header_len)) goto out_dev; if (dev_hard_header(skb, out_dev, ETH_P_PHONET, NULL, NULL, skb->len) < 0) goto out_dev; dev_queue_xmit(skb); dev_put(out_dev); return NET_RX_SUCCESS; out_dev: dev_put(out_dev); } out: kfree_skb(skb); return NET_RX_DROP; } static struct packet_type phonet_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_PHONET), .func = phonet_rcv, }; static DEFINE_MUTEX(proto_tab_lock); int __init_or_module phonet_proto_register(unsigned int protocol, const struct phonet_protocol *pp) { int err = 0; if (protocol >= PHONET_NPROTO) return -EINVAL; err = proto_register(pp->prot, 1); if (err) return err; mutex_lock(&proto_tab_lock); if (proto_tab[protocol]) err = -EBUSY; else rcu_assign_pointer(proto_tab[protocol], pp); mutex_unlock(&proto_tab_lock); return err; } EXPORT_SYMBOL(phonet_proto_register); void phonet_proto_unregister(unsigned int protocol, const struct phonet_protocol *pp) { mutex_lock(&proto_tab_lock); BUG_ON(proto_tab[protocol] != pp); RCU_INIT_POINTER(proto_tab[protocol], NULL); mutex_unlock(&proto_tab_lock); synchronize_rcu(); proto_unregister(pp->prot); } EXPORT_SYMBOL(phonet_proto_unregister); /* Module registration */ static int __init phonet_init(void) { int err; err = phonet_device_init(); if (err) return err; pn_sock_init(); err = sock_register(&phonet_proto_family); if (err) { printk(KERN_ALERT "phonet protocol family initialization failed\n"); goto err_sock; } dev_add_pack(&phonet_packet_type); phonet_sysctl_init(); err = isi_register(); if (err) goto err; return 0; err: phonet_sysctl_exit(); sock_unregister(PF_PHONET); dev_remove_pack(&phonet_packet_type); err_sock: phonet_device_exit(); return err; } static void __exit phonet_exit(void) { isi_unregister(); phonet_sysctl_exit(); sock_unregister(PF_PHONET); dev_remove_pack(&phonet_packet_type); phonet_device_exit(); } module_init(phonet_init); module_exit(phonet_exit); MODULE_DESCRIPTION("Phonet protocol stack for Linux"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_PHONET);
1427 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 /* * linux/include/linux/console.h * * Copyright (C) 1993 Hamish Macdonald * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of this archive * for more details. * * Changed: * 10-Mar-94: Arno Griffioen: Conversion for vt100 emulator port from PC LINUX */ #ifndef _LINUX_CONSOLE_H_ #define _LINUX_CONSOLE_H_ 1 #include <linux/atomic.h> #include <linux/bits.h> #include <linux/rculist.h> #include <linux/types.h> struct vc_data; struct console_font_op; struct console_font; struct module; struct tty_struct; struct notifier_block; enum con_scroll { SM_UP, SM_DOWN, }; enum vc_intensity; /** * struct consw - callbacks for consoles * * @con_scroll: move lines from @top to @bottom in direction @dir by @lines. * Return true if no generic handling should be done. * Invoked by csi_M and printing to the console. * @con_set_palette: sets the palette of the console to @table (optional) * @con_scrolldelta: the contents of the console should be scrolled by @lines. * Invoked by user. (optional) */ struct consw { struct module *owner; const char *(*con_startup)(void); void (*con_init)(struct vc_data *vc, int init); void (*con_deinit)(struct vc_data *vc); void (*con_clear)(struct vc_data *vc, int sy, int sx, int height, int width); void (*con_putc)(struct vc_data *vc, int c, int ypos, int xpos); void (*con_putcs)(struct vc_data *vc, const unsigned short *s, int count, int ypos, int xpos); void (*con_cursor)(struct vc_data *vc, int mode); bool (*con_scroll)(struct vc_data *vc, unsigned int top, unsigned int bottom, enum con_scroll dir, unsigned int lines); int (*con_switch)(struct vc_data *vc); int (*con_blank)(struct vc_data *vc, int blank, int mode_switch); int (*con_font_set)(struct vc_data *vc, struct console_font *font, unsigned int vpitch, unsigned int flags); int (*con_font_get)(struct vc_data *vc, struct console_font *font, unsigned int vpitch); int (*con_font_default)(struct vc_data *vc, struct console_font *font, char *name); int (*con_resize)(struct vc_data *vc, unsigned int width, unsigned int height, unsigned int user); void (*con_set_palette)(struct vc_data *vc, const unsigned char *table); void (*con_scrolldelta)(struct vc_data *vc, int lines); int (*con_set_origin)(struct vc_data *vc); void (*con_save_screen)(struct vc_data *vc); u8 (*con_build_attr)(struct vc_data *vc, u8 color, enum vc_intensity intensity, bool blink, bool underline, bool reverse, bool italic); void (*con_invert_region)(struct vc_data *vc, u16 *p, int count); u16 *(*con_screen_pos)(const struct vc_data *vc, int offset); unsigned long (*con_getxy)(struct vc_data *vc, unsigned long position, int *px, int *py); /* * Flush the video console driver's scrollback buffer */ void (*con_flush_scrollback)(struct vc_data *vc); /* * Prepare the console for the debugger. This includes, but is not * limited to, unblanking the console, loading an appropriate * palette, and allowing debugger generated output. */ int (*con_debug_enter)(struct vc_data *vc); /* * Restore the console to its pre-debug state as closely as possible. */ int (*con_debug_leave)(struct vc_data *vc); }; extern const struct consw *conswitchp; extern const struct consw dummy_con; /* dummy console buffer */ extern const struct consw vga_con; /* VGA text console */ extern const struct consw newport_con; /* SGI Newport console */ int con_is_bound(const struct consw *csw); int do_unregister_con_driver(const struct consw *csw); int do_take_over_console(const struct consw *sw, int first, int last, int deflt); void give_up_console(const struct consw *sw); #ifdef CONFIG_HW_CONSOLE int con_debug_enter(struct vc_data *vc); int con_debug_leave(void); #else static inline int con_debug_enter(struct vc_data *vc) { return 0; } static inline int con_debug_leave(void) { return 0; } #endif /* cursor */ #define CM_DRAW (1) #define CM_ERASE (2) #define CM_MOVE (3) /* * The interface for a console, or any other device that wants to capture * console messages (printer driver?) */ /** * cons_flags - General console flags * @CON_PRINTBUFFER: Used by newly registered consoles to avoid duplicate * output of messages that were already shown by boot * consoles or read by userspace via syslog() syscall. * @CON_CONSDEV: Indicates that the console driver is backing * /dev/console. * @CON_ENABLED: Indicates if a console is allowed to print records. If * false, the console also will not advance to later * records. * @CON_BOOT: Marks the console driver as early console driver which * is used during boot before the real driver becomes * available. It will be automatically unregistered * when the real console driver is registered unless * "keep_bootcon" parameter is used. * @CON_ANYTIME: A misnomed historical flag which tells the core code * that the legacy @console::write callback can be invoked * on a CPU which is marked OFFLINE. That is misleading as * it suggests that there is no contextual limit for * invoking the callback. The original motivation was * readiness of the per-CPU areas. * @CON_BRL: Indicates a braille device which is exempt from * receiving the printk spam for obvious reasons. * @CON_EXTENDED: The console supports the extended output format of * /dev/kmesg which requires a larger output buffer. * @CON_SUSPENDED: Indicates if a console is suspended. If true, the * printing callbacks must not be called. */ enum cons_flags { CON_PRINTBUFFER = BIT(0), CON_CONSDEV = BIT(1), CON_ENABLED = BIT(2), CON_BOOT = BIT(3), CON_ANYTIME = BIT(4), CON_BRL = BIT(5), CON_EXTENDED = BIT(6), CON_SUSPENDED = BIT(7), }; /** * struct console - The console descriptor structure * @name: The name of the console driver * @write: Write callback to output messages (Optional) * @read: Read callback for console input (Optional) * @device: The underlying TTY device driver (Optional) * @unblank: Callback to unblank the console (Optional) * @setup: Callback for initializing the console (Optional) * @exit: Callback for teardown of the console (Optional) * @match: Callback for matching a console (Optional) * @flags: Console flags. See enum cons_flags * @index: Console index, e.g. port number * @cflag: TTY control mode flags * @ispeed: TTY input speed * @ospeed: TTY output speed * @seq: Sequence number of the next ringbuffer record to print * @dropped: Number of unreported dropped ringbuffer records * @data: Driver private data * @node: hlist node for the console list */ struct console { char name[16]; void (*write)(struct console *co, const char *s, unsigned int count); int (*read)(struct console *co, char *s, unsigned int count); struct tty_driver *(*device)(struct console *co, int *index); void (*unblank)(void); int (*setup)(struct console *co, char *options); int (*exit)(struct console *co); int (*match)(struct console *co, char *name, int idx, char *options); short flags; short index; int cflag; uint ispeed; uint ospeed; u64 seq; unsigned long dropped; void *data; struct hlist_node node; }; #ifdef CONFIG_LOCKDEP extern void lockdep_assert_console_list_lock_held(void); #else static inline void lockdep_assert_console_list_lock_held(void) { } #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC extern bool console_srcu_read_lock_is_held(void); #else static inline bool console_srcu_read_lock_is_held(void) { return 1; } #endif extern int console_srcu_read_lock(void); extern void console_srcu_read_unlock(int cookie); extern void console_list_lock(void) __acquires(console_mutex); extern void console_list_unlock(void) __releases(console_mutex); extern struct hlist_head console_list; /** * console_srcu_read_flags - Locklessly read the console flags * @con: struct console pointer of console to read flags from * * This function provides the necessary READ_ONCE() and data_race() * notation for locklessly reading the console flags. The READ_ONCE() * in this function matches the WRITE_ONCE() when @flags are modified * for registered consoles with console_srcu_write_flags(). * * Only use this function to read console flags when locklessly * iterating the console list via srcu. * * Context: Any context. */ static inline short console_srcu_read_flags(const struct console *con) { WARN_ON_ONCE(!console_srcu_read_lock_is_held()); /* * Locklessly reading console->flags provides a consistent * read value because there is at most one CPU modifying * console->flags and that CPU is using only read-modify-write * operations to do so. */ return data_race(READ_ONCE(con->flags)); } /** * console_srcu_write_flags - Write flags for a registered console * @con: struct console pointer of console to write flags to * @flags: new flags value to write * * Only use this function to write flags for registered consoles. It * requires holding the console_list_lock. * * Context: Any context. */ static inline void console_srcu_write_flags(struct console *con, short flags) { lockdep_assert_console_list_lock_held(); /* This matches the READ_ONCE() in console_srcu_read_flags(). */ WRITE_ONCE(con->flags, flags); } /* Variant of console_is_registered() when the console_list_lock is held. */ static inline bool console_is_registered_locked(const struct console *con) { lockdep_assert_console_list_lock_held(); return !hlist_unhashed(&con->node); } /* * console_is_registered - Check if the console is registered * @con: struct console pointer of console to check * * Context: Process context. May sleep while acquiring console list lock. * Return: true if the console is in the console list, otherwise false. * * If false is returned for a console that was previously registered, it * can be assumed that the console's unregistration is fully completed, * including the exit() callback after console list removal. */ static inline bool console_is_registered(const struct console *con) { bool ret; console_list_lock(); ret = console_is_registered_locked(con); console_list_unlock(); return ret; } /** * for_each_console_srcu() - Iterator over registered consoles * @con: struct console pointer used as loop cursor * * Although SRCU guarantees the console list will be consistent, the * struct console fields may be updated by other CPUs while iterating. * * Requires console_srcu_read_lock to be held. Can be invoked from * any context. */ #define for_each_console_srcu(con) \ hlist_for_each_entry_srcu(con, &console_list, node, \ console_srcu_read_lock_is_held()) /** * for_each_console() - Iterator over registered consoles * @con: struct console pointer used as loop cursor * * The console list and the console->flags are immutable while iterating. * * Requires console_list_lock to be held. */ #define for_each_console(con) \ lockdep_assert_console_list_lock_held(); \ hlist_for_each_entry(con, &console_list, node) extern int console_set_on_cmdline; extern struct console *early_console; enum con_flush_mode { CONSOLE_FLUSH_PENDING, CONSOLE_REPLAY_ALL, }; extern int add_preferred_console(char *name, int idx, char *options); extern void console_force_preferred_locked(struct console *con); extern void register_console(struct console *); extern int unregister_console(struct console *); extern void console_lock(void); extern int console_trylock(void); extern void console_unlock(void); extern void console_conditional_schedule(void); extern void console_unblank(void); extern void console_flush_on_panic(enum con_flush_mode mode); extern struct tty_driver *console_device(int *); extern void console_stop(struct console *); extern void console_start(struct console *); extern int is_console_locked(void); extern int braille_register_console(struct console *, int index, char *console_options, char *braille_options); extern int braille_unregister_console(struct console *); #ifdef CONFIG_TTY extern void console_sysfs_notify(void); #else static inline void console_sysfs_notify(void) { } #endif extern bool console_suspend_enabled; /* Suspend and resume console messages over PM events */ extern void suspend_console(void); extern void resume_console(void); int mda_console_init(void); void vcs_make_sysfs(int index); void vcs_remove_sysfs(int index); /* Some debug stub to catch some of the obvious races in the VT code */ #define WARN_CONSOLE_UNLOCKED() \ WARN_ON(!atomic_read(&ignore_console_lock_warning) && \ !is_console_locked() && !oops_in_progress) /* * Increment ignore_console_lock_warning if you need to quiet * WARN_CONSOLE_UNLOCKED() for debugging purposes. */ extern atomic_t ignore_console_lock_warning; /* VESA Blanking Levels */ #define VESA_NO_BLANKING 0 #define VESA_VSYNC_SUSPEND 1 #define VESA_HSYNC_SUSPEND 2 #define VESA_POWERDOWN 3 extern void console_init(void); /* For deferred console takeover */ void dummycon_register_output_notifier(struct notifier_block *nb); void dummycon_unregister_output_notifier(struct notifier_block *nb); #endif /* _LINUX_CONSOLE_H */
588 588 169 588 588 169 585 586 586 586 581 581 581 581 581 581 586 585 586 586 172 172 583 583 582 46 583 583 586 172 172 583 41 583 582 583 583 583 583 583 41 41 588 583 588 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/readpage.c * * Copyright (C) 2002, Linus Torvalds. * Copyright (C) 2015, Google, Inc. * * This was originally taken from fs/mpage.c * * The ext4_mpage_readpages() function here is intended to * replace mpage_readahead() in the general case, not just for * encrypted files. It has some limitations (see below), where it * will fall back to read_block_full_page(), but these limitations * should only be hit when page_size != block_size. * * This will allow us to attach a callback function to support ext4 * encryption. * * If anything unusual happens, such as: * * - encountering a page which has buffers * - encountering a page which has a non-hole after a hole * - encountering a page with non-contiguous blocks * * then this code just gives up and calls the buffer_head-based read function. * It does handle a page which has holes at the end - that is a common case: * the end-of-file on blocksize < PAGE_SIZE setups. * */ #include <linux/kernel.h> #include <linux/export.h> #include <linux/mm.h> #include <linux/kdev_t.h> #include <linux/gfp.h> #include <linux/bio.h> #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/highmem.h> #include <linux/prefetch.h> #include <linux/mpage.h> #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/pagevec.h> #include "ext4.h" #define NUM_PREALLOC_POST_READ_CTXS 128 static struct kmem_cache *bio_post_read_ctx_cache; static mempool_t *bio_post_read_ctx_pool; /* postprocessing steps for read bios */ enum bio_post_read_step { STEP_INITIAL = 0, STEP_DECRYPT, STEP_VERITY, STEP_MAX, }; struct bio_post_read_ctx { struct bio *bio; struct work_struct work; unsigned int cur_step; unsigned int enabled_steps; }; static void __read_end_io(struct bio *bio) { struct folio_iter fi; bio_for_each_folio_all(fi, bio) { struct folio *folio = fi.folio; if (bio->bi_status) folio_clear_uptodate(folio); else folio_mark_uptodate(folio); folio_unlock(folio); } if (bio->bi_private) mempool_free(bio->bi_private, bio_post_read_ctx_pool); bio_put(bio); } static void bio_post_read_processing(struct bio_post_read_ctx *ctx); static void decrypt_work(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); struct bio *bio = ctx->bio; if (fscrypt_decrypt_bio(bio)) bio_post_read_processing(ctx); else __read_end_io(bio); } static void verity_work(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); struct bio *bio = ctx->bio; /* * fsverity_verify_bio() may call readahead() again, and although verity * will be disabled for that, decryption may still be needed, causing * another bio_post_read_ctx to be allocated. So to guarantee that * mempool_alloc() never deadlocks we must free the current ctx first. * This is safe because verity is the last post-read step. */ BUILD_BUG_ON(STEP_VERITY + 1 != STEP_MAX); mempool_free(ctx, bio_post_read_ctx_pool); bio->bi_private = NULL; fsverity_verify_bio(bio); __read_end_io(bio); } static void bio_post_read_processing(struct bio_post_read_ctx *ctx) { /* * We use different work queues for decryption and for verity because * verity may require reading metadata pages that need decryption, and * we shouldn't recurse to the same workqueue. */ switch (++ctx->cur_step) { case STEP_DECRYPT: if (ctx->enabled_steps & (1 << STEP_DECRYPT)) { INIT_WORK(&ctx->work, decrypt_work); fscrypt_enqueue_decrypt_work(&ctx->work); return; } ctx->cur_step++; fallthrough; case STEP_VERITY: if (ctx->enabled_steps & (1 << STEP_VERITY)) { INIT_WORK(&ctx->work, verity_work); fsverity_enqueue_verify_work(&ctx->work); return; } ctx->cur_step++; fallthrough; default: __read_end_io(ctx->bio); } } static bool bio_post_read_required(struct bio *bio) { return bio->bi_private && !bio->bi_status; } /* * I/O completion handler for multipage BIOs. * * The mpage code never puts partial pages into a BIO (except for end-of-file). * If a page does not map to a contiguous run of blocks then it simply falls * back to block_read_full_folio(). * * Why is this? If a page's completion depends on a number of different BIOs * which can complete in any order (or at the same time) then determining the * status of that page is hard. See end_buffer_async_read() for the details. * There is no point in duplicating all that complexity. */ static void mpage_end_io(struct bio *bio) { if (bio_post_read_required(bio)) { struct bio_post_read_ctx *ctx = bio->bi_private; ctx->cur_step = STEP_INITIAL; bio_post_read_processing(ctx); return; } __read_end_io(bio); } static inline bool ext4_need_verity(const struct inode *inode, pgoff_t idx) { return fsverity_active(inode) && idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); } static void ext4_set_bio_post_read_ctx(struct bio *bio, const struct inode *inode, pgoff_t first_idx) { unsigned int post_read_steps = 0; if (fscrypt_inode_uses_fs_layer_crypto(inode)) post_read_steps |= 1 << STEP_DECRYPT; if (ext4_need_verity(inode, first_idx)) post_read_steps |= 1 << STEP_VERITY; if (post_read_steps) { /* Due to the mempool, this never fails. */ struct bio_post_read_ctx *ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); ctx->bio = bio; ctx->enabled_steps = post_read_steps; bio->bi_private = ctx; } } static inline loff_t ext4_readpage_limit(struct inode *inode) { if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode)) return inode->i_sb->s_maxbytes; return i_size_read(inode); } int ext4_mpage_readpages(struct inode *inode, struct readahead_control *rac, struct folio *folio) { struct bio *bio = NULL; sector_t last_block_in_bio = 0; const unsigned blkbits = inode->i_blkbits; const unsigned blocks_per_page = PAGE_SIZE >> blkbits; const unsigned blocksize = 1 << blkbits; sector_t next_block; sector_t block_in_file; sector_t last_block; sector_t last_block_in_file; sector_t blocks[MAX_BUF_PER_PAGE]; unsigned page_block; struct block_device *bdev = inode->i_sb->s_bdev; int length; unsigned relative_block = 0; struct ext4_map_blocks map; unsigned int nr_pages = rac ? readahead_count(rac) : 1; map.m_pblk = 0; map.m_lblk = 0; map.m_len = 0; map.m_flags = 0; for (; nr_pages; nr_pages--) { int fully_mapped = 1; unsigned first_hole = blocks_per_page; if (rac) folio = readahead_folio(rac); prefetchw(&folio->flags); if (folio_buffers(folio)) goto confused; block_in_file = next_block = (sector_t)folio->index << (PAGE_SHIFT - blkbits); last_block = block_in_file + nr_pages * blocks_per_page; last_block_in_file = (ext4_readpage_limit(inode) + blocksize - 1) >> blkbits; if (last_block > last_block_in_file) last_block = last_block_in_file; page_block = 0; /* * Map blocks using the previous result first. */ if ((map.m_flags & EXT4_MAP_MAPPED) && block_in_file > map.m_lblk && block_in_file < (map.m_lblk + map.m_len)) { unsigned map_offset = block_in_file - map.m_lblk; unsigned last = map.m_len - map_offset; for (relative_block = 0; ; relative_block++) { if (relative_block == last) { /* needed? */ map.m_flags &= ~EXT4_MAP_MAPPED; break; } if (page_block == blocks_per_page) break; blocks[page_block] = map.m_pblk + map_offset + relative_block; page_block++; block_in_file++; } } /* * Then do more ext4_map_blocks() calls until we are * done with this folio. */ while (page_block < blocks_per_page) { if (block_in_file < last_block) { map.m_lblk = block_in_file; map.m_len = last_block - block_in_file; if (ext4_map_blocks(NULL, inode, &map, 0) < 0) { set_error_page: folio_set_error(folio); folio_zero_segment(folio, 0, folio_size(folio)); folio_unlock(folio); goto next_page; } } if ((map.m_flags & EXT4_MAP_MAPPED) == 0) { fully_mapped = 0; if (first_hole == blocks_per_page) first_hole = page_block; page_block++; block_in_file++; continue; } if (first_hole != blocks_per_page) goto confused; /* hole -> non-hole */ /* Contiguous blocks? */ if (page_block && blocks[page_block-1] != map.m_pblk-1) goto confused; for (relative_block = 0; ; relative_block++) { if (relative_block == map.m_len) { /* needed? */ map.m_flags &= ~EXT4_MAP_MAPPED; break; } else if (page_block == blocks_per_page) break; blocks[page_block] = map.m_pblk+relative_block; page_block++; block_in_file++; } } if (first_hole != blocks_per_page) { folio_zero_segment(folio, first_hole << blkbits, folio_size(folio)); if (first_hole == 0) { if (ext4_need_verity(inode, folio->index) && !fsverity_verify_folio(folio)) goto set_error_page; folio_mark_uptodate(folio); folio_unlock(folio); continue; } } else if (fully_mapped) { folio_set_mappedtodisk(folio); } /* * This folio will go to BIO. Do we need to send this * BIO off first? */ if (bio && (last_block_in_bio != blocks[0] - 1 || !fscrypt_mergeable_bio(bio, inode, next_block))) { submit_and_realloc: submit_bio(bio); bio = NULL; } if (bio == NULL) { /* * bio_alloc will _always_ be able to allocate a bio if * __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset(). */ bio = bio_alloc(bdev, bio_max_segs(nr_pages), REQ_OP_READ, GFP_KERNEL); fscrypt_set_bio_crypt_ctx(bio, inode, next_block, GFP_KERNEL); ext4_set_bio_post_read_ctx(bio, inode, folio->index); bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); bio->bi_end_io = mpage_end_io; if (rac) bio->bi_opf |= REQ_RAHEAD; } length = first_hole << blkbits; if (!bio_add_folio(bio, folio, length, 0)) goto submit_and_realloc; if (((map.m_flags & EXT4_MAP_BOUNDARY) && (relative_block == map.m_len)) || (first_hole != blocks_per_page)) { submit_bio(bio); bio = NULL; } else last_block_in_bio = blocks[blocks_per_page - 1]; continue; confused: if (bio) { submit_bio(bio); bio = NULL; } if (!folio_test_uptodate(folio)) block_read_full_folio(folio, ext4_get_block); else folio_unlock(folio); next_page: ; /* A label shall be followed by a statement until C23 */ } if (bio) submit_bio(bio); return 0; } int __init ext4_init_post_read_processing(void) { bio_post_read_ctx_cache = KMEM_CACHE(bio_post_read_ctx, SLAB_RECLAIM_ACCOUNT); if (!bio_post_read_ctx_cache) goto fail; bio_post_read_ctx_pool = mempool_create_slab_pool(NUM_PREALLOC_POST_READ_CTXS, bio_post_read_ctx_cache); if (!bio_post_read_ctx_pool) goto fail_free_cache; return 0; fail_free_cache: kmem_cache_destroy(bio_post_read_ctx_cache); fail: return -ENOMEM; } void ext4_exit_post_read_processing(void) { mempool_destroy(bio_post_read_ctx_pool); kmem_cache_destroy(bio_post_read_ctx_cache); }
308 433 432 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/truncate.h * * Common inline functions needed for truncate support */ /* * Truncate blocks that were not used by write. We have to truncate the * pagecache as well so that corresponding buffers get properly unmapped. */ static inline void ext4_truncate_failed_write(struct inode *inode) { struct address_space *mapping = inode->i_mapping; /* * We don't need to call ext4_break_layouts() because the blocks we * are truncating were never visible to userspace. */ filemap_invalidate_lock(mapping); truncate_inode_pages(mapping, inode->i_size); ext4_truncate(inode); filemap_invalidate_unlock(mapping); } /* * Work out how many blocks we need to proceed with the next chunk of a * truncate transaction. */ static inline unsigned long ext4_blocks_for_truncate(struct inode *inode) { ext4_lblk_t needed; needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); /* Give ourselves just enough room to cope with inodes in which * i_blocks is corrupt: we've seen disk corruptions in the past * which resulted in random data in an inode which looked enough * like a regular file for ext4 to try to delete it. Things * will go a bit crazy if that happens, but at least we should * try not to panic the whole kernel. */ if (needed < 2) needed = 2; /* But we need to bound the transaction so we don't overflow the * journal. */ if (needed > EXT4_MAX_TRANS_DATA) needed = EXT4_MAX_TRANS_DATA; return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; }
167 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_ICMPV6_H #define _LINUX_ICMPV6_H #include <linux/skbuff.h> #include <linux/ipv6.h> #include <uapi/linux/icmpv6.h> static inline struct icmp6hdr *icmp6_hdr(const struct sk_buff *skb) { return (struct icmp6hdr *)skb_transport_header(skb); } #include <linux/netdevice.h> #if IS_ENABLED(CONFIG_IPV6) typedef void ip6_icmp_send_t(struct sk_buff *skb, u8 type, u8 code, __u32 info, const struct in6_addr *force_saddr, const struct inet6_skb_parm *parm); void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, const struct in6_addr *force_saddr, const struct inet6_skb_parm *parm); #if IS_BUILTIN(CONFIG_IPV6) static inline void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, const struct inet6_skb_parm *parm) { icmp6_send(skb, type, code, info, NULL, parm); } static inline int inet6_register_icmp_sender(ip6_icmp_send_t *fn) { BUILD_BUG_ON(fn != icmp6_send); return 0; } static inline int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn) { BUILD_BUG_ON(fn != icmp6_send); return 0; } #else extern void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, const struct inet6_skb_parm *parm); extern int inet6_register_icmp_sender(ip6_icmp_send_t *fn); extern int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn); #endif static inline void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) { __icmpv6_send(skb, type, code, info, IP6CB(skb)); } int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type, unsigned int data_len); #if IS_ENABLED(CONFIG_NF_NAT) void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info); #else static inline void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info) { struct inet6_skb_parm parm = { 0 }; __icmpv6_send(skb_in, type, code, info, &parm); } #endif #else static inline void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) { } static inline void icmpv6_ndo_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) { } #endif extern int icmpv6_init(void); extern int icmpv6_err_convert(u8 type, u8 code, int *err); extern void icmpv6_cleanup(void); extern void icmpv6_param_prob_reason(struct sk_buff *skb, u8 code, int pos, enum skb_drop_reason reason); struct flowi6; struct in6_addr; void icmpv6_flow_init(const struct sock *sk, struct flowi6 *fl6, u8 type, const struct in6_addr *saddr, const struct in6_addr *daddr, int oif); static inline void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos) { icmpv6_param_prob_reason(skb, code, pos, SKB_DROP_REASON_NOT_SPECIFIED); } static inline bool icmpv6_is_err(int type) { switch (type) { case ICMPV6_DEST_UNREACH: case ICMPV6_PKT_TOOBIG: case ICMPV6_TIME_EXCEED: case ICMPV6_PARAMPROB: return true; } return false; } #endif
493 3076 198 17 3043 89 54 115 46 345 1 1951 104 3041 503 2528 2618 2425 159 374 3196 3042 2629 3045 2620 108 2537 2585 111 3047 3046 863 2436 2435 2437 2435 1756 544 2427 2425 1141 1139 32 105 11 11 2 1136 21 88 202 202 1565 88 474 62 472 16 10 3 3 427 167 4 167 16 167 268 6 475 51 48 170 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 /* SPDX-License-Identifier: GPL-2.0 */ /* * Resizable, Scalable, Concurrent Hash Table * * Copyright (c) 2015-2016 Herbert Xu <herbert@gondor.apana.org.au> * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch> * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net> * * Code partially derived from nft_hash * Rewritten with rehash code from br_multicast plus single list * pointer as suggested by Josh Triplett * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ #ifndef _LINUX_RHASHTABLE_H #define _LINUX_RHASHTABLE_H #include <linux/err.h> #include <linux/errno.h> #include <linux/jhash.h> #include <linux/list_nulls.h> #include <linux/workqueue.h> #include <linux/rculist.h> #include <linux/bit_spinlock.h> #include <linux/rhashtable-types.h> /* * Objects in an rhashtable have an embedded struct rhash_head * which is linked into as hash chain from the hash table - or one * of two or more hash tables when the rhashtable is being resized. * The end of the chain is marked with a special nulls marks which has * the least significant bit set but otherwise stores the address of * the hash bucket. This allows us to be sure we've found the end * of the right list. * The value stored in the hash bucket has BIT(0) used as a lock bit. * This bit must be atomically set before any changes are made to * the chain. To avoid dereferencing this pointer without clearing * the bit first, we use an opaque 'struct rhash_lock_head *' for the * pointer stored in the bucket. This struct needs to be defined so * that rcu_dereference() works on it, but it has no content so a * cast is needed for it to be useful. This ensures it isn't * used by mistake with clearing the lock bit first. */ struct rhash_lock_head {}; /* Maximum chain length before rehash * * The maximum (not average) chain length grows with the size of the hash * table, at a rate of (log N)/(log log N). * * The value of 16 is selected so that even if the hash table grew to * 2^32 you would not expect the maximum chain length to exceed it * unless we are under attack (or extremely unlucky). * * As this limit is only to detect attacks, we don't need to set it to a * lower value as you'd need the chain length to vastly exceed 16 to have * any real effect on the system. */ #define RHT_ELASTICITY 16u /** * struct bucket_table - Table of hash buckets * @size: Number of hash buckets * @nest: Number of bits of first-level nested table. * @rehash: Current bucket being rehashed * @hash_rnd: Random seed to fold into hash * @walkers: List of active walkers * @rcu: RCU structure for freeing the table * @future_tbl: Table under construction during rehashing * @ntbl: Nested table used when out of memory. * @buckets: size * hash buckets */ struct bucket_table { unsigned int size; unsigned int nest; u32 hash_rnd; struct list_head walkers; struct rcu_head rcu; struct bucket_table __rcu *future_tbl; struct lockdep_map dep_map; struct rhash_lock_head __rcu *buckets[] ____cacheline_aligned_in_smp; }; /* * NULLS_MARKER() expects a hash value with the low * bits mostly likely to be significant, and it discards * the msb. * We give it an address, in which the bottom bit is * always 0, and the msb might be significant. * So we shift the address down one bit to align with * expectations and avoid losing a significant bit. * * We never store the NULLS_MARKER in the hash table * itself as we need the lsb for locking. * Instead we store a NULL */ #define RHT_NULLS_MARKER(ptr) \ ((void *)NULLS_MARKER(((unsigned long) (ptr)) >> 1)) #define INIT_RHT_NULLS_HEAD(ptr) \ ((ptr) = NULL) static inline bool rht_is_a_nulls(const struct rhash_head *ptr) { return ((unsigned long) ptr & 1); } static inline void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he) { return (char *)he - ht->p.head_offset; } static inline unsigned int rht_bucket_index(const struct bucket_table *tbl, unsigned int hash) { return hash & (tbl->size - 1); } static inline unsigned int rht_key_get_hash(struct rhashtable *ht, const void *key, const struct rhashtable_params params, unsigned int hash_rnd) { unsigned int hash; /* params must be equal to ht->p if it isn't constant. */ if (!__builtin_constant_p(params.key_len)) hash = ht->p.hashfn(key, ht->key_len, hash_rnd); else if (params.key_len) { unsigned int key_len = params.key_len; if (params.hashfn) hash = params.hashfn(key, key_len, hash_rnd); else if (key_len & (sizeof(u32) - 1)) hash = jhash(key, key_len, hash_rnd); else hash = jhash2(key, key_len / sizeof(u32), hash_rnd); } else { unsigned int key_len = ht->p.key_len; if (params.hashfn) hash = params.hashfn(key, key_len, hash_rnd); else hash = jhash(key, key_len, hash_rnd); } return hash; } static inline unsigned int rht_key_hashfn( struct rhashtable *ht, const struct bucket_table *tbl, const void *key, const struct rhashtable_params params) { unsigned int hash = rht_key_get_hash(ht, key, params, tbl->hash_rnd); return rht_bucket_index(tbl, hash); } static inline unsigned int rht_head_hashfn( struct rhashtable *ht, const struct bucket_table *tbl, const struct rhash_head *he, const struct rhashtable_params params) { const char *ptr = rht_obj(ht, he); return likely(params.obj_hashfn) ? rht_bucket_index(tbl, params.obj_hashfn(ptr, params.key_len ?: ht->p.key_len, tbl->hash_rnd)) : rht_key_hashfn(ht, tbl, ptr + params.key_offset, params); } /** * rht_grow_above_75 - returns true if nelems > 0.75 * table-size * @ht: hash table * @tbl: current table */ static inline bool rht_grow_above_75(const struct rhashtable *ht, const struct bucket_table *tbl) { /* Expand table when exceeding 75% load */ return atomic_read(&ht->nelems) > (tbl->size / 4 * 3) && (!ht->p.max_size || tbl->size < ht->p.max_size); } /** * rht_shrink_below_30 - returns true if nelems < 0.3 * table-size * @ht: hash table * @tbl: current table */ static inline bool rht_shrink_below_30(const struct rhashtable *ht, const struct bucket_table *tbl) { /* Shrink table beneath 30% load */ return atomic_read(&ht->nelems) < (tbl->size * 3 / 10) && tbl->size > ht->p.min_size; } /** * rht_grow_above_100 - returns true if nelems > table-size * @ht: hash table * @tbl: current table */ static inline bool rht_grow_above_100(const struct rhashtable *ht, const struct bucket_table *tbl) { return atomic_read(&ht->nelems) > tbl->size && (!ht->p.max_size || tbl->size < ht->p.max_size); } /** * rht_grow_above_max - returns true if table is above maximum * @ht: hash table * @tbl: current table */ static inline bool rht_grow_above_max(const struct rhashtable *ht, const struct bucket_table *tbl) { return atomic_read(&ht->nelems) >= ht->max_elems; } #ifdef CONFIG_PROVE_LOCKING int lockdep_rht_mutex_is_held(struct rhashtable *ht); int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash); #else static inline int lockdep_rht_mutex_is_held(struct rhashtable *ht) { return 1; } static inline int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash) { return 1; } #endif /* CONFIG_PROVE_LOCKING */ void *rhashtable_insert_slow(struct rhashtable *ht, const void *key, struct rhash_head *obj); void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter); void rhashtable_walk_exit(struct rhashtable_iter *iter); int rhashtable_walk_start_check(struct rhashtable_iter *iter) __acquires(RCU); static inline void rhashtable_walk_start(struct rhashtable_iter *iter) { (void)rhashtable_walk_start_check(iter); } void *rhashtable_walk_next(struct rhashtable_iter *iter); void *rhashtable_walk_peek(struct rhashtable_iter *iter); void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU); void rhashtable_free_and_destroy(struct rhashtable *ht, void (*free_fn)(void *ptr, void *arg), void *arg); void rhashtable_destroy(struct rhashtable *ht); struct rhash_lock_head __rcu **rht_bucket_nested( const struct bucket_table *tbl, unsigned int hash); struct rhash_lock_head __rcu **__rht_bucket_nested( const struct bucket_table *tbl, unsigned int hash); struct rhash_lock_head __rcu **rht_bucket_nested_insert( struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash); #define rht_dereference(p, ht) \ rcu_dereference_protected(p, lockdep_rht_mutex_is_held(ht)) #define rht_dereference_rcu(p, ht) \ rcu_dereference_check(p, lockdep_rht_mutex_is_held(ht)) #define rht_dereference_bucket(p, tbl, hash) \ rcu_dereference_protected(p, lockdep_rht_bucket_is_held(tbl, hash)) #define rht_dereference_bucket_rcu(p, tbl, hash) \ rcu_dereference_check(p, lockdep_rht_bucket_is_held(tbl, hash)) #define rht_entry(tpos, pos, member) \ ({ tpos = container_of(pos, typeof(*tpos), member); 1; }) static inline struct rhash_lock_head __rcu *const *rht_bucket( const struct bucket_table *tbl, unsigned int hash) { return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) : &tbl->buckets[hash]; } static inline struct rhash_lock_head __rcu **rht_bucket_var( struct bucket_table *tbl, unsigned int hash) { return unlikely(tbl->nest) ? __rht_bucket_nested(tbl, hash) : &tbl->buckets[hash]; } static inline struct rhash_lock_head __rcu **rht_bucket_insert( struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash) { return unlikely(tbl->nest) ? rht_bucket_nested_insert(ht, tbl, hash) : &tbl->buckets[hash]; } /* * We lock a bucket by setting BIT(0) in the pointer - this is always * zero in real pointers. The NULLS mark is never stored in the bucket, * rather we store NULL if the bucket is empty. * bit_spin_locks do not handle contention well, but the whole point * of the hashtable design is to achieve minimum per-bucket contention. * A nested hash table might not have a bucket pointer. In that case * we cannot get a lock. For remove and replace the bucket cannot be * interesting and doesn't need locking. * For insert we allocate the bucket if this is the last bucket_table, * and then take the lock. * Sometimes we unlock a bucket by writing a new pointer there. In that * case we don't need to unlock, but we do need to reset state such as * local_bh. For that we have rht_assign_unlock(). As rcu_assign_pointer() * provides the same release semantics that bit_spin_unlock() provides, * this is safe. * When we write to a bucket without unlocking, we use rht_assign_locked(). */ static inline unsigned long rht_lock(struct bucket_table *tbl, struct rhash_lock_head __rcu **bkt) { unsigned long flags; local_irq_save(flags); bit_spin_lock(0, (unsigned long *)bkt); lock_map_acquire(&tbl->dep_map); return flags; } static inline unsigned long rht_lock_nested(struct bucket_table *tbl, struct rhash_lock_head __rcu **bucket, unsigned int subclass) { unsigned long flags; local_irq_save(flags); bit_spin_lock(0, (unsigned long *)bucket); lock_acquire_exclusive(&tbl->dep_map, subclass, 0, NULL, _THIS_IP_); return flags; } static inline void rht_unlock(struct bucket_table *tbl, struct rhash_lock_head __rcu **bkt, unsigned long flags) { lock_map_release(&tbl->dep_map); bit_spin_unlock(0, (unsigned long *)bkt); local_irq_restore(flags); } static inline struct rhash_head *__rht_ptr( struct rhash_lock_head *p, struct rhash_lock_head __rcu *const *bkt) { return (struct rhash_head *) ((unsigned long)p & ~BIT(0) ?: (unsigned long)RHT_NULLS_MARKER(bkt)); } /* * Where 'bkt' is a bucket and might be locked: * rht_ptr_rcu() dereferences that pointer and clears the lock bit. * rht_ptr() dereferences in a context where the bucket is locked. * rht_ptr_exclusive() dereferences in a context where exclusive * access is guaranteed, such as when destroying the table. */ static inline struct rhash_head *rht_ptr_rcu( struct rhash_lock_head __rcu *const *bkt) { return __rht_ptr(rcu_dereference(*bkt), bkt); } static inline struct rhash_head *rht_ptr( struct rhash_lock_head __rcu *const *bkt, struct bucket_table *tbl, unsigned int hash) { return __rht_ptr(rht_dereference_bucket(*bkt, tbl, hash), bkt); } static inline struct rhash_head *rht_ptr_exclusive( struct rhash_lock_head __rcu *const *bkt) { return __rht_ptr(rcu_dereference_protected(*bkt, 1), bkt); } static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt, struct rhash_head *obj) { if (rht_is_a_nulls(obj)) obj = NULL; rcu_assign_pointer(*bkt, (void *)((unsigned long)obj | BIT(0))); } static inline void rht_assign_unlock(struct bucket_table *tbl, struct rhash_lock_head __rcu **bkt, struct rhash_head *obj, unsigned long flags) { if (rht_is_a_nulls(obj)) obj = NULL; lock_map_release(&tbl->dep_map); rcu_assign_pointer(*bkt, (void *)obj); preempt_enable(); __release(bitlock); local_irq_restore(flags); } /** * rht_for_each_from - iterate over hash chain from given head * @pos: the &struct rhash_head to use as a loop cursor. * @head: the &struct rhash_head to start from * @tbl: the &struct bucket_table * @hash: the hash value / bucket index */ #define rht_for_each_from(pos, head, tbl, hash) \ for (pos = head; \ !rht_is_a_nulls(pos); \ pos = rht_dereference_bucket((pos)->next, tbl, hash)) /** * rht_for_each - iterate over hash chain * @pos: the &struct rhash_head to use as a loop cursor. * @tbl: the &struct bucket_table * @hash: the hash value / bucket index */ #define rht_for_each(pos, tbl, hash) \ rht_for_each_from(pos, rht_ptr(rht_bucket(tbl, hash), tbl, hash), \ tbl, hash) /** * rht_for_each_entry_from - iterate over hash chain from given head * @tpos: the type * to use as a loop cursor. * @pos: the &struct rhash_head to use as a loop cursor. * @head: the &struct rhash_head to start from * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @member: name of the &struct rhash_head within the hashable struct. */ #define rht_for_each_entry_from(tpos, pos, head, tbl, hash, member) \ for (pos = head; \ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ pos = rht_dereference_bucket((pos)->next, tbl, hash)) /** * rht_for_each_entry - iterate over hash chain of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct rhash_head to use as a loop cursor. * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @member: name of the &struct rhash_head within the hashable struct. */ #define rht_for_each_entry(tpos, pos, tbl, hash, member) \ rht_for_each_entry_from(tpos, pos, \ rht_ptr(rht_bucket(tbl, hash), tbl, hash), \ tbl, hash, member) /** * rht_for_each_entry_safe - safely iterate over hash chain of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct rhash_head to use as a loop cursor. * @next: the &struct rhash_head to use as next in loop cursor. * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @member: name of the &struct rhash_head within the hashable struct. * * This hash chain list-traversal primitive allows for the looped code to * remove the loop cursor from the list. */ #define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member) \ for (pos = rht_ptr(rht_bucket(tbl, hash), tbl, hash), \ next = !rht_is_a_nulls(pos) ? \ rht_dereference_bucket(pos->next, tbl, hash) : NULL; \ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ pos = next, \ next = !rht_is_a_nulls(pos) ? \ rht_dereference_bucket(pos->next, tbl, hash) : NULL) /** * rht_for_each_rcu_from - iterate over rcu hash chain from given head * @pos: the &struct rhash_head to use as a loop cursor. * @head: the &struct rhash_head to start from * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * * This hash chain list-traversal primitive may safely run concurrently with * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ #define rht_for_each_rcu_from(pos, head, tbl, hash) \ for (({barrier(); }), \ pos = head; \ !rht_is_a_nulls(pos); \ pos = rcu_dereference_raw(pos->next)) /** * rht_for_each_rcu - iterate over rcu hash chain * @pos: the &struct rhash_head to use as a loop cursor. * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * * This hash chain list-traversal primitive may safely run concurrently with * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ #define rht_for_each_rcu(pos, tbl, hash) \ for (({barrier(); }), \ pos = rht_ptr_rcu(rht_bucket(tbl, hash)); \ !rht_is_a_nulls(pos); \ pos = rcu_dereference_raw(pos->next)) /** * rht_for_each_entry_rcu_from - iterated over rcu hash chain from given head * @tpos: the type * to use as a loop cursor. * @pos: the &struct rhash_head to use as a loop cursor. * @head: the &struct rhash_head to start from * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @member: name of the &struct rhash_head within the hashable struct. * * This hash chain list-traversal primitive may safely run concurrently with * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ #define rht_for_each_entry_rcu_from(tpos, pos, head, tbl, hash, member) \ for (({barrier(); }), \ pos = head; \ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ pos = rht_dereference_bucket_rcu(pos->next, tbl, hash)) /** * rht_for_each_entry_rcu - iterate over rcu hash chain of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct rhash_head to use as a loop cursor. * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @member: name of the &struct rhash_head within the hashable struct. * * This hash chain list-traversal primitive may safely run concurrently with * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ #define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \ rht_for_each_entry_rcu_from(tpos, pos, \ rht_ptr_rcu(rht_bucket(tbl, hash)), \ tbl, hash, member) /** * rhl_for_each_rcu - iterate over rcu hash table list * @pos: the &struct rlist_head to use as a loop cursor. * @list: the head of the list * * This hash chain list-traversal primitive should be used on the * list returned by rhltable_lookup. */ #define rhl_for_each_rcu(pos, list) \ for (pos = list; pos; pos = rcu_dereference_raw(pos->next)) /** * rhl_for_each_entry_rcu - iterate over rcu hash table list of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct rlist_head to use as a loop cursor. * @list: the head of the list * @member: name of the &struct rlist_head within the hashable struct. * * This hash chain list-traversal primitive should be used on the * list returned by rhltable_lookup. */ #define rhl_for_each_entry_rcu(tpos, pos, list, member) \ for (pos = list; pos && rht_entry(tpos, pos, member); \ pos = rcu_dereference_raw(pos->next)) static inline int rhashtable_compare(struct rhashtable_compare_arg *arg, const void *obj) { struct rhashtable *ht = arg->ht; const char *ptr = obj; return memcmp(ptr + ht->p.key_offset, arg->key, ht->p.key_len); } /* Internal function, do not use. */ static inline struct rhash_head *__rhashtable_lookup( struct rhashtable *ht, const void *key, const struct rhashtable_params params) { struct rhashtable_compare_arg arg = { .ht = ht, .key = key, }; struct rhash_lock_head __rcu *const *bkt; struct bucket_table *tbl; struct rhash_head *he; unsigned int hash; tbl = rht_dereference_rcu(ht->tbl, ht); restart: hash = rht_key_hashfn(ht, tbl, key, params); bkt = rht_bucket(tbl, hash); do { rht_for_each_rcu_from(he, rht_ptr_rcu(bkt), tbl, hash) { if (params.obj_cmpfn ? params.obj_cmpfn(&arg, rht_obj(ht, he)) : rhashtable_compare(&arg, rht_obj(ht, he))) continue; return he; } /* An object might have been moved to a different hash chain, * while we walk along it - better check and retry. */ } while (he != RHT_NULLS_MARKER(bkt)); /* Ensure we see any new tables. */ smp_rmb(); tbl = rht_dereference_rcu(tbl->future_tbl, ht); if (unlikely(tbl)) goto restart; return NULL; } /** * rhashtable_lookup - search hash table * @ht: hash table * @key: the pointer to the key * @params: hash table parameters * * Computes the hash value for the key and traverses the bucket chain looking * for a entry with an identical key. The first matching entry is returned. * * This must only be called under the RCU read lock. * * Returns the first entry on which the compare function returned true. */ static inline void *rhashtable_lookup( struct rhashtable *ht, const void *key, const struct rhashtable_params params) { struct rhash_head *he = __rhashtable_lookup(ht, key, params); return he ? rht_obj(ht, he) : NULL; } /** * rhashtable_lookup_fast - search hash table, without RCU read lock * @ht: hash table * @key: the pointer to the key * @params: hash table parameters * * Computes the hash value for the key and traverses the bucket chain looking * for a entry with an identical key. The first matching entry is returned. * * Only use this function when you have other mechanisms guaranteeing * that the object won't go away after the RCU read lock is released. * * Returns the first entry on which the compare function returned true. */ static inline void *rhashtable_lookup_fast( struct rhashtable *ht, const void *key, const struct rhashtable_params params) { void *obj; rcu_read_lock(); obj = rhashtable_lookup(ht, key, params); rcu_read_unlock(); return obj; } /** * rhltable_lookup - search hash list table * @hlt: hash table * @key: the pointer to the key * @params: hash table parameters * * Computes the hash value for the key and traverses the bucket chain looking * for a entry with an identical key. All matching entries are returned * in a list. * * This must only be called under the RCU read lock. * * Returns the list of entries that match the given key. */ static inline struct rhlist_head *rhltable_lookup( struct rhltable *hlt, const void *key, const struct rhashtable_params params) { struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params); return he ? container_of(he, struct rhlist_head, rhead) : NULL; } /* Internal function, please use rhashtable_insert_fast() instead. This * function returns the existing element already in hashes in there is a clash, * otherwise it returns an error via ERR_PTR(). */ static inline void *__rhashtable_insert_fast( struct rhashtable *ht, const void *key, struct rhash_head *obj, const struct rhashtable_params params, bool rhlist) { struct rhashtable_compare_arg arg = { .ht = ht, .key = key, }; struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct bucket_table *tbl; struct rhash_head *head; unsigned long flags; unsigned int hash; int elasticity; void *data; rcu_read_lock(); tbl = rht_dereference_rcu(ht->tbl, ht); hash = rht_head_hashfn(ht, tbl, obj, params); elasticity = RHT_ELASTICITY; bkt = rht_bucket_insert(ht, tbl, hash); data = ERR_PTR(-ENOMEM); if (!bkt) goto out; pprev = NULL; flags = rht_lock(tbl, bkt); if (unlikely(rcu_access_pointer(tbl->future_tbl))) { slow_path: rht_unlock(tbl, bkt, flags); rcu_read_unlock(); return rhashtable_insert_slow(ht, key, obj); } rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) { struct rhlist_head *plist; struct rhlist_head *list; elasticity--; if (!key || (params.obj_cmpfn ? params.obj_cmpfn(&arg, rht_obj(ht, head)) : rhashtable_compare(&arg, rht_obj(ht, head)))) { pprev = &head->next; continue; } data = rht_obj(ht, head); if (!rhlist) goto out_unlock; list = container_of(obj, struct rhlist_head, rhead); plist = container_of(head, struct rhlist_head, rhead); RCU_INIT_POINTER(list->next, plist); head = rht_dereference_bucket(head->next, tbl, hash); RCU_INIT_POINTER(list->rhead.next, head); if (pprev) { rcu_assign_pointer(*pprev, obj); rht_unlock(tbl, bkt, flags); } else rht_assign_unlock(tbl, bkt, obj, flags); data = NULL; goto out; } if (elasticity <= 0) goto slow_path; data = ERR_PTR(-E2BIG); if (unlikely(rht_grow_above_max(ht, tbl))) goto out_unlock; if (unlikely(rht_grow_above_100(ht, tbl))) goto slow_path; /* Inserting at head of list makes unlocking free. */ head = rht_ptr(bkt, tbl, hash); RCU_INIT_POINTER(obj->next, head); if (rhlist) { struct rhlist_head *list; list = container_of(obj, struct rhlist_head, rhead); RCU_INIT_POINTER(list->next, NULL); } atomic_inc(&ht->nelems); rht_assign_unlock(tbl, bkt, obj, flags); if (rht_grow_above_75(ht, tbl)) schedule_work(&ht->run_work); data = NULL; out: rcu_read_unlock(); return data; out_unlock: rht_unlock(tbl, bkt, flags); goto out; } /** * rhashtable_insert_fast - insert object into hash table * @ht: hash table * @obj: pointer to hash head inside object * @params: hash table parameters * * Will take the per bucket bitlock to protect against mutual mutations * on the same bucket. Multiple insertions may occur in parallel unless * they map to the same bucket. * * It is safe to call this function from atomic context. * * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. */ static inline int rhashtable_insert_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { void *ret; ret = __rhashtable_insert_fast(ht, NULL, obj, params, false); if (IS_ERR(ret)) return PTR_ERR(ret); return ret == NULL ? 0 : -EEXIST; } /** * rhltable_insert_key - insert object into hash list table * @hlt: hash list table * @key: the pointer to the key * @list: pointer to hash list head inside object * @params: hash table parameters * * Will take the per bucket bitlock to protect against mutual mutations * on the same bucket. Multiple insertions may occur in parallel unless * they map to the same bucket. * * It is safe to call this function from atomic context. * * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. */ static inline int rhltable_insert_key( struct rhltable *hlt, const void *key, struct rhlist_head *list, const struct rhashtable_params params) { return PTR_ERR(__rhashtable_insert_fast(&hlt->ht, key, &list->rhead, params, true)); } /** * rhltable_insert - insert object into hash list table * @hlt: hash list table * @list: pointer to hash list head inside object * @params: hash table parameters * * Will take the per bucket bitlock to protect against mutual mutations * on the same bucket. Multiple insertions may occur in parallel unless * they map to the same bucket. * * It is safe to call this function from atomic context. * * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. */ static inline int rhltable_insert( struct rhltable *hlt, struct rhlist_head *list, const struct rhashtable_params params) { const char *key = rht_obj(&hlt->ht, &list->rhead); key += params.key_offset; return rhltable_insert_key(hlt, key, list, params); } /** * rhashtable_lookup_insert_fast - lookup and insert object into hash table * @ht: hash table * @obj: pointer to hash head inside object * @params: hash table parameters * * This lookup function may only be used for fixed key hash table (key_len * parameter set). It will BUG() if used inappropriately. * * It is safe to call this function from atomic context. * * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. */ static inline int rhashtable_lookup_insert_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { const char *key = rht_obj(ht, obj); void *ret; BUG_ON(ht->p.obj_hashfn); ret = __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params, false); if (IS_ERR(ret)) return PTR_ERR(ret); return ret == NULL ? 0 : -EEXIST; } /** * rhashtable_lookup_get_insert_fast - lookup and insert object into hash table * @ht: hash table * @obj: pointer to hash head inside object * @params: hash table parameters * * Just like rhashtable_lookup_insert_fast(), but this function returns the * object if it exists, NULL if it did not and the insertion was successful, * and an ERR_PTR otherwise. */ static inline void *rhashtable_lookup_get_insert_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { const char *key = rht_obj(ht, obj); BUG_ON(ht->p.obj_hashfn); return __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params, false); } /** * rhashtable_lookup_insert_key - search and insert object to hash table * with explicit key * @ht: hash table * @key: key * @obj: pointer to hash head inside object * @params: hash table parameters * * Lookups may occur in parallel with hashtable mutations and resizing. * * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. * * Returns zero on success. */ static inline int rhashtable_lookup_insert_key( struct rhashtable *ht, const void *key, struct rhash_head *obj, const struct rhashtable_params params) { void *ret; BUG_ON(!ht->p.obj_hashfn || !key); ret = __rhashtable_insert_fast(ht, key, obj, params, false); if (IS_ERR(ret)) return PTR_ERR(ret); return ret == NULL ? 0 : -EEXIST; } /** * rhashtable_lookup_get_insert_key - lookup and insert object into hash table * @ht: hash table * @key: key * @obj: pointer to hash head inside object * @params: hash table parameters * * Just like rhashtable_lookup_insert_key(), but this function returns the * object if it exists, NULL if it does not and the insertion was successful, * and an ERR_PTR otherwise. */ static inline void *rhashtable_lookup_get_insert_key( struct rhashtable *ht, const void *key, struct rhash_head *obj, const struct rhashtable_params params) { BUG_ON(!ht->p.obj_hashfn || !key); return __rhashtable_insert_fast(ht, key, obj, params, false); } /* Internal function, please use rhashtable_remove_fast() instead */ static inline int __rhashtable_remove_fast_one( struct rhashtable *ht, struct bucket_table *tbl, struct rhash_head *obj, const struct rhashtable_params params, bool rhlist) { struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct rhash_head *he; unsigned long flags; unsigned int hash; int err = -ENOENT; hash = rht_head_hashfn(ht, tbl, obj, params); bkt = rht_bucket_var(tbl, hash); if (!bkt) return -ENOENT; pprev = NULL; flags = rht_lock(tbl, bkt); rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) { struct rhlist_head *list; list = container_of(he, struct rhlist_head, rhead); if (he != obj) { struct rhlist_head __rcu **lpprev; pprev = &he->next; if (!rhlist) continue; do { lpprev = &list->next; list = rht_dereference_bucket(list->next, tbl, hash); } while (list && obj != &list->rhead); if (!list) continue; list = rht_dereference_bucket(list->next, tbl, hash); RCU_INIT_POINTER(*lpprev, list); err = 0; break; } obj = rht_dereference_bucket(obj->next, tbl, hash); err = 1; if (rhlist) { list = rht_dereference_bucket(list->next, tbl, hash); if (list) { RCU_INIT_POINTER(list->rhead.next, obj); obj = &list->rhead; err = 0; } } if (pprev) { rcu_assign_pointer(*pprev, obj); rht_unlock(tbl, bkt, flags); } else { rht_assign_unlock(tbl, bkt, obj, flags); } goto unlocked; } rht_unlock(tbl, bkt, flags); unlocked: if (err > 0) { atomic_dec(&ht->nelems); if (unlikely(ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl))) schedule_work(&ht->run_work); err = 0; } return err; } /* Internal function, please use rhashtable_remove_fast() instead */ static inline int __rhashtable_remove_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params, bool rhlist) { struct bucket_table *tbl; int err; rcu_read_lock(); tbl = rht_dereference_rcu(ht->tbl, ht); /* Because we have already taken (and released) the bucket * lock in old_tbl, if we find that future_tbl is not yet * visible then that guarantees the entry to still be in * the old tbl if it exists. */ while ((err = __rhashtable_remove_fast_one(ht, tbl, obj, params, rhlist)) && (tbl = rht_dereference_rcu(tbl->future_tbl, ht))) ; rcu_read_unlock(); return err; } /** * rhashtable_remove_fast - remove object from hash table * @ht: hash table * @obj: pointer to hash head inside object * @params: hash table parameters * * Since the hash chain is single linked, the removal operation needs to * walk the bucket chain upon removal. The removal operation is thus * considerable slow if the hash table is not correctly sized. * * Will automatically shrink the table if permitted when residency drops * below 30%. * * Returns zero on success, -ENOENT if the entry could not be found. */ static inline int rhashtable_remove_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { return __rhashtable_remove_fast(ht, obj, params, false); } /** * rhltable_remove - remove object from hash list table * @hlt: hash list table * @list: pointer to hash list head inside object * @params: hash table parameters * * Since the hash chain is single linked, the removal operation needs to * walk the bucket chain upon removal. The removal operation is thus * considerable slow if the hash table is not correctly sized. * * Will automatically shrink the table if permitted when residency drops * below 30% * * Returns zero on success, -ENOENT if the entry could not be found. */ static inline int rhltable_remove( struct rhltable *hlt, struct rhlist_head *list, const struct rhashtable_params params) { return __rhashtable_remove_fast(&hlt->ht, &list->rhead, params, true); } /* Internal function, please use rhashtable_replace_fast() instead */ static inline int __rhashtable_replace_fast( struct rhashtable *ht, struct bucket_table *tbl, struct rhash_head *obj_old, struct rhash_head *obj_new, const struct rhashtable_params params) { struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct rhash_head *he; unsigned long flags; unsigned int hash; int err = -ENOENT; /* Minimally, the old and new objects must have same hash * (which should mean identifiers are the same). */ hash = rht_head_hashfn(ht, tbl, obj_old, params); if (hash != rht_head_hashfn(ht, tbl, obj_new, params)) return -EINVAL; bkt = rht_bucket_var(tbl, hash); if (!bkt) return -ENOENT; pprev = NULL; flags = rht_lock(tbl, bkt); rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) { if (he != obj_old) { pprev = &he->next; continue; } rcu_assign_pointer(obj_new->next, obj_old->next); if (pprev) { rcu_assign_pointer(*pprev, obj_new); rht_unlock(tbl, bkt, flags); } else { rht_assign_unlock(tbl, bkt, obj_new, flags); } err = 0; goto unlocked; } rht_unlock(tbl, bkt, flags); unlocked: return err; } /** * rhashtable_replace_fast - replace an object in hash table * @ht: hash table * @obj_old: pointer to hash head inside object being replaced * @obj_new: pointer to hash head inside object which is new * @params: hash table parameters * * Replacing an object doesn't affect the number of elements in the hash table * or bucket, so we don't need to worry about shrinking or expanding the * table here. * * Returns zero on success, -ENOENT if the entry could not be found, * -EINVAL if hash is not the same for the old and new objects. */ static inline int rhashtable_replace_fast( struct rhashtable *ht, struct rhash_head *obj_old, struct rhash_head *obj_new, const struct rhashtable_params params) { struct bucket_table *tbl; int err; rcu_read_lock(); tbl = rht_dereference_rcu(ht->tbl, ht); /* Because we have already taken (and released) the bucket * lock in old_tbl, if we find that future_tbl is not yet * visible then that guarantees the entry to still be in * the old tbl if it exists. */ while ((err = __rhashtable_replace_fast(ht, tbl, obj_old, obj_new, params)) && (tbl = rht_dereference_rcu(tbl->future_tbl, ht))) ; rcu_read_unlock(); return err; } /** * rhltable_walk_enter - Initialise an iterator * @hlt: Table to walk over * @iter: Hash table Iterator * * This function prepares a hash table walk. * * Note that if you restart a walk after rhashtable_walk_stop you * may see the same object twice. Also, you may miss objects if * there are removals in between rhashtable_walk_stop and the next * call to rhashtable_walk_start. * * For a completely stable walk you should construct your own data * structure outside the hash table. * * This function may be called from any process context, including * non-preemptable context, but cannot be called from softirq or * hardirq context. * * You must call rhashtable_walk_exit after this function returns. */ static inline void rhltable_walk_enter(struct rhltable *hlt, struct rhashtable_iter *iter) { return rhashtable_walk_enter(&hlt->ht, iter); } /** * rhltable_free_and_destroy - free elements and destroy hash list table * @hlt: the hash list table to destroy * @free_fn: callback to release resources of element * @arg: pointer passed to free_fn * * See documentation for rhashtable_free_and_destroy. */ static inline void rhltable_free_and_destroy(struct rhltable *hlt, void (*free_fn)(void *ptr, void *arg), void *arg) { return rhashtable_free_and_destroy(&hlt->ht, free_fn, arg); } static inline void rhltable_destroy(struct rhltable *hlt) { return rhltable_free_and_destroy(hlt, NULL, NULL); } #endif /* _LINUX_RHASHTABLE_H */
3014 3014 3014 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 // SPDX-License-Identifier: GPL-2.0-only #include <linux/virtio.h> #include <linux/spinlock.h> #include <linux/virtio_config.h> #include <linux/virtio_anchor.h> #include <linux/module.h> #include <linux/idr.h> #include <linux/of.h> #include <uapi/linux/virtio_ids.h> /* Unique numbering for virtio devices. */ static DEFINE_IDA(virtio_index_ida); static ssize_t device_show(struct device *_d, struct device_attribute *attr, char *buf) { struct virtio_device *dev = dev_to_virtio(_d); return sysfs_emit(buf, "0x%04x\n", dev->id.device); } static DEVICE_ATTR_RO(device); static ssize_t vendor_show(struct device *_d, struct device_attribute *attr, char *buf) { struct virtio_device *dev = dev_to_virtio(_d); return sysfs_emit(buf, "0x%04x\n", dev->id.vendor); } static DEVICE_ATTR_RO(vendor); static ssize_t status_show(struct device *_d, struct device_attribute *attr, char *buf) { struct virtio_device *dev = dev_to_virtio(_d); return sysfs_emit(buf, "0x%08x\n", dev->config->get_status(dev)); } static DEVICE_ATTR_RO(status); static ssize_t modalias_show(struct device *_d, struct device_attribute *attr, char *buf) { struct virtio_device *dev = dev_to_virtio(_d); return sysfs_emit(buf, "virtio:d%08Xv%08X\n", dev->id.device, dev->id.vendor); } static DEVICE_ATTR_RO(modalias); static ssize_t features_show(struct device *_d, struct device_attribute *attr, char *buf) { struct virtio_device *dev = dev_to_virtio(_d); unsigned int i; ssize_t len = 0; /* We actually represent this as a bitstring, as it could be * arbitrary length in future. */ for (i = 0; i < sizeof(dev->features)*8; i++) len += sysfs_emit_at(buf, len, "%c", __virtio_test_bit(dev, i) ? '1' : '0'); len += sysfs_emit_at(buf, len, "\n"); return len; } static DEVICE_ATTR_RO(features); static struct attribute *virtio_dev_attrs[] = { &dev_attr_device.attr, &dev_attr_vendor.attr, &dev_attr_status.attr, &dev_attr_modalias.attr, &dev_attr_features.attr, NULL, }; ATTRIBUTE_GROUPS(virtio_dev); static inline int virtio_id_match(const struct virtio_device *dev, const struct virtio_device_id *id) { if (id->device != dev->id.device && id->device != VIRTIO_DEV_ANY_ID) return 0; return id->vendor == VIRTIO_DEV_ANY_ID || id->vendor == dev->id.vendor; } /* This looks through all the IDs a driver claims to support. If any of them * match, we return 1 and the kernel will call virtio_dev_probe(). */ static int virtio_dev_match(struct device *_dv, struct device_driver *_dr) { unsigned int i; struct virtio_device *dev = dev_to_virtio(_dv); const struct virtio_device_id *ids; ids = drv_to_virtio(_dr)->id_table; for (i = 0; ids[i].device; i++) if (virtio_id_match(dev, &ids[i])) return 1; return 0; } static int virtio_uevent(const struct device *_dv, struct kobj_uevent_env *env) { const struct virtio_device *dev = dev_to_virtio(_dv); return add_uevent_var(env, "MODALIAS=virtio:d%08Xv%08X", dev->id.device, dev->id.vendor); } void virtio_check_driver_offered_feature(const struct virtio_device *vdev, unsigned int fbit) { unsigned int i; struct virtio_driver *drv = drv_to_virtio(vdev->dev.driver); for (i = 0; i < drv->feature_table_size; i++) if (drv->feature_table[i] == fbit) return; if (drv->feature_table_legacy) { for (i = 0; i < drv->feature_table_size_legacy; i++) if (drv->feature_table_legacy[i] == fbit) return; } BUG(); } EXPORT_SYMBOL_GPL(virtio_check_driver_offered_feature); static void __virtio_config_changed(struct virtio_device *dev) { struct virtio_driver *drv = drv_to_virtio(dev->dev.driver); if (!dev->config_enabled) dev->config_change_pending = true; else if (drv && drv->config_changed) drv->config_changed(dev); } void virtio_config_changed(struct virtio_device *dev) { unsigned long flags; spin_lock_irqsave(&dev->config_lock, flags); __virtio_config_changed(dev); spin_unlock_irqrestore(&dev->config_lock, flags); } EXPORT_SYMBOL_GPL(virtio_config_changed); static void virtio_config_disable(struct virtio_device *dev) { spin_lock_irq(&dev->config_lock); dev->config_enabled = false; spin_unlock_irq(&dev->config_lock); } static void virtio_config_enable(struct virtio_device *dev) { spin_lock_irq(&dev->config_lock); dev->config_enabled = true; if (dev->config_change_pending) __virtio_config_changed(dev); dev->config_change_pending = false; spin_unlock_irq(&dev->config_lock); } void virtio_add_status(struct virtio_device *dev, unsigned int status) { might_sleep(); dev->config->set_status(dev, dev->config->get_status(dev) | status); } EXPORT_SYMBOL_GPL(virtio_add_status); /* Do some validation, then set FEATURES_OK */ static int virtio_features_ok(struct virtio_device *dev) { unsigned int status; might_sleep(); if (virtio_check_mem_acc_cb(dev)) { if (!virtio_has_feature(dev, VIRTIO_F_VERSION_1)) { dev_warn(&dev->dev, "device must provide VIRTIO_F_VERSION_1\n"); return -ENODEV; } if (!virtio_has_feature(dev, VIRTIO_F_ACCESS_PLATFORM)) { dev_warn(&dev->dev, "device must provide VIRTIO_F_ACCESS_PLATFORM\n"); return -ENODEV; } } if (!virtio_has_feature(dev, VIRTIO_F_VERSION_1)) return 0; virtio_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK); status = dev->config->get_status(dev); if (!(status & VIRTIO_CONFIG_S_FEATURES_OK)) { dev_err(&dev->dev, "virtio: device refuses features: %x\n", status); return -ENODEV; } return 0; } /** * virtio_reset_device - quiesce device for removal * @dev: the device to reset * * Prevents device from sending interrupts and accessing memory. * * Generally used for cleanup during driver / device removal. * * Once this has been invoked, caller must ensure that * virtqueue_notify / virtqueue_kick are not in progress. * * Note: this guarantees that vq callbacks are not in progress, however caller * is responsible for preventing access from other contexts, such as a system * call/workqueue/bh. Invoking virtio_break_device then flushing any such * contexts is one way to handle that. * */ void virtio_reset_device(struct virtio_device *dev) { #ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION /* * The below virtio_synchronize_cbs() guarantees that any * interrupt for this line arriving after * virtio_synchronize_vqs() has completed is guaranteed to see * vq->broken as true. */ virtio_break_device(dev); virtio_synchronize_cbs(dev); #endif dev->config->reset(dev); } EXPORT_SYMBOL_GPL(virtio_reset_device); static int virtio_dev_probe(struct device *_d) { int err, i; struct virtio_device *dev = dev_to_virtio(_d); struct virtio_driver *drv = drv_to_virtio(dev->dev.driver); u64 device_features; u64 driver_features; u64 driver_features_legacy; /* We have a driver! */ virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER); /* Figure out what features the device supports. */ device_features = dev->config->get_features(dev); /* Figure out what features the driver supports. */ driver_features = 0; for (i = 0; i < drv->feature_table_size; i++) { unsigned int f = drv->feature_table[i]; BUG_ON(f >= 64); driver_features |= (1ULL << f); } /* Some drivers have a separate feature table for virtio v1.0 */ if (drv->feature_table_legacy) { driver_features_legacy = 0; for (i = 0; i < drv->feature_table_size_legacy; i++) { unsigned int f = drv->feature_table_legacy[i]; BUG_ON(f >= 64); driver_features_legacy |= (1ULL << f); } } else { driver_features_legacy = driver_features; } if (device_features & (1ULL << VIRTIO_F_VERSION_1)) dev->features = driver_features & device_features; else dev->features = driver_features_legacy & device_features; /* Transport features always preserved to pass to finalize_features. */ for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) if (device_features & (1ULL << i)) __virtio_set_bit(dev, i); err = dev->config->finalize_features(dev); if (err) goto err; if (drv->validate) { u64 features = dev->features; err = drv->validate(dev); if (err) goto err; /* Did validation change any features? Then write them again. */ if (features != dev->features) { err = dev->config->finalize_features(dev); if (err) goto err; } } err = virtio_features_ok(dev); if (err) goto err; err = drv->probe(dev); if (err) goto err; /* If probe didn't do it, mark device DRIVER_OK ourselves. */ if (!(dev->config->get_status(dev) & VIRTIO_CONFIG_S_DRIVER_OK)) virtio_device_ready(dev); if (drv->scan) drv->scan(dev); virtio_config_enable(dev); return 0; err: virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED); return err; } static void virtio_dev_remove(struct device *_d) { struct virtio_device *dev = dev_to_virtio(_d); struct virtio_driver *drv = drv_to_virtio(dev->dev.driver); virtio_config_disable(dev); drv->remove(dev); /* Driver should have reset device. */ WARN_ON_ONCE(dev->config->get_status(dev)); /* Acknowledge the device's existence again. */ virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE); of_node_put(dev->dev.of_node); } static struct bus_type virtio_bus = { .name = "virtio", .match = virtio_dev_match, .dev_groups = virtio_dev_groups, .uevent = virtio_uevent, .probe = virtio_dev_probe, .remove = virtio_dev_remove, }; int register_virtio_driver(struct virtio_driver *driver) { /* Catch this early. */ BUG_ON(driver->feature_table_size && !driver->feature_table); driver->driver.bus = &virtio_bus; return driver_register(&driver->driver); } EXPORT_SYMBOL_GPL(register_virtio_driver); void unregister_virtio_driver(struct virtio_driver *driver) { driver_unregister(&driver->driver); } EXPORT_SYMBOL_GPL(unregister_virtio_driver); static int virtio_device_of_init(struct virtio_device *dev) { struct device_node *np, *pnode = dev_of_node(dev->dev.parent); char compat[] = "virtio,deviceXXXXXXXX"; int ret, count; if (!pnode) return 0; count = of_get_available_child_count(pnode); if (!count) return 0; /* There can be only 1 child node */ if (WARN_ON(count > 1)) return -EINVAL; np = of_get_next_available_child(pnode, NULL); if (WARN_ON(!np)) return -ENODEV; ret = snprintf(compat, sizeof(compat), "virtio,device%x", dev->id.device); BUG_ON(ret >= sizeof(compat)); /* * On powerpc/pseries virtio devices are PCI devices so PCI * vendor/device ids play the role of the "compatible" property. * Simply don't init of_node in this case. */ if (!of_device_is_compatible(np, compat)) { ret = 0; goto out; } dev->dev.of_node = np; return 0; out: of_node_put(np); return ret; } /** * register_virtio_device - register virtio device * @dev : virtio device to be registered * * On error, the caller must call put_device on &@dev->dev (and not kfree), * as another code path may have obtained a reference to @dev. * * Returns: 0 on suceess, -error on failure */ int register_virtio_device(struct virtio_device *dev) { int err; dev->dev.bus = &virtio_bus; device_initialize(&dev->dev); /* Assign a unique device index and hence name. */ err = ida_alloc(&virtio_index_ida, GFP_KERNEL); if (err < 0) goto out; dev->index = err; err = dev_set_name(&dev->dev, "virtio%u", dev->index); if (err) goto out_ida_remove; err = virtio_device_of_init(dev); if (err) goto out_ida_remove; spin_lock_init(&dev->config_lock); dev->config_enabled = false; dev->config_change_pending = false; INIT_LIST_HEAD(&dev->vqs); spin_lock_init(&dev->vqs_list_lock); /* We always start by resetting the device, in case a previous * driver messed it up. This also tests that code path a little. */ virtio_reset_device(dev); /* Acknowledge that we've seen the device. */ virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE); /* * device_add() causes the bus infrastructure to look for a matching * driver. */ err = device_add(&dev->dev); if (err) goto out_of_node_put; return 0; out_of_node_put: of_node_put(dev->dev.of_node); out_ida_remove: ida_free(&virtio_index_ida, dev->index); out: virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED); return err; } EXPORT_SYMBOL_GPL(register_virtio_device); bool is_virtio_device(struct device *dev) { return dev->bus == &virtio_bus; } EXPORT_SYMBOL_GPL(is_virtio_device); void unregister_virtio_device(struct virtio_device *dev) { int index = dev->index; /* save for after device release */ device_unregister(&dev->dev); ida_free(&virtio_index_ida, index); } EXPORT_SYMBOL_GPL(unregister_virtio_device); #ifdef CONFIG_PM_SLEEP int virtio_device_freeze(struct virtio_device *dev) { struct virtio_driver *drv = drv_to_virtio(dev->dev.driver); virtio_config_disable(dev); dev->failed = dev->config->get_status(dev) & VIRTIO_CONFIG_S_FAILED; if (drv && drv->freeze) return drv->freeze(dev); return 0; } EXPORT_SYMBOL_GPL(virtio_device_freeze); int virtio_device_restore(struct virtio_device *dev) { struct virtio_driver *drv = drv_to_virtio(dev->dev.driver); int ret; /* We always start by resetting the device, in case a previous * driver messed it up. */ virtio_reset_device(dev); /* Acknowledge that we've seen the device. */ virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE); /* Maybe driver failed before freeze. * Restore the failed status, for debugging. */ if (dev->failed) virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED); if (!drv) return 0; /* We have a driver! */ virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER); ret = dev->config->finalize_features(dev); if (ret) goto err; ret = virtio_features_ok(dev); if (ret) goto err; if (drv->restore) { ret = drv->restore(dev); if (ret) goto err; } /* If restore didn't do it, mark device DRIVER_OK ourselves. */ if (!(dev->config->get_status(dev) & VIRTIO_CONFIG_S_DRIVER_OK)) virtio_device_ready(dev); virtio_config_enable(dev); return 0; err: virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED); return ret; } EXPORT_SYMBOL_GPL(virtio_device_restore); #endif static int virtio_init(void) { if (bus_register(&virtio_bus) != 0) panic("virtio bus registration failed"); return 0; } static void __exit virtio_exit(void) { bus_unregister(&virtio_bus); ida_destroy(&virtio_index_ida); } core_initcall(virtio_init); module_exit(virtio_exit); MODULE_LICENSE("GPL");
6205 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_KASAN_H #define _LINUX_KASAN_H #include <linux/bug.h> #include <linux/kasan-enabled.h> #include <linux/kernel.h> #include <linux/static_key.h> #include <linux/types.h> struct kmem_cache; struct page; struct slab; struct vm_struct; struct task_struct; #ifdef CONFIG_KASAN #include <linux/linkage.h> #include <asm/kasan.h> #endif typedef unsigned int __bitwise kasan_vmalloc_flags_t; #define KASAN_VMALLOC_NONE ((__force kasan_vmalloc_flags_t)0x00u) #define KASAN_VMALLOC_INIT ((__force kasan_vmalloc_flags_t)0x01u) #define KASAN_VMALLOC_VM_ALLOC ((__force kasan_vmalloc_flags_t)0x02u) #define KASAN_VMALLOC_PROT_NORMAL ((__force kasan_vmalloc_flags_t)0x04u) #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) #include <linux/pgtable.h> /* Software KASAN implementations use shadow memory. */ #ifdef CONFIG_KASAN_SW_TAGS /* This matches KASAN_TAG_INVALID. */ #define KASAN_SHADOW_INIT 0xFE #else #define KASAN_SHADOW_INIT 0 #endif #ifndef PTE_HWTABLE_PTRS #define PTE_HWTABLE_PTRS 0 #endif extern unsigned char kasan_early_shadow_page[PAGE_SIZE]; extern pte_t kasan_early_shadow_pte[MAX_PTRS_PER_PTE + PTE_HWTABLE_PTRS]; extern pmd_t kasan_early_shadow_pmd[MAX_PTRS_PER_PMD]; extern pud_t kasan_early_shadow_pud[MAX_PTRS_PER_PUD]; extern p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D]; int kasan_populate_early_shadow(const void *shadow_start, const void *shadow_end); #ifndef kasan_mem_to_shadow static inline void *kasan_mem_to_shadow(const void *addr) { return (void *)((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT) + KASAN_SHADOW_OFFSET; } #endif int kasan_add_zero_shadow(void *start, unsigned long size); void kasan_remove_zero_shadow(void *start, unsigned long size); /* Enable reporting bugs after kasan_disable_current() */ extern void kasan_enable_current(void); /* Disable reporting bugs for current task */ extern void kasan_disable_current(void); #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ static inline int kasan_add_zero_shadow(void *start, unsigned long size) { return 0; } static inline void kasan_remove_zero_shadow(void *start, unsigned long size) {} static inline void kasan_enable_current(void) {} static inline void kasan_disable_current(void) {} #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ #ifdef CONFIG_KASAN_HW_TAGS #else /* CONFIG_KASAN_HW_TAGS */ #endif /* CONFIG_KASAN_HW_TAGS */ static inline bool kasan_has_integrated_init(void) { return kasan_hw_tags_enabled(); } #ifdef CONFIG_KASAN void __kasan_unpoison_range(const void *addr, size_t size); static __always_inline void kasan_unpoison_range(const void *addr, size_t size) { if (kasan_enabled()) __kasan_unpoison_range(addr, size); } void __kasan_poison_pages(struct page *page, unsigned int order, bool init); static __always_inline void kasan_poison_pages(struct page *page, unsigned int order, bool init) { if (kasan_enabled()) __kasan_poison_pages(page, order, init); } bool __kasan_unpoison_pages(struct page *page, unsigned int order, bool init); static __always_inline bool kasan_unpoison_pages(struct page *page, unsigned int order, bool init) { if (kasan_enabled()) return __kasan_unpoison_pages(page, order, init); return false; } void __kasan_poison_slab(struct slab *slab); static __always_inline void kasan_poison_slab(struct slab *slab) { if (kasan_enabled()) __kasan_poison_slab(slab); } void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object); static __always_inline void kasan_unpoison_object_data(struct kmem_cache *cache, void *object) { if (kasan_enabled()) __kasan_unpoison_object_data(cache, object); } void __kasan_poison_object_data(struct kmem_cache *cache, void *object); static __always_inline void kasan_poison_object_data(struct kmem_cache *cache, void *object) { if (kasan_enabled()) __kasan_poison_object_data(cache, object); } void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, const void *object); static __always_inline void * __must_check kasan_init_slab_obj( struct kmem_cache *cache, const void *object) { if (kasan_enabled()) return __kasan_init_slab_obj(cache, object); return (void *)object; } bool __kasan_slab_free(struct kmem_cache *s, void *object, unsigned long ip, bool init); static __always_inline bool kasan_slab_free(struct kmem_cache *s, void *object, bool init) { if (kasan_enabled()) return __kasan_slab_free(s, object, _RET_IP_, init); return false; } void __kasan_kfree_large(void *ptr, unsigned long ip); static __always_inline void kasan_kfree_large(void *ptr) { if (kasan_enabled()) __kasan_kfree_large(ptr, _RET_IP_); } void __kasan_slab_free_mempool(void *ptr, unsigned long ip); static __always_inline void kasan_slab_free_mempool(void *ptr) { if (kasan_enabled()) __kasan_slab_free_mempool(ptr, _RET_IP_); } void * __must_check __kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags, bool init); static __always_inline void * __must_check kasan_slab_alloc( struct kmem_cache *s, void *object, gfp_t flags, bool init) { if (kasan_enabled()) return __kasan_slab_alloc(s, object, flags, init); return object; } void * __must_check __kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size, gfp_t flags); static __always_inline void * __must_check kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size, gfp_t flags) { if (kasan_enabled()) return __kasan_kmalloc(s, object, size, flags); return (void *)object; } void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags); static __always_inline void * __must_check kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) { if (kasan_enabled()) return __kasan_kmalloc_large(ptr, size, flags); return (void *)ptr; } void * __must_check __kasan_krealloc(const void *object, size_t new_size, gfp_t flags); static __always_inline void * __must_check kasan_krealloc(const void *object, size_t new_size, gfp_t flags) { if (kasan_enabled()) return __kasan_krealloc(object, new_size, flags); return (void *)object; } /* * Unlike kasan_check_read/write(), kasan_check_byte() is performed even for * the hardware tag-based mode that doesn't rely on compiler instrumentation. */ bool __kasan_check_byte(const void *addr, unsigned long ip); static __always_inline bool kasan_check_byte(const void *addr) { if (kasan_enabled()) return __kasan_check_byte(addr, _RET_IP_); return true; } #else /* CONFIG_KASAN */ static inline void kasan_unpoison_range(const void *address, size_t size) {} static inline void kasan_poison_pages(struct page *page, unsigned int order, bool init) {} static inline bool kasan_unpoison_pages(struct page *page, unsigned int order, bool init) { return false; } static inline void kasan_poison_slab(struct slab *slab) {} static inline void kasan_unpoison_object_data(struct kmem_cache *cache, void *object) {} static inline void kasan_poison_object_data(struct kmem_cache *cache, void *object) {} static inline void *kasan_init_slab_obj(struct kmem_cache *cache, const void *object) { return (void *)object; } static inline bool kasan_slab_free(struct kmem_cache *s, void *object, bool init) { return false; } static inline void kasan_kfree_large(void *ptr) {} static inline void kasan_slab_free_mempool(void *ptr) {} static inline void *kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags, bool init) { return object; } static inline void *kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size, gfp_t flags) { return (void *)object; } static inline void *kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) { return (void *)ptr; } static inline void *kasan_krealloc(const void *object, size_t new_size, gfp_t flags) { return (void *)object; } static inline bool kasan_check_byte(const void *address) { return true; } #endif /* CONFIG_KASAN */ #if defined(CONFIG_KASAN) && defined(CONFIG_KASAN_STACK) void kasan_unpoison_task_stack(struct task_struct *task); #else static inline void kasan_unpoison_task_stack(struct task_struct *task) {} #endif #ifdef CONFIG_KASAN_GENERIC struct kasan_cache { int alloc_meta_offset; int free_meta_offset; }; size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object); slab_flags_t kasan_never_merge(void); void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, slab_flags_t *flags); void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); void kasan_record_aux_stack(void *ptr); void kasan_record_aux_stack_noalloc(void *ptr); #else /* CONFIG_KASAN_GENERIC */ /* Tag-based KASAN modes do not use per-object metadata. */ static inline size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object) { return 0; } /* And thus nothing prevents cache merging. */ static inline slab_flags_t kasan_never_merge(void) { return 0; } /* And no cache-related metadata initialization is required. */ static inline void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, slab_flags_t *flags) {} static inline void kasan_cache_shrink(struct kmem_cache *cache) {} static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} static inline void kasan_record_aux_stack(void *ptr) {} static inline void kasan_record_aux_stack_noalloc(void *ptr) {} #endif /* CONFIG_KASAN_GENERIC */ #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) static inline void *kasan_reset_tag(const void *addr) { return (void *)arch_kasan_reset_tag(addr); } /** * kasan_report - print a report about a bad memory access detected by KASAN * @addr: address of the bad access * @size: size of the bad access * @is_write: whether the bad access is a write or a read * @ip: instruction pointer for the accessibility check or the bad access itself */ bool kasan_report(const void *addr, size_t size, bool is_write, unsigned long ip); #else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ static inline void *kasan_reset_tag(const void *addr) { return (void *)addr; } #endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS*/ #ifdef CONFIG_KASAN_HW_TAGS void kasan_report_async(void); #endif /* CONFIG_KASAN_HW_TAGS */ #ifdef CONFIG_KASAN_SW_TAGS void __init kasan_init_sw_tags(void); #else static inline void kasan_init_sw_tags(void) { } #endif #ifdef CONFIG_KASAN_HW_TAGS void kasan_init_hw_tags_cpu(void); void __init kasan_init_hw_tags(void); #else static inline void kasan_init_hw_tags_cpu(void) { } static inline void kasan_init_hw_tags(void) { } #endif #ifdef CONFIG_KASAN_VMALLOC #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) void kasan_populate_early_vm_area_shadow(void *start, unsigned long size); int kasan_populate_vmalloc(unsigned long addr, unsigned long size); void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end); #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ static inline void kasan_populate_early_vm_area_shadow(void *start, unsigned long size) { } static inline int kasan_populate_vmalloc(unsigned long start, unsigned long size) { return 0; } static inline void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end) { } #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, kasan_vmalloc_flags_t flags); static __always_inline void *kasan_unpoison_vmalloc(const void *start, unsigned long size, kasan_vmalloc_flags_t flags) { if (kasan_enabled()) return __kasan_unpoison_vmalloc(start, size, flags); return (void *)start; } void __kasan_poison_vmalloc(const void *start, unsigned long size); static __always_inline void kasan_poison_vmalloc(const void *start, unsigned long size) { if (kasan_enabled()) __kasan_poison_vmalloc(start, size); } #else /* CONFIG_KASAN_VMALLOC */ static inline void kasan_populate_early_vm_area_shadow(void *start, unsigned long size) { } static inline int kasan_populate_vmalloc(unsigned long start, unsigned long size) { return 0; } static inline void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end) { } static inline void *kasan_unpoison_vmalloc(const void *start, unsigned long size, kasan_vmalloc_flags_t flags) { return (void *)start; } static inline void kasan_poison_vmalloc(const void *start, unsigned long size) { } #endif /* CONFIG_KASAN_VMALLOC */ #if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \ !defined(CONFIG_KASAN_VMALLOC) /* * These functions allocate and free shadow memory for kernel modules. * They are only required when KASAN_VMALLOC is not supported, as otherwise * shadow memory is allocated by the generic vmalloc handlers. */ int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask); void kasan_free_module_shadow(const struct vm_struct *vm); #else /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */ static inline int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask) { return 0; } static inline void kasan_free_module_shadow(const struct vm_struct *vm) {} #endif /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */ #ifdef CONFIG_KASAN_INLINE void kasan_non_canonical_hook(unsigned long addr); #else /* CONFIG_KASAN_INLINE */ static inline void kasan_non_canonical_hook(unsigned long addr) { } #endif /* CONFIG_KASAN_INLINE */ #endif /* LINUX_KASAN_H */
1057 956 3 1057 1057 1057 1057 1057 1059 1058 114 1058 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2005,2006,2007,2008 IBM Corporation * * Authors: * Serge Hallyn <serue@us.ibm.com> * Reiner Sailer <sailer@watson.ibm.com> * Mimi Zohar <zohar@us.ibm.com> * * File: ima_queue.c * Implements queues that store template measurements and * maintains aggregate over the stored measurements * in the pre-configured TPM PCR (if available). * The measurement list is append-only. No entry is * ever removed or changed during the boot-cycle. */ #include <linux/rculist.h> #include <linux/slab.h> #include "ima.h" #define AUDIT_CAUSE_LEN_MAX 32 /* pre-allocated array of tpm_digest structures to extend a PCR */ static struct tpm_digest *digests; LIST_HEAD(ima_measurements); /* list of all measurements */ #ifdef CONFIG_IMA_KEXEC static unsigned long binary_runtime_size; #else static unsigned long binary_runtime_size = ULONG_MAX; #endif /* key: inode (before secure-hashing a file) */ struct ima_h_table ima_htable = { .len = ATOMIC_LONG_INIT(0), .violations = ATOMIC_LONG_INIT(0), .queue[0 ... IMA_MEASURE_HTABLE_SIZE - 1] = HLIST_HEAD_INIT }; /* mutex protects atomicity of extending measurement list * and extending the TPM PCR aggregate. Since tpm_extend can take * long (and the tpm driver uses a mutex), we can't use the spinlock. */ static DEFINE_MUTEX(ima_extend_list_mutex); /* lookup up the digest value in the hash table, and return the entry */ static struct ima_queue_entry *ima_lookup_digest_entry(u8 *digest_value, int pcr) { struct ima_queue_entry *qe, *ret = NULL; unsigned int key; int rc; key = ima_hash_key(digest_value); rcu_read_lock(); hlist_for_each_entry_rcu(qe, &ima_htable.queue[key], hnext) { rc = memcmp(qe->entry->digests[ima_hash_algo_idx].digest, digest_value, hash_digest_size[ima_hash_algo]); if ((rc == 0) && (qe->entry->pcr == pcr)) { ret = qe; break; } } rcu_read_unlock(); return ret; } /* * Calculate the memory required for serializing a single * binary_runtime_measurement list entry, which contains a * couple of variable length fields (e.g template name and data). */ static int get_binary_runtime_size(struct ima_template_entry *entry) { int size = 0; size += sizeof(u32); /* pcr */ size += TPM_DIGEST_SIZE; size += sizeof(int); /* template name size field */ size += strlen(entry->template_desc->name); size += sizeof(entry->template_data_len); size += entry->template_data_len; return size; } /* ima_add_template_entry helper function: * - Add template entry to the measurement list and hash table, for * all entries except those carried across kexec. * * (Called with ima_extend_list_mutex held.) */ static int ima_add_digest_entry(struct ima_template_entry *entry, bool update_htable) { struct ima_queue_entry *qe; unsigned int key; qe = kmalloc(sizeof(*qe), GFP_KERNEL); if (qe == NULL) { pr_err("OUT OF MEMORY ERROR creating queue entry\n"); return -ENOMEM; } qe->entry = entry; INIT_LIST_HEAD(&qe->later); list_add_tail_rcu(&qe->later, &ima_measurements); atomic_long_inc(&ima_htable.len); if (update_htable) { key = ima_hash_key(entry->digests[ima_hash_algo_idx].digest); hlist_add_head_rcu(&qe->hnext, &ima_htable.queue[key]); } if (binary_runtime_size != ULONG_MAX) { int size; size = get_binary_runtime_size(entry); binary_runtime_size = (binary_runtime_size < ULONG_MAX - size) ? binary_runtime_size + size : ULONG_MAX; } return 0; } /* * Return the amount of memory required for serializing the * entire binary_runtime_measurement list, including the ima_kexec_hdr * structure. */ unsigned long ima_get_binary_runtime_size(void) { if (binary_runtime_size >= (ULONG_MAX - sizeof(struct ima_kexec_hdr))) return ULONG_MAX; else return binary_runtime_size + sizeof(struct ima_kexec_hdr); } static int ima_pcr_extend(struct tpm_digest *digests_arg, int pcr) { int result = 0; if (!ima_tpm_chip) return result; result = tpm_pcr_extend(ima_tpm_chip, pcr, digests_arg); if (result != 0) pr_err("Error Communicating to TPM chip, result: %d\n", result); return result; } /* * Add template entry to the measurement list and hash table, and * extend the pcr. * * On systems which support carrying the IMA measurement list across * kexec, maintain the total memory size required for serializing the * binary_runtime_measurements. */ int ima_add_template_entry(struct ima_template_entry *entry, int violation, const char *op, struct inode *inode, const unsigned char *filename) { u8 *digest = entry->digests[ima_hash_algo_idx].digest; struct tpm_digest *digests_arg = entry->digests; const char *audit_cause = "hash_added"; char tpm_audit_cause[AUDIT_CAUSE_LEN_MAX]; int audit_info = 1; int result = 0, tpmresult = 0; mutex_lock(&ima_extend_list_mutex); if (!violation && !IS_ENABLED(CONFIG_IMA_DISABLE_HTABLE)) { if (ima_lookup_digest_entry(digest, entry->pcr)) { audit_cause = "hash_exists"; result = -EEXIST; goto out; } } result = ima_add_digest_entry(entry, !IS_ENABLED(CONFIG_IMA_DISABLE_HTABLE)); if (result < 0) { audit_cause = "ENOMEM"; audit_info = 0; goto out; } if (violation) /* invalidate pcr */ digests_arg = digests; tpmresult = ima_pcr_extend(digests_arg, entry->pcr); if (tpmresult != 0) { snprintf(tpm_audit_cause, AUDIT_CAUSE_LEN_MAX, "TPM_error(%d)", tpmresult); audit_cause = tpm_audit_cause; audit_info = 0; } out: mutex_unlock(&ima_extend_list_mutex); integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, filename, op, audit_cause, result, audit_info); return result; } int ima_restore_measurement_entry(struct ima_template_entry *entry) { int result = 0; mutex_lock(&ima_extend_list_mutex); result = ima_add_digest_entry(entry, 0); mutex_unlock(&ima_extend_list_mutex); return result; } int __init ima_init_digests(void) { u16 digest_size; u16 crypto_id; int i; if (!ima_tpm_chip) return 0; digests = kcalloc(ima_tpm_chip->nr_allocated_banks, sizeof(*digests), GFP_NOFS); if (!digests) return -ENOMEM; for (i = 0; i < ima_tpm_chip->nr_allocated_banks; i++) { digests[i].alg_id = ima_tpm_chip->allocated_banks[i].alg_id; digest_size = ima_tpm_chip->allocated_banks[i].digest_size; crypto_id = ima_tpm_chip->allocated_banks[i].crypto_id; /* for unmapped TPM algorithms digest is still a padded SHA1 */ if (crypto_id == HASH_ALGO__LAST) digest_size = SHA1_DIGEST_SIZE; memset(digests[i].digest, 0xff, digest_size); } return 0; }
11 11 183 183 183 183 6 6 1 1 284 284 284 206 206 206 168 168 10 10 5 10 4 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 // SPDX-License-Identifier: GPL-2.0-only /* * Pluggable TCP congestion control support and newReno * congestion control. * Based on ideas from I/O scheduler support and Web100. * * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org> */ #define pr_fmt(fmt) "TCP: " fmt #include <linux/module.h> #include <linux/mm.h> #include <linux/types.h> #include <linux/list.h> #include <linux/gfp.h> #include <linux/jhash.h> #include <net/tcp.h> #include <trace/events/tcp.h> static DEFINE_SPINLOCK(tcp_cong_list_lock); static LIST_HEAD(tcp_cong_list); /* Simple linear search, don't expect many entries! */ struct tcp_congestion_ops *tcp_ca_find(const char *name) { struct tcp_congestion_ops *e; list_for_each_entry_rcu(e, &tcp_cong_list, list) { if (strcmp(e->name, name) == 0) return e; } return NULL; } void tcp_set_ca_state(struct sock *sk, const u8 ca_state) { struct inet_connection_sock *icsk = inet_csk(sk); trace_tcp_cong_state_set(sk, ca_state); if (icsk->icsk_ca_ops->set_state) icsk->icsk_ca_ops->set_state(sk, ca_state); icsk->icsk_ca_state = ca_state; } /* Must be called with rcu lock held */ static struct tcp_congestion_ops *tcp_ca_find_autoload(struct net *net, const char *name) { struct tcp_congestion_ops *ca = tcp_ca_find(name); #ifdef CONFIG_MODULES if (!ca && capable(CAP_NET_ADMIN)) { rcu_read_unlock(); request_module("tcp_%s", name); rcu_read_lock(); ca = tcp_ca_find(name); } #endif return ca; } /* Simple linear search, not much in here. */ struct tcp_congestion_ops *tcp_ca_find_key(u32 key) { struct tcp_congestion_ops *e; list_for_each_entry_rcu(e, &tcp_cong_list, list) { if (e->key == key) return e; } return NULL; } int tcp_validate_congestion_control(struct tcp_congestion_ops *ca) { /* all algorithms must implement these */ if (!ca->ssthresh || !ca->undo_cwnd || !(ca->cong_avoid || ca->cong_control)) { pr_err("%s does not implement required ops\n", ca->name); return -EINVAL; } return 0; } /* Attach new congestion control algorithm to the list * of available options. */ int tcp_register_congestion_control(struct tcp_congestion_ops *ca) { int ret; ret = tcp_validate_congestion_control(ca); if (ret) return ret; ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name)); spin_lock(&tcp_cong_list_lock); if (ca->key == TCP_CA_UNSPEC || tcp_ca_find_key(ca->key)) { pr_notice("%s already registered or non-unique key\n", ca->name); ret = -EEXIST; } else { list_add_tail_rcu(&ca->list, &tcp_cong_list); pr_debug("%s registered\n", ca->name); } spin_unlock(&tcp_cong_list_lock); return ret; } EXPORT_SYMBOL_GPL(tcp_register_congestion_control); /* * Remove congestion control algorithm, called from * the module's remove function. Module ref counts are used * to ensure that this can't be done till all sockets using * that method are closed. */ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) { spin_lock(&tcp_cong_list_lock); list_del_rcu(&ca->list); spin_unlock(&tcp_cong_list_lock); /* Wait for outstanding readers to complete before the * module gets removed entirely. * * A try_module_get() should fail by now as our module is * in "going" state since no refs are held anymore and * module_exit() handler being called. */ synchronize_rcu(); } EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); /* Replace a registered old ca with a new one. * * The new ca must have the same name as the old one, that has been * registered. */ int tcp_update_congestion_control(struct tcp_congestion_ops *ca, struct tcp_congestion_ops *old_ca) { struct tcp_congestion_ops *existing; int ret; ret = tcp_validate_congestion_control(ca); if (ret) return ret; ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name)); spin_lock(&tcp_cong_list_lock); existing = tcp_ca_find_key(old_ca->key); if (ca->key == TCP_CA_UNSPEC || !existing || strcmp(existing->name, ca->name)) { pr_notice("%s not registered or non-unique key\n", ca->name); ret = -EINVAL; } else if (existing != old_ca) { pr_notice("invalid old congestion control algorithm to replace\n"); ret = -EINVAL; } else { /* Add the new one before removing the old one to keep * one implementation available all the time. */ list_add_tail_rcu(&ca->list, &tcp_cong_list); list_del_rcu(&existing->list); pr_debug("%s updated\n", ca->name); } spin_unlock(&tcp_cong_list_lock); /* Wait for outstanding readers to complete before the * module or struct_ops gets removed entirely. */ if (!ret) synchronize_rcu(); return ret; } u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca) { const struct tcp_congestion_ops *ca; u32 key = TCP_CA_UNSPEC; might_sleep(); rcu_read_lock(); ca = tcp_ca_find_autoload(net, name); if (ca) { key = ca->key; *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN; } rcu_read_unlock(); return key; } char *tcp_ca_get_name_by_key(u32 key, char *buffer) { const struct tcp_congestion_ops *ca; char *ret = NULL; rcu_read_lock(); ca = tcp_ca_find_key(key); if (ca) ret = strncpy(buffer, ca->name, TCP_CA_NAME_MAX); rcu_read_unlock(); return ret; } /* Assign choice of congestion control. */ void tcp_assign_congestion_control(struct sock *sk) { struct net *net = sock_net(sk); struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; rcu_read_lock(); ca = rcu_dereference(net->ipv4.tcp_congestion_control); if (unlikely(!bpf_try_module_get(ca, ca->owner))) ca = &tcp_reno; icsk->icsk_ca_ops = ca; rcu_read_unlock(); memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); if (ca->flags & TCP_CONG_NEEDS_ECN) INET_ECN_xmit(sk); else INET_ECN_dontxmit(sk); } void tcp_init_congestion_control(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); tcp_sk(sk)->prior_ssthresh = 0; if (icsk->icsk_ca_ops->init) icsk->icsk_ca_ops->init(sk); if (tcp_ca_needs_ecn(sk)) INET_ECN_xmit(sk); else INET_ECN_dontxmit(sk); icsk->icsk_ca_initialized = 1; } static void tcp_reinit_congestion_control(struct sock *sk, const struct tcp_congestion_ops *ca) { struct inet_connection_sock *icsk = inet_csk(sk); tcp_cleanup_congestion_control(sk); icsk->icsk_ca_ops = ca; icsk->icsk_ca_setsockopt = 1; memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); if (ca->flags & TCP_CONG_NEEDS_ECN) INET_ECN_xmit(sk); else INET_ECN_dontxmit(sk); if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) tcp_init_congestion_control(sk); } /* Manage refcounts on socket close. */ void tcp_cleanup_congestion_control(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner); } /* Used by sysctl to change default congestion control */ int tcp_set_default_congestion_control(struct net *net, const char *name) { struct tcp_congestion_ops *ca; const struct tcp_congestion_ops *prev; int ret; rcu_read_lock(); ca = tcp_ca_find_autoload(net, name); if (!ca) { ret = -ENOENT; } else if (!bpf_try_module_get(ca, ca->owner)) { ret = -EBUSY; } else if (!net_eq(net, &init_net) && !(ca->flags & TCP_CONG_NON_RESTRICTED)) { /* Only init netns can set default to a restricted algorithm */ ret = -EPERM; } else { prev = xchg(&net->ipv4.tcp_congestion_control, ca); if (prev) bpf_module_put(prev, prev->owner); ca->flags |= TCP_CONG_NON_RESTRICTED; ret = 0; } rcu_read_unlock(); return ret; } /* Set default value from kernel configuration at bootup */ static int __init tcp_congestion_default(void) { return tcp_set_default_congestion_control(&init_net, CONFIG_DEFAULT_TCP_CONG); } late_initcall(tcp_congestion_default); /* Build string with list of available congestion control values */ void tcp_get_available_congestion_control(char *buf, size_t maxlen) { struct tcp_congestion_ops *ca; size_t offs = 0; rcu_read_lock(); list_for_each_entry_rcu(ca, &tcp_cong_list, list) { offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", ca->name); if (WARN_ON_ONCE(offs >= maxlen)) break; } rcu_read_unlock(); } /* Get current default congestion control */ void tcp_get_default_congestion_control(struct net *net, char *name) { const struct tcp_congestion_ops *ca; rcu_read_lock(); ca = rcu_dereference(net->ipv4.tcp_congestion_control); strncpy(name, ca->name, TCP_CA_NAME_MAX); rcu_read_unlock(); } /* Built list of non-restricted congestion control values */ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen) { struct tcp_congestion_ops *ca; size_t offs = 0; *buf = '\0'; rcu_read_lock(); list_for_each_entry_rcu(ca, &tcp_cong_list, list) { if (!(ca->flags & TCP_CONG_NON_RESTRICTED)) continue; offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", ca->name); if (WARN_ON_ONCE(offs >= maxlen)) break; } rcu_read_unlock(); } /* Change list of non-restricted congestion control */ int tcp_set_allowed_congestion_control(char *val) { struct tcp_congestion_ops *ca; char *saved_clone, *clone, *name; int ret = 0; saved_clone = clone = kstrdup(val, GFP_USER); if (!clone) return -ENOMEM; spin_lock(&tcp_cong_list_lock); /* pass 1 check for bad entries */ while ((name = strsep(&clone, " ")) && *name) { ca = tcp_ca_find(name); if (!ca) { ret = -ENOENT; goto out; } } /* pass 2 clear old values */ list_for_each_entry_rcu(ca, &tcp_cong_list, list) ca->flags &= ~TCP_CONG_NON_RESTRICTED; /* pass 3 mark as allowed */ while ((name = strsep(&val, " ")) && *name) { ca = tcp_ca_find(name); WARN_ON(!ca); if (ca) ca->flags |= TCP_CONG_NON_RESTRICTED; } out: spin_unlock(&tcp_cong_list_lock); kfree(saved_clone); return ret; } /* Change congestion control for socket. If load is false, then it is the * responsibility of the caller to call tcp_init_congestion_control or * tcp_reinit_congestion_control (if the current congestion control was * already initialized. */ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool cap_net_admin) { struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; int err = 0; if (icsk->icsk_ca_dst_locked) return -EPERM; rcu_read_lock(); if (!load) ca = tcp_ca_find(name); else ca = tcp_ca_find_autoload(sock_net(sk), name); /* No change asking for existing value */ if (ca == icsk->icsk_ca_ops) { icsk->icsk_ca_setsockopt = 1; goto out; } if (!ca) err = -ENOENT; else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) err = -EPERM; else if (!bpf_try_module_get(ca, ca->owner)) err = -EBUSY; else tcp_reinit_congestion_control(sk, ca); out: rcu_read_unlock(); return err; } /* Slow start is used when congestion window is no greater than the slow start * threshold. We base on RFC2581 and also handle stretch ACKs properly. * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but * something better;) a packet is only considered (s)acked in its entirety to * defend the ACK attacks described in the RFC. Slow start processes a stretch * ACK of degree N as if N acks of degree 1 are received back to back except * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and * returns the leftover acks to adjust cwnd in congestion avoidance mode. */ __bpf_kfunc u32 tcp_slow_start(struct tcp_sock *tp, u32 acked) { u32 cwnd = min(tcp_snd_cwnd(tp) + acked, tp->snd_ssthresh); acked -= cwnd - tcp_snd_cwnd(tp); tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); return acked; } EXPORT_SYMBOL_GPL(tcp_slow_start); /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w), * for every packet that was ACKed. */ __bpf_kfunc void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked) { /* If credits accumulated at a higher w, apply them gently now. */ if (tp->snd_cwnd_cnt >= w) { tp->snd_cwnd_cnt = 0; tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1); } tp->snd_cwnd_cnt += acked; if (tp->snd_cwnd_cnt >= w) { u32 delta = tp->snd_cwnd_cnt / w; tp->snd_cwnd_cnt -= delta * w; tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + delta); } tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), tp->snd_cwnd_clamp)); } EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai); /* * TCP Reno congestion control * This is special case used for fallback as well. */ /* This is Jacobson's slow start and congestion avoidance. * SIGCOMM '88, p. 328. */ __bpf_kfunc void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); if (!tcp_is_cwnd_limited(sk)) return; /* In "safe" area, increase. */ if (tcp_in_slow_start(tp)) { acked = tcp_slow_start(tp, acked); if (!acked) return; } /* In dangerous area, increase slowly. */ tcp_cong_avoid_ai(tp, tcp_snd_cwnd(tp), acked); } EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); /* Slow start threshold is half the congestion window (min 2) */ __bpf_kfunc u32 tcp_reno_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); return max(tcp_snd_cwnd(tp) >> 1U, 2U); } EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); __bpf_kfunc u32 tcp_reno_undo_cwnd(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); return max(tcp_snd_cwnd(tp), tp->prior_cwnd); } EXPORT_SYMBOL_GPL(tcp_reno_undo_cwnd); struct tcp_congestion_ops tcp_reno = { .flags = TCP_CONG_NON_RESTRICTED, .name = "reno", .owner = THIS_MODULE, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, .undo_cwnd = tcp_reno_undo_cwnd, };
1466 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2020 ARM Ltd. */ #ifndef __ASM_VDSO_PROCESSOR_H #define __ASM_VDSO_PROCESSOR_H #ifndef __ASSEMBLY__ /* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ static __always_inline void rep_nop(void) { asm volatile("rep; nop" ::: "memory"); } static __always_inline void cpu_relax(void) { rep_nop(); } struct getcpu_cache; notrace long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused); #endif /* __ASSEMBLY__ */ #endif /* __ASM_VDSO_PROCESSOR_H */
102 565 6276 60 76 97 113 183 203 370 429 6236 6250 6281 6274 401 1125 137 376 39 130 190 10 10 10 390 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 #ifndef _LINUX_JHASH_H #define _LINUX_JHASH_H /* jhash.h: Jenkins hash support. * * Copyright (C) 2006. Bob Jenkins (bob_jenkins@burtleburtle.net) * * https://burtleburtle.net/bob/hash/ * * These are the credits from Bob's sources: * * lookup3.c, by Bob Jenkins, May 2006, Public Domain. * * These are functions for producing 32-bit hashes for hash table lookup. * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final() * are externally useful functions. Routines to test the hash are included * if SELF_TEST is defined. You can use this free for any purpose. It's in * the public domain. It has no warranty. * * Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@netfilter.org) * * I've modified Bob's hash to be useful in the Linux kernel, and * any bugs present are my fault. * Jozsef */ #include <linux/bitops.h> #include <linux/unaligned/packed_struct.h> /* Best hash sizes are of power of two */ #define jhash_size(n) ((u32)1<<(n)) /* Mask the hash value, i.e (value & jhash_mask(n)) instead of (value % n) */ #define jhash_mask(n) (jhash_size(n)-1) /* __jhash_mix -- mix 3 32-bit values reversibly. */ #define __jhash_mix(a, b, c) \ { \ a -= c; a ^= rol32(c, 4); c += b; \ b -= a; b ^= rol32(a, 6); a += c; \ c -= b; c ^= rol32(b, 8); b += a; \ a -= c; a ^= rol32(c, 16); c += b; \ b -= a; b ^= rol32(a, 19); a += c; \ c -= b; c ^= rol32(b, 4); b += a; \ } /* __jhash_final - final mixing of 3 32-bit values (a,b,c) into c */ #define __jhash_final(a, b, c) \ { \ c ^= b; c -= rol32(b, 14); \ a ^= c; a -= rol32(c, 11); \ b ^= a; b -= rol32(a, 25); \ c ^= b; c -= rol32(b, 16); \ a ^= c; a -= rol32(c, 4); \ b ^= a; b -= rol32(a, 14); \ c ^= b; c -= rol32(b, 24); \ } /* An arbitrary initial parameter */ #define JHASH_INITVAL 0xdeadbeef /* jhash - hash an arbitrary key * @k: sequence of bytes as key * @length: the length of the key * @initval: the previous hash, or an arbitray value * * The generic version, hashes an arbitrary sequence of bytes. * No alignment or length assumptions are made about the input key. * * Returns the hash value of the key. The result depends on endianness. */ static inline u32 jhash(const void *key, u32 length, u32 initval) { u32 a, b, c; const u8 *k = key; /* Set up the internal state */ a = b = c = JHASH_INITVAL + length + initval; /* All but the last block: affect some 32 bits of (a,b,c) */ while (length > 12) { a += __get_unaligned_cpu32(k); b += __get_unaligned_cpu32(k + 4); c += __get_unaligned_cpu32(k + 8); __jhash_mix(a, b, c); length -= 12; k += 12; } /* Last block: affect all 32 bits of (c) */ switch (length) { case 12: c += (u32)k[11]<<24; fallthrough; case 11: c += (u32)k[10]<<16; fallthrough; case 10: c += (u32)k[9]<<8; fallthrough; case 9: c += k[8]; fallthrough; case 8: b += (u32)k[7]<<24; fallthrough; case 7: b += (u32)k[6]<<16; fallthrough; case 6: b += (u32)k[5]<<8; fallthrough; case 5: b += k[4]; fallthrough; case 4: a += (u32)k[3]<<24; fallthrough; case 3: a += (u32)k[2]<<16; fallthrough; case 2: a += (u32)k[1]<<8; fallthrough; case 1: a += k[0]; __jhash_final(a, b, c); break; case 0: /* Nothing left to add */ break; } return c; } /* jhash2 - hash an array of u32's * @k: the key which must be an array of u32's * @length: the number of u32's in the key * @initval: the previous hash, or an arbitray value * * Returns the hash value of the key. */ static inline u32 jhash2(const u32 *k, u32 length, u32 initval) { u32 a, b, c; /* Set up the internal state */ a = b = c = JHASH_INITVAL + (length<<2) + initval; /* Handle most of the key */ while (length > 3) { a += k[0]; b += k[1]; c += k[2]; __jhash_mix(a, b, c); length -= 3; k += 3; } /* Handle the last 3 u32's */ switch (length) { case 3: c += k[2]; fallthrough; case 2: b += k[1]; fallthrough; case 1: a += k[0]; __jhash_final(a, b, c); break; case 0: /* Nothing left to add */ break; } return c; } /* __jhash_nwords - hash exactly 3, 2 or 1 word(s) */ static inline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval) { a += initval; b += initval; c += initval; __jhash_final(a, b, c); return c; } static inline u32 jhash_3words(u32 a, u32 b, u32 c, u32 initval) { return __jhash_nwords(a, b, c, initval + JHASH_INITVAL + (3 << 2)); } static inline u32 jhash_2words(u32 a, u32 b, u32 initval) { return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2)); } static inline u32 jhash_1word(u32 a, u32 initval) { return __jhash_nwords(a, 0, 0, initval + JHASH_INITVAL + (1 << 2)); } #endif /* _LINUX_JHASH_H */
11069 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_PGTABLE_INVERT_H #define _ASM_PGTABLE_INVERT_H 1 #ifndef __ASSEMBLY__ /* * A clear pte value is special, and doesn't get inverted. * * Note that even users that only pass a pgprot_t (rather * than a full pte) won't trigger the special zero case, * because even PAGE_NONE has _PAGE_PROTNONE | _PAGE_ACCESSED * set. So the all zero case really is limited to just the * cleared page table entry case. */ static inline bool __pte_needs_invert(u64 val) { return val && !(val & _PAGE_PRESENT); } /* Get a mask to xor with the page table entry to get the correct pfn. */ static inline u64 protnone_mask(u64 val) { return __pte_needs_invert(val) ? ~0ull : 0; } static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask) { /* * When a PTE transitions from NONE to !NONE or vice-versa * invert the PFN part to stop speculation. * pte_pfn undoes this when needed. */ if (__pte_needs_invert(oldval) != __pte_needs_invert(val)) val = (val & ~mask) | (~val & mask); return val; } #endif /* __ASSEMBLY__ */ #endif
15182 6483 4779 4780 4782 4782 4782 4780 4782 4783 4779 4781 4780 4781 4782 4781 4783 6250 6247 6249 6251 6250 6251 6248 6251 6246 6247 6249 6250 6251 6251 6248 6249 6250 84 6219 3083 3083 3081 6215 6217 6217 6217 6072 6072 6072 6072 6072 6071 6214 6225 6225 6225 6225 6225 6225 6225 4781 4780 4638 6160 6222 5411 6222 5412 4783 761 6215 1058 1058 1058 4125 4781 4739 3603 4748 4783 2156 6211 6217 6216 6217 6217 6214 1060 6217 6214 6217 6217 6217 6217 6215 6214 6217 6216 1060 761 5403 12 1058 1058 6212 1058 1060 6216 6215 6216 6214 6215 5094 5096 36 6214 6214 6208 6212 6214 5412 6216 6213 6215 6216 6216 4779 4780 4781 4782 6215 6212 6216 6216 6213 6217 6208 8 1058 1058 4783 4781 4780 4778 4780 4781 4779 4780 4780 4782 4781 4782 4750 2 2 2 8 8 8 8 8 8 8 6204 6203 6191 6205 6205 2037 2025 2035 2035 2037 2037 2035 6204 6202 6201 6200 6209 6204 6202 6203 6202 4631 6203 6203 6204 6203 6204 6204 6205 6202 6220 6212 6219 11 6211 2 2 2 6210 6203 6203 4463 4393 4216 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 1993 Linus Torvalds * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000 * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002 * Numa awareness, Christoph Lameter, SGI, June 2005 * Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019 */ #include <linux/vmalloc.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/highmem.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/interrupt.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/set_memory.h> #include <linux/debugobjects.h> #include <linux/kallsyms.h> #include <linux/list.h> #include <linux/notifier.h> #include <linux/rbtree.h> #include <linux/xarray.h> #include <linux/io.h> #include <linux/rcupdate.h> #include <linux/pfn.h> #include <linux/kmemleak.h> #include <linux/atomic.h> #include <linux/compiler.h> #include <linux/memcontrol.h> #include <linux/llist.h> #include <linux/uio.h> #include <linux/bitops.h> #include <linux/rbtree_augmented.h> #include <linux/overflow.h> #include <linux/pgtable.h> #include <linux/hugetlb.h> #include <linux/sched/mm.h> #include <asm/tlbflush.h> #include <asm/shmparam.h> #define CREATE_TRACE_POINTS #include <trace/events/vmalloc.h> #include "internal.h" #include "pgalloc-track.h" #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP static unsigned int __ro_after_init ioremap_max_page_shift = BITS_PER_LONG - 1; static int __init set_nohugeiomap(char *str) { ioremap_max_page_shift = PAGE_SHIFT; return 0; } early_param("nohugeiomap", set_nohugeiomap); #else /* CONFIG_HAVE_ARCH_HUGE_VMAP */ static const unsigned int ioremap_max_page_shift = PAGE_SHIFT; #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC static bool __ro_after_init vmap_allow_huge = true; static int __init set_nohugevmalloc(char *str) { vmap_allow_huge = false; return 0; } early_param("nohugevmalloc", set_nohugevmalloc); #else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */ static const bool vmap_allow_huge = false; #endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */ bool is_vmalloc_addr(const void *x) { unsigned long addr = (unsigned long)kasan_reset_tag(x); return addr >= VMALLOC_START && addr < VMALLOC_END; } EXPORT_SYMBOL(is_vmalloc_addr); struct vfree_deferred { struct llist_head list; struct work_struct wq; }; static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred); /*** Page table manipulation functions ***/ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift, pgtbl_mod_mask *mask) { pte_t *pte; u64 pfn; unsigned long size = PAGE_SIZE; pfn = phys_addr >> PAGE_SHIFT; pte = pte_alloc_kernel_track(pmd, addr, mask); if (!pte) return -ENOMEM; do { BUG_ON(!pte_none(ptep_get(pte))); #ifdef CONFIG_HUGETLB_PAGE size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift); if (size != PAGE_SIZE) { pte_t entry = pfn_pte(pfn, prot); entry = arch_make_huge_pte(entry, ilog2(size), 0); set_huge_pte_at(&init_mm, addr, pte, entry, size); pfn += PFN_DOWN(size); continue; } #endif set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot)); pfn++; } while (pte += PFN_DOWN(size), addr += size, addr != end); *mask |= PGTBL_PTE_MODIFIED; return 0; } static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift) { if (max_page_shift < PMD_SHIFT) return 0; if (!arch_vmap_pmd_supported(prot)) return 0; if ((end - addr) != PMD_SIZE) return 0; if (!IS_ALIGNED(addr, PMD_SIZE)) return 0; if (!IS_ALIGNED(phys_addr, PMD_SIZE)) return 0; if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr)) return 0; return pmd_set_huge(pmd, phys_addr, prot); } static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift, pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; pmd = pmd_alloc_track(&init_mm, pud, addr, mask); if (!pmd) return -ENOMEM; do { next = pmd_addr_end(addr, end); if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot, max_page_shift)) { *mask |= PGTBL_PMD_MODIFIED; continue; } if (vmap_pte_range(pmd, addr, next, phys_addr, prot, max_page_shift, mask)) return -ENOMEM; } while (pmd++, phys_addr += (next - addr), addr = next, addr != end); return 0; } static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift) { if (max_page_shift < PUD_SHIFT) return 0; if (!arch_vmap_pud_supported(prot)) return 0; if ((end - addr) != PUD_SIZE) return 0; if (!IS_ALIGNED(addr, PUD_SIZE)) return 0; if (!IS_ALIGNED(phys_addr, PUD_SIZE)) return 0; if (pud_present(*pud) && !pud_free_pmd_page(pud, addr)) return 0; return pud_set_huge(pud, phys_addr, prot); } static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift, pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; pud = pud_alloc_track(&init_mm, p4d, addr, mask); if (!pud) return -ENOMEM; do { next = pud_addr_end(addr, end); if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot, max_page_shift)) { *mask |= PGTBL_PUD_MODIFIED; continue; } if (vmap_pmd_range(pud, addr, next, phys_addr, prot, max_page_shift, mask)) return -ENOMEM; } while (pud++, phys_addr += (next - addr), addr = next, addr != end); return 0; } static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift) { if (max_page_shift < P4D_SHIFT) return 0; if (!arch_vmap_p4d_supported(prot)) return 0; if ((end - addr) != P4D_SIZE) return 0; if (!IS_ALIGNED(addr, P4D_SIZE)) return 0; if (!IS_ALIGNED(phys_addr, P4D_SIZE)) return 0; if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr)) return 0; return p4d_set_huge(p4d, phys_addr, prot); } static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift, pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; p4d = p4d_alloc_track(&init_mm, pgd, addr, mask); if (!p4d) return -ENOMEM; do { next = p4d_addr_end(addr, end); if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot, max_page_shift)) { *mask |= PGTBL_P4D_MODIFIED; continue; } if (vmap_pud_range(p4d, addr, next, phys_addr, prot, max_page_shift, mask)) return -ENOMEM; } while (p4d++, phys_addr += (next - addr), addr = next, addr != end); return 0; } static int vmap_range_noflush(unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift) { pgd_t *pgd; unsigned long start; unsigned long next; int err; pgtbl_mod_mask mask = 0; might_sleep(); BUG_ON(addr >= end); start = addr; pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); err = vmap_p4d_range(pgd, addr, next, phys_addr, prot, max_page_shift, &mask); if (err) break; } while (pgd++, phys_addr += (next - addr), addr = next, addr != end); if (mask & ARCH_PAGE_TABLE_SYNC_MASK) arch_sync_kernel_mappings(start, end); return err; } int ioremap_page_range(unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { int err; err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot), ioremap_max_page_shift); flush_cache_vmap(addr, end); if (!err) err = kmsan_ioremap_page_range(addr, end, phys_addr, prot, ioremap_max_page_shift); return err; } static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pgtbl_mod_mask *mask) { pte_t *pte; pte = pte_offset_kernel(pmd, addr); do { pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte); WARN_ON(!pte_none(ptent) && !pte_present(ptent)); } while (pte++, addr += PAGE_SIZE, addr != end); *mask |= PGTBL_PTE_MODIFIED; } static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; int cleared; pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); cleared = pmd_clear_huge(pmd); if (cleared || pmd_bad(*pmd)) *mask |= PGTBL_PMD_MODIFIED; if (cleared) continue; if (pmd_none_or_clear_bad(pmd)) continue; vunmap_pte_range(pmd, addr, next, mask); cond_resched(); } while (pmd++, addr = next, addr != end); } static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; int cleared; pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); cleared = pud_clear_huge(pud); if (cleared || pud_bad(*pud)) *mask |= PGTBL_PUD_MODIFIED; if (cleared) continue; if (pud_none_or_clear_bad(pud)) continue; vunmap_pmd_range(pud, addr, next, mask); } while (pud++, addr = next, addr != end); } static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); p4d_clear_huge(p4d); if (p4d_bad(*p4d)) *mask |= PGTBL_P4D_MODIFIED; if (p4d_none_or_clear_bad(p4d)) continue; vunmap_pud_range(p4d, addr, next, mask); } while (p4d++, addr = next, addr != end); } /* * vunmap_range_noflush is similar to vunmap_range, but does not * flush caches or TLBs. * * The caller is responsible for calling flush_cache_vmap() before calling * this function, and flush_tlb_kernel_range after it has returned * successfully (and before the addresses are expected to cause a page fault * or be re-mapped for something else, if TLB flushes are being delayed or * coalesced). * * This is an internal function only. Do not use outside mm/. */ void __vunmap_range_noflush(unsigned long start, unsigned long end) { unsigned long next; pgd_t *pgd; unsigned long addr = start; pgtbl_mod_mask mask = 0; BUG_ON(addr >= end); pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); if (pgd_bad(*pgd)) mask |= PGTBL_PGD_MODIFIED; if (pgd_none_or_clear_bad(pgd)) continue; vunmap_p4d_range(pgd, addr, next, &mask); } while (pgd++, addr = next, addr != end); if (mask & ARCH_PAGE_TABLE_SYNC_MASK) arch_sync_kernel_mappings(start, end); } void vunmap_range_noflush(unsigned long start, unsigned long end) { kmsan_vunmap_range_noflush(start, end); __vunmap_range_noflush(start, end); } /** * vunmap_range - unmap kernel virtual addresses * @addr: start of the VM area to unmap * @end: end of the VM area to unmap (non-inclusive) * * Clears any present PTEs in the virtual address range, flushes TLBs and * caches. Any subsequent access to the address before it has been re-mapped * is a kernel bug. */ void vunmap_range(unsigned long addr, unsigned long end) { flush_cache_vunmap(addr, end); vunmap_range_noflush(addr, end); flush_tlb_kernel_range(addr, end); } static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { pte_t *pte; /* * nr is a running index into the array which helps higher level * callers keep track of where we're up to. */ pte = pte_alloc_kernel_track(pmd, addr, mask); if (!pte) return -ENOMEM; do { struct page *page = pages[*nr]; if (WARN_ON(!pte_none(ptep_get(pte)))) return -EBUSY; if (WARN_ON(!page)) return -ENOMEM; if (WARN_ON(!pfn_valid(page_to_pfn(page)))) return -EINVAL; set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); (*nr)++; } while (pte++, addr += PAGE_SIZE, addr != end); *mask |= PGTBL_PTE_MODIFIED; return 0; } static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; pmd = pmd_alloc_track(&init_mm, pud, addr, mask); if (!pmd) return -ENOMEM; do { next = pmd_addr_end(addr, end); if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (pmd++, addr = next, addr != end); return 0; } static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; pud = pud_alloc_track(&init_mm, p4d, addr, mask); if (!pud) return -ENOMEM; do { next = pud_addr_end(addr, end); if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (pud++, addr = next, addr != end); return 0; } static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; p4d = p4d_alloc_track(&init_mm, pgd, addr, mask); if (!p4d) return -ENOMEM; do { next = p4d_addr_end(addr, end); if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (p4d++, addr = next, addr != end); return 0; } static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages) { unsigned long start = addr; pgd_t *pgd; unsigned long next; int err = 0; int nr = 0; pgtbl_mod_mask mask = 0; BUG_ON(addr >= end); pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); if (pgd_bad(*pgd)) mask |= PGTBL_PGD_MODIFIED; err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask); if (err) return err; } while (pgd++, addr = next, addr != end); if (mask & ARCH_PAGE_TABLE_SYNC_MASK) arch_sync_kernel_mappings(start, end); return 0; } /* * vmap_pages_range_noflush is similar to vmap_pages_range, but does not * flush caches. * * The caller is responsible for calling flush_cache_vmap() after this * function returns successfully and before the addresses are accessed. * * This is an internal function only. Do not use outside mm/. */ int __vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { unsigned int i, nr = (end - addr) >> PAGE_SHIFT; WARN_ON(page_shift < PAGE_SHIFT); if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) || page_shift == PAGE_SHIFT) return vmap_small_pages_range_noflush(addr, end, prot, pages); for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) { int err; err = vmap_range_noflush(addr, addr + (1UL << page_shift), page_to_phys(pages[i]), prot, page_shift); if (err) return err; addr += 1UL << page_shift; } return 0; } int vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { int ret = kmsan_vmap_pages_range_noflush(addr, end, prot, pages, page_shift); if (ret) return ret; return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift); } /** * vmap_pages_range - map pages to a kernel virtual address * @addr: start of the VM area to map * @end: end of the VM area to map (non-inclusive) * @prot: page protection flags to use * @pages: pages to map (always PAGE_SIZE pages) * @page_shift: maximum shift that the pages may be mapped with, @pages must * be aligned and contiguous up to at least this shift. * * RETURNS: * 0 on success, -errno on failure. */ static int vmap_pages_range(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { int err; err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift); flush_cache_vmap(addr, end); return err; } int is_vmalloc_or_module_addr(const void *x) { /* * ARM, x86-64 and sparc64 put modules in a special place, * and fall back on vmalloc() if that fails. Others * just put it in the vmalloc space. */ #if defined(CONFIG_MODULES) && defined(MODULES_VADDR) unsigned long addr = (unsigned long)kasan_reset_tag(x); if (addr >= MODULES_VADDR && addr < MODULES_END) return 1; #endif return is_vmalloc_addr(x); } EXPORT_SYMBOL_GPL(is_vmalloc_or_module_addr); /* * Walk a vmap address to the struct page it maps. Huge vmap mappings will * return the tail page that corresponds to the base page address, which * matches small vmap mappings. */ struct page *vmalloc_to_page(const void *vmalloc_addr) { unsigned long addr = (unsigned long) vmalloc_addr; struct page *page = NULL; pgd_t *pgd = pgd_offset_k(addr); p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *ptep, pte; /* * XXX we might need to change this if we add VIRTUAL_BUG_ON for * architectures that do not vmalloc module space */ VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr)); if (pgd_none(*pgd)) return NULL; if (WARN_ON_ONCE(pgd_leaf(*pgd))) return NULL; /* XXX: no allowance for huge pgd */ if (WARN_ON_ONCE(pgd_bad(*pgd))) return NULL; p4d = p4d_offset(pgd, addr); if (p4d_none(*p4d)) return NULL; if (p4d_leaf(*p4d)) return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT); if (WARN_ON_ONCE(p4d_bad(*p4d))) return NULL; pud = pud_offset(p4d, addr); if (pud_none(*pud)) return NULL; if (pud_leaf(*pud)) return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); if (WARN_ON_ONCE(pud_bad(*pud))) return NULL; pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) return NULL; if (pmd_leaf(*pmd)) return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); if (WARN_ON_ONCE(pmd_bad(*pmd))) return NULL; ptep = pte_offset_kernel(pmd, addr); pte = ptep_get(ptep); if (pte_present(pte)) page = pte_page(pte); return page; } EXPORT_SYMBOL(vmalloc_to_page); /* * Map a vmalloc()-space virtual address to the physical page frame number. */ unsigned long vmalloc_to_pfn(const void *vmalloc_addr) { return page_to_pfn(vmalloc_to_page(vmalloc_addr)); } EXPORT_SYMBOL(vmalloc_to_pfn); /*** Global kva allocator ***/ #define DEBUG_AUGMENT_PROPAGATE_CHECK 0 #define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0 static DEFINE_SPINLOCK(vmap_area_lock); static DEFINE_SPINLOCK(free_vmap_area_lock); /* Export for kexec only */ LIST_HEAD(vmap_area_list); static struct rb_root vmap_area_root = RB_ROOT; static bool vmap_initialized __read_mostly; static struct rb_root purge_vmap_area_root = RB_ROOT; static LIST_HEAD(purge_vmap_area_list); static DEFINE_SPINLOCK(purge_vmap_area_lock); /* * This kmem_cache is used for vmap_area objects. Instead of * allocating from slab we reuse an object from this cache to * make things faster. Especially in "no edge" splitting of * free block. */ static struct kmem_cache *vmap_area_cachep; /* * This linked list is used in pair with free_vmap_area_root. * It gives O(1) access to prev/next to perform fast coalescing. */ static LIST_HEAD(free_vmap_area_list); /* * This augment red-black tree represents the free vmap space. * All vmap_area objects in this tree are sorted by va->va_start * address. It is used for allocation and merging when a vmap * object is released. * * Each vmap_area node contains a maximum available free block * of its sub-tree, right or left. Therefore it is possible to * find a lowest match of free area. */ static struct rb_root free_vmap_area_root = RB_ROOT; /* * Preload a CPU with one object for "no edge" split case. The * aim is to get rid of allocations from the atomic context, thus * to use more permissive allocation masks. */ static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node); static __always_inline unsigned long va_size(struct vmap_area *va) { return (va->va_end - va->va_start); } static __always_inline unsigned long get_subtree_max_size(struct rb_node *node) { struct vmap_area *va; va = rb_entry_safe(node, struct vmap_area, rb_node); return va ? va->subtree_max_size : 0; } RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb, struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size) static void reclaim_and_purge_vmap_areas(void); static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); static void drain_vmap_area_work(struct work_struct *work); static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work); static atomic_long_t nr_vmalloc_pages; unsigned long vmalloc_nr_pages(void) { return atomic_long_read(&nr_vmalloc_pages); } /* Look up the first VA which satisfies addr < va_end, NULL if none. */ static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr) { struct vmap_area *va = NULL; struct rb_node *n = vmap_area_root.rb_node; addr = (unsigned long)kasan_reset_tag((void *)addr); while (n) { struct vmap_area *tmp; tmp = rb_entry(n, struct vmap_area, rb_node); if (tmp->va_end > addr) { va = tmp; if (tmp->va_start <= addr) break; n = n->rb_left; } else n = n->rb_right; } return va; } static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root) { struct rb_node *n = root->rb_node; addr = (unsigned long)kasan_reset_tag((void *)addr); while (n) { struct vmap_area *va; va = rb_entry(n, struct vmap_area, rb_node); if (addr < va->va_start) n = n->rb_left; else if (addr >= va->va_end) n = n->rb_right; else return va; } return NULL; } /* * This function returns back addresses of parent node * and its left or right link for further processing. * * Otherwise NULL is returned. In that case all further * steps regarding inserting of conflicting overlap range * have to be declined and actually considered as a bug. */ static __always_inline struct rb_node ** find_va_links(struct vmap_area *va, struct rb_root *root, struct rb_node *from, struct rb_node **parent) { struct vmap_area *tmp_va; struct rb_node **link; if (root) { link = &root->rb_node; if (unlikely(!*link)) { *parent = NULL; return link; } } else { link = &from; } /* * Go to the bottom of the tree. When we hit the last point * we end up with parent rb_node and correct direction, i name * it link, where the new va->rb_node will be attached to. */ do { tmp_va = rb_entry(*link, struct vmap_area, rb_node); /* * During the traversal we also do some sanity check. * Trigger the BUG() if there are sides(left/right) * or full overlaps. */ if (va->va_end <= tmp_va->va_start) link = &(*link)->rb_left; else if (va->va_start >= tmp_va->va_end) link = &(*link)->rb_right; else { WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n", va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end); return NULL; } } while (*link); *parent = &tmp_va->rb_node; return link; } static __always_inline struct list_head * get_va_next_sibling(struct rb_node *parent, struct rb_node **link) { struct list_head *list; if (unlikely(!parent)) /* * The red-black tree where we try to find VA neighbors * before merging or inserting is empty, i.e. it means * there is no free vmap space. Normally it does not * happen but we handle this case anyway. */ return NULL; list = &rb_entry(parent, struct vmap_area, rb_node)->list; return (&parent->rb_right == link ? list->next : list); } static __always_inline void __link_va(struct vmap_area *va, struct rb_root *root, struct rb_node *parent, struct rb_node **link, struct list_head *head, bool augment) { /* * VA is still not in the list, but we can * identify its future previous list_head node. */ if (likely(parent)) { head = &rb_entry(parent, struct vmap_area, rb_node)->list; if (&parent->rb_right != link) head = head->prev; } /* Insert to the rb-tree */ rb_link_node(&va->rb_node, parent, link); if (augment) { /* * Some explanation here. Just perform simple insertion * to the tree. We do not set va->subtree_max_size to * its current size before calling rb_insert_augmented(). * It is because we populate the tree from the bottom * to parent levels when the node _is_ in the tree. * * Therefore we set subtree_max_size to zero after insertion, * to let __augment_tree_propagate_from() puts everything to * the correct order later on. */ rb_insert_augmented(&va->rb_node, root, &free_vmap_area_rb_augment_cb); va->subtree_max_size = 0; } else { rb_insert_color(&va->rb_node, root); } /* Address-sort this list */ list_add(&va->list, head); } static __always_inline void link_va(struct vmap_area *va, struct rb_root *root, struct rb_node *parent, struct rb_node **link, struct list_head *head) { __link_va(va, root, parent, link, head, false); } static __always_inline void link_va_augment(struct vmap_area *va, struct rb_root *root, struct rb_node *parent, struct rb_node **link, struct list_head *head) { __link_va(va, root, parent, link, head, true); } static __always_inline void __unlink_va(struct vmap_area *va, struct rb_root *root, bool augment) { if (WARN_ON(RB_EMPTY_NODE(&va->rb_node))) return; if (augment) rb_erase_augmented(&va->rb_node, root, &free_vmap_area_rb_augment_cb); else rb_erase(&va->rb_node, root); list_del_init(&va->list); RB_CLEAR_NODE(&va->rb_node); } static __always_inline void unlink_va(struct vmap_area *va, struct rb_root *root) { __unlink_va(va, root, false); } static __always_inline void unlink_va_augment(struct vmap_area *va, struct rb_root *root) { __unlink_va(va, root, true); } #if DEBUG_AUGMENT_PROPAGATE_CHECK /* * Gets called when remove the node and rotate. */ static __always_inline unsigned long compute_subtree_max_size(struct vmap_area *va) { return max3(va_size(va), get_subtree_max_size(va->rb_node.rb_left), get_subtree_max_size(va->rb_node.rb_right)); } static void augment_tree_propagate_check(void) { struct vmap_area *va; unsigned long computed_size; list_for_each_entry(va, &free_vmap_area_list, list) { computed_size = compute_subtree_max_size(va); if (computed_size != va->subtree_max_size) pr_emerg("tree is corrupted: %lu, %lu\n", va_size(va), va->subtree_max_size); } } #endif /* * This function populates subtree_max_size from bottom to upper * levels starting from VA point. The propagation must be done * when VA size is modified by changing its va_start/va_end. Or * in case of newly inserting of VA to the tree. * * It means that __augment_tree_propagate_from() must be called: * - After VA has been inserted to the tree(free path); * - After VA has been shrunk(allocation path); * - After VA has been increased(merging path). * * Please note that, it does not mean that upper parent nodes * and their subtree_max_size are recalculated all the time up * to the root node. * * 4--8 * /\ * / \ * / \ * 2--2 8--8 * * For example if we modify the node 4, shrinking it to 2, then * no any modification is required. If we shrink the node 2 to 1 * its subtree_max_size is updated only, and set to 1. If we shrink * the node 8 to 6, then its subtree_max_size is set to 6 and parent * node becomes 4--6. */ static __always_inline void augment_tree_propagate_from(struct vmap_area *va) { /* * Populate the tree from bottom towards the root until * the calculated maximum available size of checked node * is equal to its current one. */ free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL); #if DEBUG_AUGMENT_PROPAGATE_CHECK augment_tree_propagate_check(); #endif } static void insert_vmap_area(struct vmap_area *va, struct rb_root *root, struct list_head *head) { struct rb_node **link; struct rb_node *parent; link = find_va_links(va, root, NULL, &parent); if (link) link_va(va, root, parent, link, head); } static void insert_vmap_area_augment(struct vmap_area *va, struct rb_node *from, struct rb_root *root, struct list_head *head) { struct rb_node **link; struct rb_node *parent; if (from) link = find_va_links(va, NULL, from, &parent); else link = find_va_links(va, root, NULL, &parent); if (link) { link_va_augment(va, root, parent, link, head); augment_tree_propagate_from(va); } } /* * Merge de-allocated chunk of VA memory with previous * and next free blocks. If coalesce is not done a new * free area is inserted. If VA has been merged, it is * freed. * * Please note, it can return NULL in case of overlap * ranges, followed by WARN() report. Despite it is a * buggy behaviour, a system can be alive and keep * ongoing. */ static __always_inline struct vmap_area * __merge_or_add_vmap_area(struct vmap_area *va, struct rb_root *root, struct list_head *head, bool augment) { struct vmap_area *sibling; struct list_head *next; struct rb_node **link; struct rb_node *parent; bool merged = false; /* * Find a place in the tree where VA potentially will be * inserted, unless it is merged with its sibling/siblings. */ link = find_va_links(va, root, NULL, &parent); if (!link) return NULL; /* * Get next node of VA to check if merging can be done. */ next = get_va_next_sibling(parent, link); if (unlikely(next == NULL)) goto insert; /* * start end * | | * |<------VA------>|<-----Next----->| * | | * start end */ if (next != head) { sibling = list_entry(next, struct vmap_area, list); if (sibling->va_start == va->va_end) { sibling->va_start = va->va_start; /* Free vmap_area object. */ kmem_cache_free(vmap_area_cachep, va); /* Point to the new merged area. */ va = sibling; merged = true; } } /* * start end * | | * |<-----Prev----->|<------VA------>| * | | * start end */ if (next->prev != head) { sibling = list_entry(next->prev, struct vmap_area, list); if (sibling->va_end == va->va_start) { /* * If both neighbors are coalesced, it is important * to unlink the "next" node first, followed by merging * with "previous" one. Otherwise the tree might not be * fully populated if a sibling's augmented value is * "normalized" because of rotation operations. */ if (merged) __unlink_va(va, root, augment); sibling->va_end = va->va_end; /* Free vmap_area object. */ kmem_cache_free(vmap_area_cachep, va); /* Point to the new merged area. */ va = sibling; merged = true; } } insert: if (!merged) __link_va(va, root, parent, link, head, augment); return va; } static __always_inline struct vmap_area * merge_or_add_vmap_area(struct vmap_area *va, struct rb_root *root, struct list_head *head) { return __merge_or_add_vmap_area(va, root, head, false); } static __always_inline struct vmap_area * merge_or_add_vmap_area_augment(struct vmap_area *va, struct rb_root *root, struct list_head *head) { va = __merge_or_add_vmap_area(va, root, head, true); if (va) augment_tree_propagate_from(va); return va; } static __always_inline bool is_within_this_va(struct vmap_area *va, unsigned long size, unsigned long align, unsigned long vstart) { unsigned long nva_start_addr; if (va->va_start > vstart) nva_start_addr = ALIGN(va->va_start, align); else nva_start_addr = ALIGN(vstart, align); /* Can be overflowed due to big size or alignment. */ if (nva_start_addr + size < nva_start_addr || nva_start_addr < vstart) return false; return (nva_start_addr + size <= va->va_end); } /* * Find the first free block(lowest start address) in the tree, * that will accomplish the request corresponding to passing * parameters. Please note, with an alignment bigger than PAGE_SIZE, * a search length is adjusted to account for worst case alignment * overhead. */ static __always_inline struct vmap_area * find_vmap_lowest_match(struct rb_root *root, unsigned long size, unsigned long align, unsigned long vstart, bool adjust_search_size) { struct vmap_area *va; struct rb_node *node; unsigned long length; /* Start from the root. */ node = root->rb_node; /* Adjust the search size for alignment overhead. */ length = adjust_search_size ? size + align - 1 : size; while (node) { va = rb_entry(node, struct vmap_area, rb_node); if (get_subtree_max_size(node->rb_left) >= length && vstart < va->va_start) { node = node->rb_left; } else { if (is_within_this_va(va, size, align, vstart)) return va; /* * Does not make sense to go deeper towards the right * sub-tree if it does not have a free block that is * equal or bigger to the requested search length. */ if (get_subtree_max_size(node->rb_right) >= length) { node = node->rb_right; continue; } /* * OK. We roll back and find the first right sub-tree, * that will satisfy the search criteria. It can happen * due to "vstart" restriction or an alignment overhead * that is bigger then PAGE_SIZE. */ while ((node = rb_parent(node))) { va = rb_entry(node, struct vmap_area, rb_node); if (is_within_this_va(va, size, align, vstart)) return va; if (get_subtree_max_size(node->rb_right) >= length && vstart <= va->va_start) { /* * Shift the vstart forward. Please note, we update it with * parent's start address adding "1" because we do not want * to enter same sub-tree after it has already been checked * and no suitable free block found there. */ vstart = va->va_start + 1; node = node->rb_right; break; } } } } return NULL; } #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK #include <linux/random.h> static struct vmap_area * find_vmap_lowest_linear_match(struct list_head *head, unsigned long size, unsigned long align, unsigned long vstart) { struct vmap_area *va; list_for_each_entry(va, head, list) { if (!is_within_this_va(va, size, align, vstart)) continue; return va; } return NULL; } static void find_vmap_lowest_match_check(struct rb_root *root, struct list_head *head, unsigned long size, unsigned long align) { struct vmap_area *va_1, *va_2; unsigned long vstart; unsigned int rnd; get_random_bytes(&rnd, sizeof(rnd)); vstart = VMALLOC_START + rnd; va_1 = find_vmap_lowest_match(root, size, align, vstart, false); va_2 = find_vmap_lowest_linear_match(head, size, align, vstart); if (va_1 != va_2) pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n", va_1, va_2, vstart); } #endif enum fit_type { NOTHING_FIT = 0, FL_FIT_TYPE = 1, /* full fit */ LE_FIT_TYPE = 2, /* left edge fit */ RE_FIT_TYPE = 3, /* right edge fit */ NE_FIT_TYPE = 4 /* no edge fit */ }; static __always_inline enum fit_type classify_va_fit_type(struct vmap_area *va, unsigned long nva_start_addr, unsigned long size) { enum fit_type type; /* Check if it is within VA. */ if (nva_start_addr < va->va_start || nva_start_addr + size > va->va_end) return NOTHING_FIT; /* Now classify. */ if (va->va_start == nva_start_addr) { if (va->va_end == nva_start_addr + size) type = FL_FIT_TYPE; else type = LE_FIT_TYPE; } else if (va->va_end == nva_start_addr + size) { type = RE_FIT_TYPE; } else { type = NE_FIT_TYPE; } return type; } static __always_inline int adjust_va_to_fit_type(struct rb_root *root, struct list_head *head, struct vmap_area *va, unsigned long nva_start_addr, unsigned long size) { struct vmap_area *lva = NULL; enum fit_type type = classify_va_fit_type(va, nva_start_addr, size); if (type == FL_FIT_TYPE) { /* * No need to split VA, it fully fits. * * | | * V NVA V * |---------------| */ unlink_va_augment(va, root); kmem_cache_free(vmap_area_cachep, va); } else if (type == LE_FIT_TYPE) { /* * Split left edge of fit VA. * * | | * V NVA V R * |-------|-------| */ va->va_start += size; } else if (type == RE_FIT_TYPE) { /* * Split right edge of fit VA. * * | | * L V NVA V * |-------|-------| */ va->va_end = nva_start_addr; } else if (type == NE_FIT_TYPE) { /* * Split no edge of fit VA. * * | | * L V NVA V R * |---|-------|---| */ lva = __this_cpu_xchg(ne_fit_preload_node, NULL); if (unlikely(!lva)) { /* * For percpu allocator we do not do any pre-allocation * and leave it as it is. The reason is it most likely * never ends up with NE_FIT_TYPE splitting. In case of * percpu allocations offsets and sizes are aligned to * fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE * are its main fitting cases. * * There are a few exceptions though, as an example it is * a first allocation (early boot up) when we have "one" * big free space that has to be split. * * Also we can hit this path in case of regular "vmap" * allocations, if "this" current CPU was not preloaded. * See the comment in alloc_vmap_area() why. If so, then * GFP_NOWAIT is used instead to get an extra object for * split purpose. That is rare and most time does not * occur. * * What happens if an allocation gets failed. Basically, * an "overflow" path is triggered to purge lazily freed * areas to free some memory, then, the "retry" path is * triggered to repeat one more time. See more details * in alloc_vmap_area() function. */ lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT); if (!lva) return -1; } /* * Build the remainder. */ lva->va_start = va->va_start; lva->va_end = nva_start_addr; /* * Shrink this VA to remaining size. */ va->va_start = nva_start_addr + size; } else { return -1; } if (type != FL_FIT_TYPE) { augment_tree_propagate_from(va); if (lva) /* type == NE_FIT_TYPE */ insert_vmap_area_augment(lva, &va->rb_node, root, head); } return 0; } /* * Returns a start address of the newly allocated area, if success. * Otherwise a vend is returned that indicates failure. */ static __always_inline unsigned long __alloc_vmap_area(struct rb_root *root, struct list_head *head, unsigned long size, unsigned long align, unsigned long vstart, unsigned long vend) { bool adjust_search_size = true; unsigned long nva_start_addr; struct vmap_area *va; int ret; /* * Do not adjust when: * a) align <= PAGE_SIZE, because it does not make any sense. * All blocks(their start addresses) are at least PAGE_SIZE * aligned anyway; * b) a short range where a requested size corresponds to exactly * specified [vstart:vend] interval and an alignment > PAGE_SIZE. * With adjusted search length an allocation would not succeed. */ if (align <= PAGE_SIZE || (align > PAGE_SIZE && (vend - vstart) == size)) adjust_search_size = false; va = find_vmap_lowest_match(root, size, align, vstart, adjust_search_size); if (unlikely(!va)) return vend; if (va->va_start > vstart) nva_start_addr = ALIGN(va->va_start, align); else nva_start_addr = ALIGN(vstart, align); /* Check the "vend" restriction. */ if (nva_start_addr + size > vend) return vend; /* Update the free vmap_area. */ ret = adjust_va_to_fit_type(root, head, va, nva_start_addr, size); if (WARN_ON_ONCE(ret)) return vend; #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK find_vmap_lowest_match_check(root, head, size, align); #endif return nva_start_addr; } /* * Free a region of KVA allocated by alloc_vmap_area */ static void free_vmap_area(struct vmap_area *va) { /* * Remove from the busy tree/list. */ spin_lock(&vmap_area_lock); unlink_va(va, &vmap_area_root); spin_unlock(&vmap_area_lock); /* * Insert/Merge it back to the free tree/list. */ spin_lock(&free_vmap_area_lock); merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list); spin_unlock(&free_vmap_area_lock); } static inline void preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node) { struct vmap_area *va = NULL; /* * Preload this CPU with one extra vmap_area object. It is used * when fit type of free area is NE_FIT_TYPE. It guarantees that * a CPU that does an allocation is preloaded. * * We do it in non-atomic context, thus it allows us to use more * permissive allocation masks to be more stable under low memory * condition and high memory pressure. */ if (!this_cpu_read(ne_fit_preload_node)) va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); spin_lock(lock); if (va && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, va)) kmem_cache_free(vmap_area_cachep, va); } /* * Allocate a region of KVA of the specified size and alignment, within the * vstart and vend. */ static struct vmap_area *alloc_vmap_area(unsigned long size, unsigned long align, unsigned long vstart, unsigned long vend, int node, gfp_t gfp_mask, unsigned long va_flags) { struct vmap_area *va; unsigned long freed; unsigned long addr; int purged = 0; int ret; if (unlikely(!size || offset_in_page(size) || !is_power_of_2(align))) return ERR_PTR(-EINVAL); if (unlikely(!vmap_initialized)) return ERR_PTR(-EBUSY); might_sleep(); gfp_mask = gfp_mask & GFP_RECLAIM_MASK; va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); if (unlikely(!va)) return ERR_PTR(-ENOMEM); /* * Only scan the relevant parts containing pointers to other objects * to avoid false negatives. */ kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask); retry: preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node); addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list, size, align, vstart, vend); spin_unlock(&free_vmap_area_lock); trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend); /* * If an allocation fails, the "vend" address is * returned. Therefore trigger the overflow path. */ if (unlikely(addr == vend)) goto overflow; va->va_start = addr; va->va_end = addr + size; va->vm = NULL; va->flags = va_flags; spin_lock(&vmap_area_lock); insert_vmap_area(va, &vmap_area_root, &vmap_area_list); spin_unlock(&vmap_area_lock); BUG_ON(!IS_ALIGNED(va->va_start, align)); BUG_ON(va->va_start < vstart); BUG_ON(va->va_end > vend); ret = kasan_populate_vmalloc(addr, size); if (ret) { free_vmap_area(va); return ERR_PTR(ret); } return va; overflow: if (!purged) { reclaim_and_purge_vmap_areas(); purged = 1; goto retry; } freed = 0; blocking_notifier_call_chain(&vmap_notify_list, 0, &freed); if (freed > 0) { purged = 0; goto retry; } if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n", size); kmem_cache_free(vmap_area_cachep, va); return ERR_PTR(-EBUSY); } int register_vmap_purge_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&vmap_notify_list, nb); } EXPORT_SYMBOL_GPL(register_vmap_purge_notifier); int unregister_vmap_purge_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&vmap_notify_list, nb); } EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier); /* * lazy_max_pages is the maximum amount of virtual address space we gather up * before attempting to purge with a TLB flush. * * There is a tradeoff here: a larger number will cover more kernel page tables * and take slightly longer to purge, but it will linearly reduce the number of * global TLB flushes that must be performed. It would seem natural to scale * this number up linearly with the number of CPUs (because vmapping activity * could also scale linearly with the number of CPUs), however it is likely * that in practice, workloads might be constrained in other ways that mean * vmap activity will not scale linearly with CPUs. Also, I want to be * conservative and not introduce a big latency on huge systems, so go with * a less aggressive log scale. It will still be an improvement over the old * code, and it will be simple to change the scale factor if we find that it * becomes a problem on bigger systems. */ static unsigned long lazy_max_pages(void) { unsigned int log; log = fls(num_online_cpus()); return log * (32UL * 1024 * 1024 / PAGE_SIZE); } static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0); /* * Serialize vmap purging. There is no actual critical section protected * by this lock, but we want to avoid concurrent calls for performance * reasons and to make the pcpu_get_vm_areas more deterministic. */ static DEFINE_MUTEX(vmap_purge_lock); /* for per-CPU blocks */ static void purge_fragmented_blocks_allcpus(void); /* * Purges all lazily-freed vmap areas. */ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) { unsigned long resched_threshold; unsigned int num_purged_areas = 0; struct list_head local_purge_list; struct vmap_area *va, *n_va; lockdep_assert_held(&vmap_purge_lock); spin_lock(&purge_vmap_area_lock); purge_vmap_area_root = RB_ROOT; list_replace_init(&purge_vmap_area_list, &local_purge_list); spin_unlock(&purge_vmap_area_lock); if (unlikely(list_empty(&local_purge_list))) goto out; start = min(start, list_first_entry(&local_purge_list, struct vmap_area, list)->va_start); end = max(end, list_last_entry(&local_purge_list, struct vmap_area, list)->va_end); flush_tlb_kernel_range(start, end); resched_threshold = lazy_max_pages() << 1; spin_lock(&free_vmap_area_lock); list_for_each_entry_safe(va, n_va, &local_purge_list, list) { unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; unsigned long orig_start = va->va_start; unsigned long orig_end = va->va_end; /* * Finally insert or merge lazily-freed area. It is * detached and there is no need to "unlink" it from * anything. */ va = merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list); if (!va) continue; if (is_vmalloc_or_module_addr((void *)orig_start)) kasan_release_vmalloc(orig_start, orig_end, va->va_start, va->va_end); atomic_long_sub(nr, &vmap_lazy_nr); num_purged_areas++; if (atomic_long_read(&vmap_lazy_nr) < resched_threshold) cond_resched_lock(&free_vmap_area_lock); } spin_unlock(&free_vmap_area_lock); out: trace_purge_vmap_area_lazy(start, end, num_purged_areas); return num_purged_areas > 0; } /* * Reclaim vmap areas by purging fragmented blocks and purge_vmap_area_list. */ static void reclaim_and_purge_vmap_areas(void) { mutex_lock(&vmap_purge_lock); purge_fragmented_blocks_allcpus(); __purge_vmap_area_lazy(ULONG_MAX, 0); mutex_unlock(&vmap_purge_lock); } static void drain_vmap_area_work(struct work_struct *work) { unsigned long nr_lazy; do { mutex_lock(&vmap_purge_lock); __purge_vmap_area_lazy(ULONG_MAX, 0); mutex_unlock(&vmap_purge_lock); /* Recheck if further work is required. */ nr_lazy = atomic_long_read(&vmap_lazy_nr); } while (nr_lazy > lazy_max_pages()); } /* * Free a vmap area, caller ensuring that the area has been unmapped, * unlinked and flush_cache_vunmap had been called for the correct * range previously. */ static void free_vmap_area_noflush(struct vmap_area *va) { unsigned long nr_lazy_max = lazy_max_pages(); unsigned long va_start = va->va_start; unsigned long nr_lazy; if (WARN_ON_ONCE(!list_empty(&va->list))) return; nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); /* * Merge or place it to the purge tree/list. */ spin_lock(&purge_vmap_area_lock); merge_or_add_vmap_area(va, &purge_vmap_area_root, &purge_vmap_area_list); spin_unlock(&purge_vmap_area_lock); trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max); /* After this point, we may free va at any time */ if (unlikely(nr_lazy > nr_lazy_max)) schedule_work(&drain_vmap_work); } /* * Free and unmap a vmap area */ static void free_unmap_vmap_area(struct vmap_area *va) { flush_cache_vunmap(va->va_start, va->va_end); vunmap_range_noflush(va->va_start, va->va_end); if (debug_pagealloc_enabled_static()) flush_tlb_kernel_range(va->va_start, va->va_end); free_vmap_area_noflush(va); } struct vmap_area *find_vmap_area(unsigned long addr) { struct vmap_area *va; spin_lock(&vmap_area_lock); va = __find_vmap_area(addr, &vmap_area_root); spin_unlock(&vmap_area_lock); return va; } static struct vmap_area *find_unlink_vmap_area(unsigned long addr) { struct vmap_area *va; spin_lock(&vmap_area_lock); va = __find_vmap_area(addr, &vmap_area_root); if (va) unlink_va(va, &vmap_area_root); spin_unlock(&vmap_area_lock); return va; } /*** Per cpu kva allocator ***/ /* * vmap space is limited especially on 32 bit architectures. Ensure there is * room for at least 16 percpu vmap blocks per CPU. */ /* * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able * to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). Guess * instead (we just need a rough idea) */ #if BITS_PER_LONG == 32 #define VMALLOC_SPACE (128UL*1024*1024) #else #define VMALLOC_SPACE (128UL*1024*1024*1024) #endif #define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE) #define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */ #define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */ #define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) #define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ #define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ #define VMAP_BBMAP_BITS \ VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16)) #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) /* * Purge threshold to prevent overeager purging of fragmented blocks for * regular operations: Purge if vb->free is less than 1/4 of the capacity. */ #define VMAP_PURGE_THRESHOLD (VMAP_BBMAP_BITS / 4) #define VMAP_RAM 0x1 /* indicates vm_map_ram area*/ #define VMAP_BLOCK 0x2 /* mark out the vmap_block sub-type*/ #define VMAP_FLAGS_MASK 0x3 struct vmap_block_queue { spinlock_t lock; struct list_head free; /* * An xarray requires an extra memory dynamically to * be allocated. If it is an issue, we can use rb-tree * instead. */ struct xarray vmap_blocks; }; struct vmap_block { spinlock_t lock; struct vmap_area *va; unsigned long free, dirty; DECLARE_BITMAP(used_map, VMAP_BBMAP_BITS); unsigned long dirty_min, dirty_max; /*< dirty range */ struct list_head free_list; struct rcu_head rcu_head; struct list_head purge; }; /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue); /* * In order to fast access to any "vmap_block" associated with a * specific address, we use a hash. * * A per-cpu vmap_block_queue is used in both ways, to serialize * an access to free block chains among CPUs(alloc path) and it * also acts as a vmap_block hash(alloc/free paths). It means we * overload it, since we already have the per-cpu array which is * used as a hash table. When used as a hash a 'cpu' passed to * per_cpu() is not actually a CPU but rather a hash index. * * A hash function is addr_to_vb_xa() which hashes any address * to a specific index(in a hash) it belongs to. This then uses a * per_cpu() macro to access an array with generated index. * * An example: * * CPU_1 CPU_2 CPU_0 * | | | * V V V * 0 10 20 30 40 50 60 * |------|------|------|------|------|------|...<vmap address space> * CPU0 CPU1 CPU2 CPU0 CPU1 CPU2 * * - CPU_1 invokes vm_unmap_ram(6), 6 belongs to CPU0 zone, thus * it access: CPU0/INDEX0 -> vmap_blocks -> xa_lock; * * - CPU_2 invokes vm_unmap_ram(11), 11 belongs to CPU1 zone, thus * it access: CPU1/INDEX1 -> vmap_blocks -> xa_lock; * * - CPU_0 invokes vm_unmap_ram(20), 20 belongs to CPU2 zone, thus * it access: CPU2/INDEX2 -> vmap_blocks -> xa_lock. * * This technique almost always avoids lock contention on insert/remove, * however xarray spinlocks protect against any contention that remains. */ static struct xarray * addr_to_vb_xa(unsigned long addr) { int index = (addr / VMAP_BLOCK_SIZE) % num_possible_cpus(); return &per_cpu(vmap_block_queue, index).vmap_blocks; } /* * We should probably have a fallback mechanism to allocate virtual memory * out of partially filled vmap blocks. However vmap block sizing should be * fairly reasonable according to the vmalloc size, so it shouldn't be a * big problem. */ static unsigned long addr_to_vb_idx(unsigned long addr) { addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1); addr /= VMAP_BLOCK_SIZE; return addr; } static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off) { unsigned long addr; addr = va_start + (pages_off << PAGE_SHIFT); BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start)); return (void *)addr; } /** * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this * block. Of course pages number can't exceed VMAP_BBMAP_BITS * @order: how many 2^order pages should be occupied in newly allocated block * @gfp_mask: flags for the page level allocator * * Return: virtual address in a newly allocated block or ERR_PTR(-errno) */ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) { struct vmap_block_queue *vbq; struct vmap_block *vb; struct vmap_area *va; struct xarray *xa; unsigned long vb_idx; int node, err; void *vaddr; node = numa_node_id(); vb = kmalloc_node(sizeof(struct vmap_block), gfp_mask & GFP_RECLAIM_MASK, node); if (unlikely(!vb)) return ERR_PTR(-ENOMEM); va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, VMALLOC_START, VMALLOC_END, node, gfp_mask, VMAP_RAM|VMAP_BLOCK); if (IS_ERR(va)) { kfree(vb); return ERR_CAST(va); } vaddr = vmap_block_vaddr(va->va_start, 0); spin_lock_init(&vb->lock); vb->va = va; /* At least something should be left free */ BUG_ON(VMAP_BBMAP_BITS <= (1UL << order)); bitmap_zero(vb->used_map, VMAP_BBMAP_BITS); vb->free = VMAP_BBMAP_BITS - (1UL << order); vb->dirty = 0; vb->dirty_min = VMAP_BBMAP_BITS; vb->dirty_max = 0; bitmap_set(vb->used_map, 0, (1UL << order)); INIT_LIST_HEAD(&vb->free_list); xa = addr_to_vb_xa(va->va_start); vb_idx = addr_to_vb_idx(va->va_start); err = xa_insert(xa, vb_idx, vb, gfp_mask); if (err) { kfree(vb); free_vmap_area(va); return ERR_PTR(err); } vbq = raw_cpu_ptr(&vmap_block_queue); spin_lock(&vbq->lock); list_add_tail_rcu(&vb->free_list, &vbq->free); spin_unlock(&vbq->lock); return vaddr; } static void free_vmap_block(struct vmap_block *vb) { struct vmap_block *tmp; struct xarray *xa; xa = addr_to_vb_xa(vb->va->va_start); tmp = xa_erase(xa, addr_to_vb_idx(vb->va->va_start)); BUG_ON(tmp != vb); spin_lock(&vmap_area_lock); unlink_va(vb->va, &vmap_area_root); spin_unlock(&vmap_area_lock); free_vmap_area_noflush(vb->va); kfree_rcu(vb, rcu_head); } static bool purge_fragmented_block(struct vmap_block *vb, struct vmap_block_queue *vbq, struct list_head *purge_list, bool force_purge) { if (vb->free + vb->dirty != VMAP_BBMAP_BITS || vb->dirty == VMAP_BBMAP_BITS) return false; /* Don't overeagerly purge usable blocks unless requested */ if (!(force_purge || vb->free < VMAP_PURGE_THRESHOLD)) return false; /* prevent further allocs after releasing lock */ WRITE_ONCE(vb->free, 0); /* prevent purging it again */ WRITE_ONCE(vb->dirty, VMAP_BBMAP_BITS); vb->dirty_min = 0; vb->dirty_max = VMAP_BBMAP_BITS; spin_lock(&vbq->lock); list_del_rcu(&vb->free_list); spin_unlock(&vbq->lock); list_add_tail(&vb->purge, purge_list); return true; } static void free_purged_blocks(struct list_head *purge_list) { struct vmap_block *vb, *n_vb; list_for_each_entry_safe(vb, n_vb, purge_list, purge) { list_del(&vb->purge); free_vmap_block(vb); } } static void purge_fragmented_blocks(int cpu) { LIST_HEAD(purge); struct vmap_block *vb; struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); rcu_read_lock(); list_for_each_entry_rcu(vb, &vbq->free, free_list) { unsigned long free = READ_ONCE(vb->free); unsigned long dirty = READ_ONCE(vb->dirty); if (free + dirty != VMAP_BBMAP_BITS || dirty == VMAP_BBMAP_BITS) continue; spin_lock(&vb->lock); purge_fragmented_block(vb, vbq, &purge, true); spin_unlock(&vb->lock); } rcu_read_unlock(); free_purged_blocks(&purge); } static void purge_fragmented_blocks_allcpus(void) { int cpu; for_each_possible_cpu(cpu) purge_fragmented_blocks(cpu); } static void *vb_alloc(unsigned long size, gfp_t gfp_mask) { struct vmap_block_queue *vbq; struct vmap_block *vb; void *vaddr = NULL; unsigned int order; BUG_ON(offset_in_page(size)); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); if (WARN_ON(size == 0)) { /* * Allocating 0 bytes isn't what caller wants since * get_order(0) returns funny result. Just warn and terminate * early. */ return NULL; } order = get_order(size); rcu_read_lock(); vbq = raw_cpu_ptr(&vmap_block_queue); list_for_each_entry_rcu(vb, &vbq->free, free_list) { unsigned long pages_off; if (READ_ONCE(vb->free) < (1UL << order)) continue; spin_lock(&vb->lock); if (vb->free < (1UL << order)) { spin_unlock(&vb->lock); continue; } pages_off = VMAP_BBMAP_BITS - vb->free; vaddr = vmap_block_vaddr(vb->va->va_start, pages_off); WRITE_ONCE(vb->free, vb->free - (1UL << order)); bitmap_set(vb->used_map, pages_off, (1UL << order)); if (vb->free == 0) { spin_lock(&vbq->lock); list_del_rcu(&vb->free_list); spin_unlock(&vbq->lock); } spin_unlock(&vb->lock); break; } rcu_read_unlock(); /* Allocate new block if nothing was found */ if (!vaddr) vaddr = new_vmap_block(order, gfp_mask); return vaddr; } static void vb_free(unsigned long addr, unsigned long size) { unsigned long offset; unsigned int order; struct vmap_block *vb; struct xarray *xa; BUG_ON(offset_in_page(size)); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); flush_cache_vunmap(addr, addr + size); order = get_order(size); offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT; xa = addr_to_vb_xa(addr); vb = xa_load(xa, addr_to_vb_idx(addr)); spin_lock(&vb->lock); bitmap_clear(vb->used_map, offset, (1UL << order)); spin_unlock(&vb->lock); vunmap_range_noflush(addr, addr + size); if (debug_pagealloc_enabled_static()) flush_tlb_kernel_range(addr, addr + size); spin_lock(&vb->lock); /* Expand the not yet TLB flushed dirty range */ vb->dirty_min = min(vb->dirty_min, offset); vb->dirty_max = max(vb->dirty_max, offset + (1UL << order)); WRITE_ONCE(vb->dirty, vb->dirty + (1UL << order)); if (vb->dirty == VMAP_BBMAP_BITS) { BUG_ON(vb->free); spin_unlock(&vb->lock); free_vmap_block(vb); } else spin_unlock(&vb->lock); } static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush) { LIST_HEAD(purge_list); int cpu; if (unlikely(!vmap_initialized)) return; mutex_lock(&vmap_purge_lock); for_each_possible_cpu(cpu) { struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); struct vmap_block *vb; unsigned long idx; rcu_read_lock(); xa_for_each(&vbq->vmap_blocks, idx, vb) { spin_lock(&vb->lock); /* * Try to purge a fragmented block first. If it's * not purgeable, check whether there is dirty * space to be flushed. */ if (!purge_fragmented_block(vb, vbq, &purge_list, false) && vb->dirty_max && vb->dirty != VMAP_BBMAP_BITS) { unsigned long va_start = vb->va->va_start; unsigned long s, e; s = va_start + (vb->dirty_min << PAGE_SHIFT); e = va_start + (vb->dirty_max << PAGE_SHIFT); start = min(s, start); end = max(e, end); /* Prevent that this is flushed again */ vb->dirty_min = VMAP_BBMAP_BITS; vb->dirty_max = 0; flush = 1; } spin_unlock(&vb->lock); } rcu_read_unlock(); } free_purged_blocks(&purge_list); if (!__purge_vmap_area_lazy(start, end) && flush) flush_tlb_kernel_range(start, end); mutex_unlock(&vmap_purge_lock); } /** * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer * * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily * to amortize TLB flushing overheads. What this means is that any page you * have now, may, in a former life, have been mapped into kernel virtual * address by the vmap layer and so there might be some CPUs with TLB entries * still referencing that page (additional to the regular 1:1 kernel mapping). * * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can * be sure that none of the pages we have control over will have any aliases * from the vmap layer. */ void vm_unmap_aliases(void) { unsigned long start = ULONG_MAX, end = 0; int flush = 0; _vm_unmap_aliases(start, end, flush); } EXPORT_SYMBOL_GPL(vm_unmap_aliases); /** * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram * @mem: the pointer returned by vm_map_ram * @count: the count passed to that vm_map_ram call (cannot unmap partial) */ void vm_unmap_ram(const void *mem, unsigned int count) { unsigned long size = (unsigned long)count << PAGE_SHIFT; unsigned long addr = (unsigned long)kasan_reset_tag(mem); struct vmap_area *va; might_sleep(); BUG_ON(!addr); BUG_ON(addr < VMALLOC_START); BUG_ON(addr > VMALLOC_END); BUG_ON(!PAGE_ALIGNED(addr)); kasan_poison_vmalloc(mem, size); if (likely(count <= VMAP_MAX_ALLOC)) { debug_check_no_locks_freed(mem, size); vb_free(addr, size); return; } va = find_unlink_vmap_area(addr); if (WARN_ON_ONCE(!va)) return; debug_check_no_locks_freed((void *)va->va_start, (va->va_end - va->va_start)); free_unmap_vmap_area(va); } EXPORT_SYMBOL(vm_unmap_ram); /** * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space) * @pages: an array of pointers to the pages to be mapped * @count: number of pages * @node: prefer to allocate data structures on this node * * If you use this function for less than VMAP_MAX_ALLOC pages, it could be * faster than vmap so it's good. But if you mix long-life and short-life * objects with vm_map_ram(), it could consume lots of address space through * fragmentation (especially on a 32bit machine). You could see failures in * the end. Please use this function for short-lived objects. * * Returns: a pointer to the address that has been mapped, or %NULL on failure */ void *vm_map_ram(struct page **pages, unsigned int count, int node) { unsigned long size = (unsigned long)count << PAGE_SHIFT; unsigned long addr; void *mem; if (likely(count <= VMAP_MAX_ALLOC)) { mem = vb_alloc(size, GFP_KERNEL); if (IS_ERR(mem)) return NULL; addr = (unsigned long)mem; } else { struct vmap_area *va; va = alloc_vmap_area(size, PAGE_SIZE, VMALLOC_START, VMALLOC_END, node, GFP_KERNEL, VMAP_RAM); if (IS_ERR(va)) return NULL; addr = va->va_start; mem = (void *)addr; } if (vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, PAGE_SHIFT) < 0) { vm_unmap_ram(mem, count); return NULL; } /* * Mark the pages as accessible, now that they are mapped. * With hardware tag-based KASAN, marking is skipped for * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). */ mem = kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_PROT_NORMAL); return mem; } EXPORT_SYMBOL(vm_map_ram); static struct vm_struct *vmlist __initdata; static inline unsigned int vm_area_page_order(struct vm_struct *vm) { #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC return vm->page_order; #else return 0; #endif } static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order) { #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC vm->page_order = order; #else BUG_ON(order != 0); #endif } /** * vm_area_add_early - add vmap area early during boot * @vm: vm_struct to add * * This function is used to add fixed kernel vm area to vmlist before * vmalloc_init() is called. @vm->addr, @vm->size, and @vm->flags * should contain proper values and the other fields should be zero. * * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. */ void __init vm_area_add_early(struct vm_struct *vm) { struct vm_struct *tmp, **p; BUG_ON(vmap_initialized); for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { if (tmp->addr >= vm->addr) { BUG_ON(tmp->addr < vm->addr + vm->size); break; } else BUG_ON(tmp->addr + tmp->size > vm->addr); } vm->next = *p; *p = vm; } /** * vm_area_register_early - register vmap area early during boot * @vm: vm_struct to register * @align: requested alignment * * This function is used to register kernel vm area before * vmalloc_init() is called. @vm->size and @vm->flags should contain * proper values on entry and other fields should be zero. On return, * vm->addr contains the allocated address. * * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. */ void __init vm_area_register_early(struct vm_struct *vm, size_t align) { unsigned long addr = ALIGN(VMALLOC_START, align); struct vm_struct *cur, **p; BUG_ON(vmap_initialized); for (p = &vmlist; (cur = *p) != NULL; p = &cur->next) { if ((unsigned long)cur->addr - addr >= vm->size) break; addr = ALIGN((unsigned long)cur->addr + cur->size, align); } BUG_ON(addr > VMALLOC_END - vm->size); vm->addr = (void *)addr; vm->next = *p; *p = vm; kasan_populate_early_vm_area_shadow(vm->addr, vm->size); } static void vmap_init_free_space(void) { unsigned long vmap_start = 1; const unsigned long vmap_end = ULONG_MAX; struct vmap_area *busy, *free; /* * B F B B B F * -|-----|.....|-----|-----|-----|.....|- * | The KVA space | * |<--------------------------------->| */ list_for_each_entry(busy, &vmap_area_list, list) { if (busy->va_start - vmap_start > 0) { free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); if (!WARN_ON_ONCE(!free)) { free->va_start = vmap_start; free->va_end = busy->va_start; insert_vmap_area_augment(free, NULL, &free_vmap_area_root, &free_vmap_area_list); } } vmap_start = busy->va_end; } if (vmap_end - vmap_start > 0) { free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); if (!WARN_ON_ONCE(!free)) { free->va_start = vmap_start; free->va_end = vmap_end; insert_vmap_area_augment(free, NULL, &free_vmap_area_root, &free_vmap_area_list); } } } static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) { vm->flags = flags; vm->addr = (void *)va->va_start; vm->size = va->va_end - va->va_start; vm->caller = caller; va->vm = vm; } static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) { spin_lock(&vmap_area_lock); setup_vmalloc_vm_locked(vm, va, flags, caller); spin_unlock(&vmap_area_lock); } static void clear_vm_uninitialized_flag(struct vm_struct *vm) { /* * Before removing VM_UNINITIALIZED, * we should make sure that vm has proper values. * Pair with smp_rmb() in show_numa_info(). */ smp_wmb(); vm->flags &= ~VM_UNINITIALIZED; } static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long align, unsigned long shift, unsigned long flags, unsigned long start, unsigned long end, int node, gfp_t gfp_mask, const void *caller) { struct vmap_area *va; struct vm_struct *area; unsigned long requested_size = size; BUG_ON(in_interrupt()); size = ALIGN(size, 1ul << shift); if (unlikely(!size)) return NULL; if (flags & VM_IOREMAP) align = 1ul << clamp_t(int, get_count_order_long(size), PAGE_SHIFT, IOREMAP_MAX_ORDER); area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); if (unlikely(!area)) return NULL; if (!(flags & VM_NO_GUARD)) size += PAGE_SIZE; va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0); if (IS_ERR(va)) { kfree(area); return NULL; } setup_vmalloc_vm(area, va, flags, caller); /* * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a * best-effort approach, as they can be mapped outside of vmalloc code. * For VM_ALLOC mappings, the pages are marked as accessible after * getting mapped in __vmalloc_node_range(). * With hardware tag-based KASAN, marking is skipped for * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). */ if (!(flags & VM_ALLOC)) area->addr = kasan_unpoison_vmalloc(area->addr, requested_size, KASAN_VMALLOC_PROT_NORMAL); return area; } struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, const void *caller) { return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, start, end, NUMA_NO_NODE, GFP_KERNEL, caller); } /** * get_vm_area - reserve a contiguous kernel virtual area * @size: size of the area * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC * * Search an area of @size in the kernel virtual mapping area, * and reserved it for out purposes. Returns the area descriptor * on success or %NULL on failure. * * Return: the area descriptor on success or %NULL on failure. */ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) { return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, VMALLOC_START, VMALLOC_END, NUMA_NO_NODE, GFP_KERNEL, __builtin_return_address(0)); } struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, const void *caller) { return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, VMALLOC_START, VMALLOC_END, NUMA_NO_NODE, GFP_KERNEL, caller); } /** * find_vm_area - find a continuous kernel virtual area * @addr: base address * * Search for the kernel VM area starting at @addr, and return it. * It is up to the caller to do all required locking to keep the returned * pointer valid. * * Return: the area descriptor on success or %NULL on failure. */ struct vm_struct *find_vm_area(const void *addr) { struct vmap_area *va; va = find_vmap_area((unsigned long)addr); if (!va) return NULL; return va->vm; } /** * remove_vm_area - find and remove a continuous kernel virtual area * @addr: base address * * Search for the kernel VM area starting at @addr, and remove it. * This function returns the found VM area, but using it is NOT safe * on SMP machines, except for its size or flags. * * Return: the area descriptor on success or %NULL on failure. */ struct vm_struct *remove_vm_area(const void *addr) { struct vmap_area *va; struct vm_struct *vm; might_sleep(); if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n", addr)) return NULL; va = find_unlink_vmap_area((unsigned long)addr); if (!va || !va->vm) return NULL; vm = va->vm; debug_check_no_locks_freed(vm->addr, get_vm_area_size(vm)); debug_check_no_obj_freed(vm->addr, get_vm_area_size(vm)); kasan_free_module_shadow(vm); kasan_poison_vmalloc(vm->addr, get_vm_area_size(vm)); free_unmap_vmap_area(va); return vm; } static inline void set_area_direct_map(const struct vm_struct *area, int (*set_direct_map)(struct page *page)) { int i; /* HUGE_VMALLOC passes small pages to set_direct_map */ for (i = 0; i < area->nr_pages; i++) if (page_address(area->pages[i])) set_direct_map(area->pages[i]); } /* * Flush the vm mapping and reset the direct map. */ static void vm_reset_perms(struct vm_struct *area) { unsigned long start = ULONG_MAX, end = 0; unsigned int page_order = vm_area_page_order(area); int flush_dmap = 0; int i; /* * Find the start and end range of the direct mappings to make sure that * the vm_unmap_aliases() flush includes the direct map. */ for (i = 0; i < area->nr_pages; i += 1U << page_order) { unsigned long addr = (unsigned long)page_address(area->pages[i]); if (addr) { unsigned long page_size; page_size = PAGE_SIZE << page_order; start = min(addr, start); end = max(addr + page_size, end); flush_dmap = 1; } } /* * Set direct map to something invalid so that it won't be cached if * there are any accesses after the TLB flush, then flush the TLB and * reset the direct map permissions to the default. */ set_area_direct_map(area, set_direct_map_invalid_noflush); _vm_unmap_aliases(start, end, flush_dmap); set_area_direct_map(area, set_direct_map_default_noflush); } static void delayed_vfree_work(struct work_struct *w) { struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); struct llist_node *t, *llnode; llist_for_each_safe(llnode, t, llist_del_all(&p->list)) vfree(llnode); } /** * vfree_atomic - release memory allocated by vmalloc() * @addr: memory base address * * This one is just like vfree() but can be called in any atomic context * except NMIs. */ void vfree_atomic(const void *addr) { struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); BUG_ON(in_nmi()); kmemleak_free(addr); /* * Use raw_cpu_ptr() because this can be called from preemptible * context. Preemption is absolutely fine here, because the llist_add() * implementation is lockless, so it works even if we are adding to * another cpu's list. schedule_work() should be fine with this too. */ if (addr && llist_add((struct llist_node *)addr, &p->list)) schedule_work(&p->wq); } /** * vfree - Release memory allocated by vmalloc() * @addr: Memory base address * * Free the virtually continuous memory area starting at @addr, as obtained * from one of the vmalloc() family of APIs. This will usually also free the * physical memory underlying the virtual allocation, but that memory is * reference counted, so it will not be freed until the last user goes away. * * If @addr is NULL, no operation is performed. * * Context: * May sleep if called *not* from interrupt context. * Must not be called in NMI context (strictly speaking, it could be * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling * conventions for vfree() arch-dependent would be a really bad idea). */ void vfree(const void *addr) { struct vm_struct *vm; int i; if (unlikely(in_interrupt())) { vfree_atomic(addr); return; } BUG_ON(in_nmi()); kmemleak_free(addr); might_sleep(); if (!addr) return; vm = remove_vm_area(addr); if (unlikely(!vm)) { WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", addr); return; } if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS)) vm_reset_perms(vm); for (i = 0; i < vm->nr_pages; i++) { struct page *page = vm->pages[i]; BUG_ON(!page); mod_memcg_page_state(page, MEMCG_VMALLOC, -1); /* * High-order allocs for huge vmallocs are split, so * can be freed as an array of order-0 allocations */ __free_page(page); cond_resched(); } atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages); kvfree(vm->pages); kfree(vm); } EXPORT_SYMBOL(vfree); /** * vunmap - release virtual mapping obtained by vmap() * @addr: memory base address * * Free the virtually contiguous memory area starting at @addr, * which was created from the page array passed to vmap(). * * Must not be called in interrupt context. */ void vunmap(const void *addr) { struct vm_struct *vm; BUG_ON(in_interrupt()); might_sleep(); if (!addr) return; vm = remove_vm_area(addr); if (unlikely(!vm)) { WARN(1, KERN_ERR "Trying to vunmap() nonexistent vm area (%p)\n", addr); return; } kfree(vm); } EXPORT_SYMBOL(vunmap); /** * vmap - map an array of pages into virtually contiguous space * @pages: array of page pointers * @count: number of pages to map * @flags: vm_area->flags * @prot: page protection for the mapping * * Maps @count pages from @pages into contiguous kernel virtual space. * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself * (which must be kmalloc or vmalloc memory) and one reference per pages in it * are transferred from the caller to vmap(), and will be freed / dropped when * vfree() is called on the return value. * * Return: the address of the area or %NULL on failure */ void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot) { struct vm_struct *area; unsigned long addr; unsigned long size; /* In bytes */ might_sleep(); if (WARN_ON_ONCE(flags & VM_FLUSH_RESET_PERMS)) return NULL; /* * Your top guard is someone else's bottom guard. Not having a top * guard compromises someone else's mappings too. */ if (WARN_ON_ONCE(flags & VM_NO_GUARD)) flags &= ~VM_NO_GUARD; if (count > totalram_pages()) return NULL; size = (unsigned long)count << PAGE_SHIFT; area = get_vm_area_caller(size, flags, __builtin_return_address(0)); if (!area) return NULL; addr = (unsigned long)area->addr; if (vmap_pages_range(addr, addr + size, pgprot_nx(prot), pages, PAGE_SHIFT) < 0) { vunmap(area->addr); return NULL; } if (flags & VM_MAP_PUT_PAGES) { area->pages = pages; area->nr_pages = count; } return area->addr; } EXPORT_SYMBOL(vmap); #ifdef CONFIG_VMAP_PFN struct vmap_pfn_data { unsigned long *pfns; pgprot_t prot; unsigned int idx; }; static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private) { struct vmap_pfn_data *data = private; unsigned long pfn = data->pfns[data->idx]; pte_t ptent; if (WARN_ON_ONCE(pfn_valid(pfn))) return -EINVAL; ptent = pte_mkspecial(pfn_pte(pfn, data->prot)); set_pte_at(&init_mm, addr, pte, ptent); data->idx++; return 0; } /** * vmap_pfn - map an array of PFNs into virtually contiguous space * @pfns: array of PFNs * @count: number of pages to map * @prot: page protection for the mapping * * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns * the start address of the mapping. */ void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot) { struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) }; struct vm_struct *area; area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP, __builtin_return_address(0)); if (!area) return NULL; if (apply_to_page_range(&init_mm, (unsigned long)area->addr, count * PAGE_SIZE, vmap_pfn_apply, &data)) { free_vm_area(area); return NULL; } flush_cache_vmap((unsigned long)area->addr, (unsigned long)area->addr + count * PAGE_SIZE); return area->addr; } EXPORT_SYMBOL_GPL(vmap_pfn); #endif /* CONFIG_VMAP_PFN */ static inline unsigned int vm_area_alloc_pages(gfp_t gfp, int nid, unsigned int order, unsigned int nr_pages, struct page **pages) { unsigned int nr_allocated = 0; gfp_t alloc_gfp = gfp; bool nofail = false; struct page *page; int i; /* * For order-0 pages we make use of bulk allocator, if * the page array is partly or not at all populated due * to fails, fallback to a single page allocator that is * more permissive. */ if (!order) { /* bulk allocator doesn't support nofail req. officially */ gfp_t bulk_gfp = gfp & ~__GFP_NOFAIL; while (nr_allocated < nr_pages) { unsigned int nr, nr_pages_request; /* * A maximum allowed request is hard-coded and is 100 * pages per call. That is done in order to prevent a * long preemption off scenario in the bulk-allocator * so the range is [1:100]. */ nr_pages_request = min(100U, nr_pages - nr_allocated); /* memory allocation should consider mempolicy, we can't * wrongly use nearest node when nid == NUMA_NO_NODE, * otherwise memory may be allocated in only one node, * but mempolicy wants to alloc memory by interleaving. */ if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE) nr = alloc_pages_bulk_array_mempolicy(bulk_gfp, nr_pages_request, pages + nr_allocated); else nr = alloc_pages_bulk_array_node(bulk_gfp, nid, nr_pages_request, pages + nr_allocated); nr_allocated += nr; cond_resched(); /* * If zero or pages were obtained partly, * fallback to a single page allocator. */ if (nr != nr_pages_request) break; } } else if (gfp & __GFP_NOFAIL) { /* * Higher order nofail allocations are really expensive and * potentially dangerous (pre-mature OOM, disruptive reclaim * and compaction etc. */ alloc_gfp &= ~__GFP_NOFAIL; nofail = true; } /* High-order pages or fallback path if "bulk" fails. */ while (nr_allocated < nr_pages) { if (fatal_signal_pending(current)) break; if (nid == NUMA_NO_NODE) page = alloc_pages(alloc_gfp, order); else page = alloc_pages_node(nid, alloc_gfp, order); if (unlikely(!page)) { if (!nofail) break; /* fall back to the zero order allocations */ alloc_gfp |= __GFP_NOFAIL; order = 0; continue; } /* * Higher order allocations must be able to be treated as * indepdenent small pages by callers (as they can with * small-page vmallocs). Some drivers do their own refcounting * on vmalloc_to_page() pages, some use page->mapping, * page->lru, etc. */ if (order) split_page(page, order); /* * Careful, we allocate and map page-order pages, but * tracking is done per PAGE_SIZE page so as to keep the * vm_struct APIs independent of the physical/mapped size. */ for (i = 0; i < (1U << order); i++) pages[nr_allocated + i] = page + i; cond_resched(); nr_allocated += 1U << order; } return nr_allocated; } static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, unsigned int page_shift, int node) { const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; bool nofail = gfp_mask & __GFP_NOFAIL; unsigned long addr = (unsigned long)area->addr; unsigned long size = get_vm_area_size(area); unsigned long array_size; unsigned int nr_small_pages = size >> PAGE_SHIFT; unsigned int page_order; unsigned int flags; int ret; array_size = (unsigned long)nr_small_pages * sizeof(struct page *); if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) gfp_mask |= __GFP_HIGHMEM; /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { area->pages = __vmalloc_node(array_size, 1, nested_gfp, node, area->caller); } else { area->pages = kmalloc_node(array_size, nested_gfp, node); } if (!area->pages) { warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, failed to allocated page array size %lu", nr_small_pages * PAGE_SIZE, array_size); free_vm_area(area); return NULL; } set_vm_area_page_order(area, page_shift - PAGE_SHIFT); page_order = vm_area_page_order(area); area->nr_pages = vm_area_alloc_pages(gfp_mask | __GFP_NOWARN, node, page_order, nr_small_pages, area->pages); atomic_long_add(area->nr_pages, &nr_vmalloc_pages); if (gfp_mask & __GFP_ACCOUNT) { int i; for (i = 0; i < area->nr_pages; i++) mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1); } /* * If not enough pages were obtained to accomplish an * allocation request, free them via vfree() if any. */ if (area->nr_pages != nr_small_pages) { /* * vm_area_alloc_pages() can fail due to insufficient memory but * also:- * * - a pending fatal signal * - insufficient huge page-order pages * * Since we always retry allocations at order-0 in the huge page * case a warning for either is spurious. */ if (!fatal_signal_pending(current) && page_order == 0) warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, failed to allocate pages", area->nr_pages * PAGE_SIZE); goto fail; } /* * page tables allocations ignore external gfp mask, enforce it * by the scope API */ if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) flags = memalloc_nofs_save(); else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) flags = memalloc_noio_save(); do { ret = vmap_pages_range(addr, addr + size, prot, area->pages, page_shift); if (nofail && (ret < 0)) schedule_timeout_uninterruptible(1); } while (nofail && (ret < 0)); if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) memalloc_nofs_restore(flags); else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) memalloc_noio_restore(flags); if (ret < 0) { warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, failed to map pages", area->nr_pages * PAGE_SIZE); goto fail; } return area->addr; fail: vfree(area->addr); return NULL; } /** * __vmalloc_node_range - allocate virtually contiguous memory * @size: allocation size * @align: desired alignment * @start: vm area range start * @end: vm area range end * @gfp_mask: flags for the page level allocator * @prot: protection mask for the allocated pages * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD) * @node: node to use for allocation or NUMA_NO_NODE * @caller: caller's return address * * Allocate enough pages to cover @size from the page level * allocator with @gfp_mask flags. Please note that the full set of gfp * flags are not supported. GFP_KERNEL, GFP_NOFS and GFP_NOIO are all * supported. * Zone modifiers are not supported. From the reclaim modifiers * __GFP_DIRECT_RECLAIM is required (aka GFP_NOWAIT is not supported) * and only __GFP_NOFAIL is supported (i.e. __GFP_NORETRY and * __GFP_RETRY_MAYFAIL are not supported). * * __GFP_NOWARN can be used to suppress failures messages. * * Map them into contiguous kernel virtual space, using a pagetable * protection of @prot. * * Return: the address of the area or %NULL on failure */ void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller) { struct vm_struct *area; void *ret; kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE; unsigned long real_size = size; unsigned long real_align = align; unsigned int shift = PAGE_SHIFT; if (WARN_ON_ONCE(!size)) return NULL; if ((size >> PAGE_SHIFT) > totalram_pages()) { warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, exceeds total pages", real_size); return NULL; } if (vmap_allow_huge && (vm_flags & VM_ALLOW_HUGE_VMAP)) { unsigned long size_per_node; /* * Try huge pages. Only try for PAGE_KERNEL allocations, * others like modules don't yet expect huge pages in * their allocations due to apply_to_page_range not * supporting them. */ size_per_node = size; if (node == NUMA_NO_NODE) size_per_node /= num_online_nodes(); if (arch_vmap_pmd_supported(prot) && size_per_node >= PMD_SIZE) shift = PMD_SHIFT; else shift = arch_vmap_pte_supported_shift(size_per_node); align = max(real_align, 1UL << shift); size = ALIGN(real_size, 1UL << shift); } again: area = __get_vm_area_node(real_size, align, shift, VM_ALLOC | VM_UNINITIALIZED | vm_flags, start, end, node, gfp_mask, caller); if (!area) { bool nofail = gfp_mask & __GFP_NOFAIL; warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, vm_struct allocation failed%s", real_size, (nofail) ? ". Retrying." : ""); if (nofail) { schedule_timeout_uninterruptible(1); goto again; } goto fail; } /* * Prepare arguments for __vmalloc_area_node() and * kasan_unpoison_vmalloc(). */ if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) { if (kasan_hw_tags_enabled()) { /* * Modify protection bits to allow tagging. * This must be done before mapping. */ prot = arch_vmap_pgprot_tagged(prot); /* * Skip page_alloc poisoning and zeroing for physical * pages backing VM_ALLOC mapping. Memory is instead * poisoned and zeroed by kasan_unpoison_vmalloc(). */ gfp_mask |= __GFP_SKIP_KASAN | __GFP_SKIP_ZERO; } /* Take note that the mapping is PAGE_KERNEL. */ kasan_flags |= KASAN_VMALLOC_PROT_NORMAL; } /* Allocate physical pages and map them into vmalloc space. */ ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node); if (!ret) goto fail; /* * Mark the pages as accessible, now that they are mapped. * The condition for setting KASAN_VMALLOC_INIT should complement the * one in post_alloc_hook() with regards to the __GFP_SKIP_ZERO check * to make sure that memory is initialized under the same conditions. * Tag-based KASAN modes only assign tags to normal non-executable * allocations, see __kasan_unpoison_vmalloc(). */ kasan_flags |= KASAN_VMALLOC_VM_ALLOC; if (!want_init_on_free() && want_init_on_alloc(gfp_mask) && (gfp_mask & __GFP_SKIP_ZERO)) kasan_flags |= KASAN_VMALLOC_INIT; /* KASAN_VMALLOC_PROT_NORMAL already set if required. */ area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags); /* * In this function, newly allocated vm_struct has VM_UNINITIALIZED * flag. It means that vm_struct is not fully initialized. * Now, it is fully initialized, so remove this flag here. */ clear_vm_uninitialized_flag(area); size = PAGE_ALIGN(size); if (!(vm_flags & VM_DEFER_KMEMLEAK)) kmemleak_vmalloc(area, size, gfp_mask); return area->addr; fail: if (shift > PAGE_SHIFT) { shift = PAGE_SHIFT; align = real_align; size = real_size; goto again; } return NULL; } /** * __vmalloc_node - allocate virtually contiguous memory * @size: allocation size * @align: desired alignment * @gfp_mask: flags for the page level allocator * @node: node to use for allocation or NUMA_NO_NODE * @caller: caller's return address * * Allocate enough pages to cover @size from the page level allocator with * @gfp_mask flags. Map them into contiguous kernel virtual space. * * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL * and __GFP_NOFAIL are not supported * * Any use of gfp flags outside of GFP_KERNEL should be consulted * with mm people. * * Return: pointer to the allocated memory or %NULL on error */ void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, int node, const void *caller) { return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, gfp_mask, PAGE_KERNEL, 0, node, caller); } /* * This is only for performance analysis of vmalloc and stress purpose. * It is required by vmalloc test module, therefore do not use it other * than that. */ #ifdef CONFIG_TEST_VMALLOC_MODULE EXPORT_SYMBOL_GPL(__vmalloc_node); #endif void *__vmalloc(unsigned long size, gfp_t gfp_mask) { return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(__vmalloc); /** * vmalloc - allocate virtually contiguous memory * @size: allocation size * * Allocate enough pages to cover @size from the page level * allocator and map them into contiguous kernel virtual space. * * For tight control over page level allocator and protection flags * use __vmalloc() instead. * * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc(unsigned long size) { return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc); /** * vmalloc_huge - allocate virtually contiguous memory, allow huge pages * @size: allocation size * @gfp_mask: flags for the page level allocator * * Allocate enough pages to cover @size from the page level * allocator and map them into contiguous kernel virtual space. * If @size is greater than or equal to PMD_SIZE, allow using * huge pages for the memory * * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) { return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL_GPL(vmalloc_huge); /** * vzalloc - allocate virtually contiguous memory with zero fill * @size: allocation size * * Allocate enough pages to cover @size from the page level * allocator and map them into contiguous kernel virtual space. * The memory allocated is set to zero. * * For tight control over page level allocator and protection flags * use __vmalloc() instead. * * Return: pointer to the allocated memory or %NULL on error */ void *vzalloc(unsigned long size) { return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(vzalloc); /** * vmalloc_user - allocate zeroed virtually contiguous memory for userspace * @size: allocation size * * The resulting memory area is zeroed so it can be mapped to userspace * without leaking data. * * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_user(unsigned long size) { return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, VM_USERMAP, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_user); /** * vmalloc_node - allocate memory on a specific node * @size: allocation size * @node: numa node * * Allocate enough pages to cover @size from the page level * allocator and map them into contiguous kernel virtual space. * * For tight control over page level allocator and protection flags * use __vmalloc() instead. * * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_node(unsigned long size, int node) { return __vmalloc_node(size, 1, GFP_KERNEL, node, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_node); /** * vzalloc_node - allocate memory on a specific node with zero fill * @size: allocation size * @node: numa node * * Allocate enough pages to cover @size from the page level * allocator and map them into contiguous kernel virtual space. * The memory allocated is set to zero. * * Return: pointer to the allocated memory or %NULL on error */ void *vzalloc_node(unsigned long size, int node) { return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node, __builtin_return_address(0)); } EXPORT_SYMBOL(vzalloc_node); #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) #define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL) #else /* * 64b systems should always have either DMA or DMA32 zones. For others * GFP_DMA32 should do the right thing and use the normal zone. */ #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) #endif /** * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) * @size: allocation size * * Allocate enough 32bit PA addressable pages to cover @size from the * page level allocator and map them into contiguous kernel virtual space. * * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_32(unsigned long size) { return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_32); /** * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory * @size: allocation size * * The resulting memory area is 32bit addressable and zeroed so it can be * mapped to userspace without leaking data. * * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_32_user(unsigned long size) { return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, VM_USERMAP, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_32_user); /* * Atomically zero bytes in the iterator. * * Returns the number of zeroed bytes. */ static size_t zero_iter(struct iov_iter *iter, size_t count) { size_t remains = count; while (remains > 0) { size_t num, copied; num = min_t(size_t, remains, PAGE_SIZE); copied = copy_page_to_iter_nofault(ZERO_PAGE(0), 0, num, iter); remains -= copied; if (copied < num) break; } return count - remains; } /* * small helper routine, copy contents to iter from addr. * If the page is not present, fill zero. * * Returns the number of copied bytes. */ static size_t aligned_vread_iter(struct iov_iter *iter, const char *addr, size_t count) { size_t remains = count; struct page *page; while (remains > 0) { unsigned long offset, length; size_t copied = 0; offset = offset_in_page(addr); length = PAGE_SIZE - offset; if (length > remains) length = remains; page = vmalloc_to_page(addr); /* * To do safe access to this _mapped_ area, we need lock. But * adding lock here means that we need to add overhead of * vmalloc()/vfree() calls for this _debug_ interface, rarely * used. Instead of that, we'll use an local mapping via * copy_page_to_iter_nofault() and accept a small overhead in * this access function. */ if (page) copied = copy_page_to_iter_nofault(page, offset, length, iter); else copied = zero_iter(iter, length); addr += copied; remains -= copied; if (copied != length) break; } return count - remains; } /* * Read from a vm_map_ram region of memory. * * Returns the number of copied bytes. */ static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr, size_t count, unsigned long flags) { char *start; struct vmap_block *vb; struct xarray *xa; unsigned long offset; unsigned int rs, re; size_t remains, n; /* * If it's area created by vm_map_ram() interface directly, but * not further subdividing and delegating management to vmap_block, * handle it here. */ if (!(flags & VMAP_BLOCK)) return aligned_vread_iter(iter, addr, count); remains = count; /* * Area is split into regions and tracked with vmap_block, read out * each region and zero fill the hole between regions. */ xa = addr_to_vb_xa((unsigned long) addr); vb = xa_load(xa, addr_to_vb_idx((unsigned long)addr)); if (!vb) goto finished_zero; spin_lock(&vb->lock); if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) { spin_unlock(&vb->lock); goto finished_zero; } for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) { size_t copied; if (remains == 0) goto finished; start = vmap_block_vaddr(vb->va->va_start, rs); if (addr < start) { size_t to_zero = min_t(size_t, start - addr, remains); size_t zeroed = zero_iter(iter, to_zero); addr += zeroed; remains -= zeroed; if (remains == 0 || zeroed != to_zero) goto finished; } /*it could start reading from the middle of used region*/ offset = offset_in_page(addr); n = ((re - rs + 1) << PAGE_SHIFT) - offset; if (n > remains) n = remains; copied = aligned_vread_iter(iter, start + offset, n); addr += copied; remains -= copied; if (copied != n) goto finished; } spin_unlock(&vb->lock); finished_zero: /* zero-fill the left dirty or free regions */ return count - remains + zero_iter(iter, remains); finished: /* We couldn't copy/zero everything */ spin_unlock(&vb->lock); return count - remains; } /** * vread_iter() - read vmalloc area in a safe way to an iterator. * @iter: the iterator to which data should be written. * @addr: vm address. * @count: number of bytes to be read. * * This function checks that addr is a valid vmalloc'ed area, and * copy data from that area to a given buffer. If the given memory range * of [addr...addr+count) includes some valid address, data is copied to * proper area of @buf. If there are memory holes, they'll be zero-filled. * IOREMAP area is treated as memory hole and no copy is done. * * If [addr...addr+count) doesn't includes any intersects with alive * vm_struct area, returns 0. @buf should be kernel's buffer. * * Note: In usual ops, vread() is never necessary because the caller * should know vmalloc() area is valid and can use memcpy(). * This is for routines which have to access vmalloc area without * any information, as /proc/kcore. * * Return: number of bytes for which addr and buf should be increased * (same number as @count) or %0 if [addr...addr+count) doesn't * include any intersection with valid vmalloc area */ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) { struct vmap_area *va; struct vm_struct *vm; char *vaddr; size_t n, size, flags, remains; addr = kasan_reset_tag(addr); /* Don't allow overflow */ if ((unsigned long) addr + count < count) count = -(unsigned long) addr; remains = count; spin_lock(&vmap_area_lock); va = find_vmap_area_exceed_addr((unsigned long)addr); if (!va) goto finished_zero; /* no intersects with alive vmap_area */ if ((unsigned long)addr + remains <= va->va_start) goto finished_zero; list_for_each_entry_from(va, &vmap_area_list, list) { size_t copied; if (remains == 0) goto finished; vm = va->vm; flags = va->flags & VMAP_FLAGS_MASK; /* * VMAP_BLOCK indicates a sub-type of vm_map_ram area, need * be set together with VMAP_RAM. */ WARN_ON(flags == VMAP_BLOCK); if (!vm && !flags) continue; if (vm && (vm->flags & VM_UNINITIALIZED)) continue; /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ smp_rmb(); vaddr = (char *) va->va_start; size = vm ? get_vm_area_size(vm) : va_size(va); if (addr >= vaddr + size) continue; if (addr < vaddr) { size_t to_zero = min_t(size_t, vaddr - addr, remains); size_t zeroed = zero_iter(iter, to_zero); addr += zeroed; remains -= zeroed; if (remains == 0 || zeroed != to_zero) goto finished; } n = vaddr + size - addr; if (n > remains) n = remains; if (flags & VMAP_RAM) copied = vmap_ram_vread_iter(iter, addr, n, flags); else if (!(vm->flags & VM_IOREMAP)) copied = aligned_vread_iter(iter, addr, n); else /* IOREMAP area is treated as memory hole */ copied = zero_iter(iter, n); addr += copied; remains -= copied; if (copied != n) goto finished; } finished_zero: spin_unlock(&vmap_area_lock); /* zero-fill memory holes */ return count - remains + zero_iter(iter, remains); finished: /* Nothing remains, or We couldn't copy/zero everything. */ spin_unlock(&vmap_area_lock); return count - remains; } /** * remap_vmalloc_range_partial - map vmalloc pages to userspace * @vma: vma to cover * @uaddr: target user address to start at * @kaddr: virtual address of vmalloc kernel memory * @pgoff: offset from @kaddr to start at * @size: size of map area * * Returns: 0 for success, -Exxx on failure * * This function checks that @kaddr is a valid vmalloc'ed area, * and that it is big enough to cover the range starting at * @uaddr in @vma. Will return failure if that criteria isn't * met. * * Similar to remap_pfn_range() (see mm/memory.c) */ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, void *kaddr, unsigned long pgoff, unsigned long size) { struct vm_struct *area; unsigned long off; unsigned long end_index; if (check_shl_overflow(pgoff, PAGE_SHIFT, &off)) return -EINVAL; size = PAGE_ALIGN(size); if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr)) return -EINVAL; area = find_vm_area(kaddr); if (!area) return -EINVAL; if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT))) return -EINVAL; if (check_add_overflow(size, off, &end_index) || end_index > get_vm_area_size(area)) return -EINVAL; kaddr += off; do { struct page *page = vmalloc_to_page(kaddr); int ret; ret = vm_insert_page(vma, uaddr, page); if (ret) return ret; uaddr += PAGE_SIZE; kaddr += PAGE_SIZE; size -= PAGE_SIZE; } while (size > 0); vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); return 0; } /** * remap_vmalloc_range - map vmalloc pages to userspace * @vma: vma to cover (map full range of vma) * @addr: vmalloc memory * @pgoff: number of pages into addr before first page to map * * Returns: 0 for success, -Exxx on failure * * This function checks that addr is a valid vmalloc'ed area, and * that it is big enough to cover the vma. Will return failure if * that criteria isn't met. * * Similar to remap_pfn_range() (see mm/memory.c) */ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, unsigned long pgoff) { return remap_vmalloc_range_partial(vma, vma->vm_start, addr, pgoff, vma->vm_end - vma->vm_start); } EXPORT_SYMBOL(remap_vmalloc_range); void free_vm_area(struct vm_struct *area) { struct vm_struct *ret; ret = remove_vm_area(area->addr); BUG_ON(ret != area); kfree(area); } EXPORT_SYMBOL_GPL(free_vm_area); #ifdef CONFIG_SMP static struct vmap_area *node_to_va(struct rb_node *n) { return rb_entry_safe(n, struct vmap_area, rb_node); } /** * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to * @addr: target address * * Returns: vmap_area if it is found. If there is no such area * the first highest(reverse order) vmap_area is returned * i.e. va->va_start < addr && va->va_end < addr or NULL * if there are no any areas before @addr. */ static struct vmap_area * pvm_find_va_enclose_addr(unsigned long addr) { struct vmap_area *va, *tmp; struct rb_node *n; n = free_vmap_area_root.rb_node; va = NULL; while (n) { tmp = rb_entry(n, struct vmap_area, rb_node); if (tmp->va_start <= addr) { va = tmp; if (tmp->va_end >= addr) break; n = n->rb_right; } else { n = n->rb_left; } } return va; } /** * pvm_determine_end_from_reverse - find the highest aligned address * of free block below VMALLOC_END * @va: * in - the VA we start the search(reverse order); * out - the VA with the highest aligned end address. * @align: alignment for required highest address * * Returns: determined end address within vmap_area */ static unsigned long pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align) { unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); unsigned long addr; if (likely(*va)) { list_for_each_entry_from_reverse((*va), &free_vmap_area_list, list) { addr = min((*va)->va_end & ~(align - 1), vmalloc_end); if ((*va)->va_start < addr) return addr; } } return 0; } /** * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator * @offsets: array containing offset of each area * @sizes: array containing size of each area * @nr_vms: the number of areas to allocate * @align: alignment, all entries in @offsets and @sizes must be aligned to this * * Returns: kmalloc'd vm_struct pointer array pointing to allocated * vm_structs on success, %NULL on failure * * Percpu allocator wants to use congruent vm areas so that it can * maintain the offsets among percpu areas. This function allocates * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to * be scattered pretty far, distance between two areas easily going up * to gigabytes. To avoid interacting with regular vmallocs, these * areas are allocated from top. * * Despite its complicated look, this allocator is rather simple. It * does everything top-down and scans free blocks from the end looking * for matching base. While scanning, if any of the areas do not fit the * base address is pulled down to fit the area. Scanning is repeated till * all the areas fit and then all necessary data structures are inserted * and the result is returned. */ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, const size_t *sizes, int nr_vms, size_t align) { const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); struct vmap_area **vas, *va; struct vm_struct **vms; int area, area2, last_area, term_area; unsigned long base, start, size, end, last_end, orig_start, orig_end; bool purged = false; /* verify parameters and allocate data structures */ BUG_ON(offset_in_page(align) || !is_power_of_2(align)); for (last_area = 0, area = 0; area < nr_vms; area++) { start = offsets[area]; end = start + sizes[area]; /* is everything aligned properly? */ BUG_ON(!IS_ALIGNED(offsets[area], align)); BUG_ON(!IS_ALIGNED(sizes[area], align)); /* detect the area with the highest address */ if (start > offsets[last_area]) last_area = area; for (area2 = area + 1; area2 < nr_vms; area2++) { unsigned long start2 = offsets[area2]; unsigned long end2 = start2 + sizes[area2]; BUG_ON(start2 < end && start < end2); } } last_end = offsets[last_area] + sizes[last_area]; if (vmalloc_end - vmalloc_start < last_end) { WARN_ON(true); return NULL; } vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL); vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL); if (!vas || !vms) goto err_free2; for (area = 0; area < nr_vms; area++) { vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL); vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL); if (!vas[area] || !vms[area]) goto err_free; } retry: spin_lock(&free_vmap_area_lock); /* start scanning - we scan from the top, begin with the last area */ area = term_area = last_area; start = offsets[area]; end = start + sizes[area]; va = pvm_find_va_enclose_addr(vmalloc_end); base = pvm_determine_end_from_reverse(&va, align) - end; while (true) { /* * base might have underflowed, add last_end before * comparing. */ if (base + last_end < vmalloc_start + last_end) goto overflow; /* * Fitting base has not been found. */ if (va == NULL) goto overflow; /* * If required width exceeds current VA block, move * base downwards and then recheck. */ if (base + end > va->va_end) { base = pvm_determine_end_from_reverse(&va, align) - end; term_area = area; continue; } /* * If this VA does not fit, move base downwards and recheck. */ if (base + start < va->va_start) { va = node_to_va(rb_prev(&va->rb_node)); base = pvm_determine_end_from_reverse(&va, align) - end; term_area = area; continue; } /* * This area fits, move on to the previous one. If * the previous one is the terminal one, we're done. */ area = (area + nr_vms - 1) % nr_vms; if (area == term_area) break; start = offsets[area]; end = start + sizes[area]; va = pvm_find_va_enclose_addr(base + end); } /* we've found a fitting base, insert all va's */ for (area = 0; area < nr_vms; area++) { int ret; start = base + offsets[area]; size = sizes[area]; va = pvm_find_va_enclose_addr(start); if (WARN_ON_ONCE(va == NULL)) /* It is a BUG(), but trigger recovery instead. */ goto recovery; ret = adjust_va_to_fit_type(&free_vmap_area_root, &free_vmap_area_list, va, start, size); if (WARN_ON_ONCE(unlikely(ret))) /* It is a BUG(), but trigger recovery instead. */ goto recovery; /* Allocated area. */ va = vas[area]; va->va_start = start; va->va_end = start + size; } spin_unlock(&free_vmap_area_lock); /* populate the kasan shadow space */ for (area = 0; area < nr_vms; area++) { if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area])) goto err_free_shadow; } /* insert all vm's */ spin_lock(&vmap_area_lock); for (area = 0; area < nr_vms; area++) { insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list); setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC, pcpu_get_vm_areas); } spin_unlock(&vmap_area_lock); /* * Mark allocated areas as accessible. Do it now as a best-effort * approach, as they can be mapped outside of vmalloc code. * With hardware tag-based KASAN, marking is skipped for * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). */ for (area = 0; area < nr_vms; area++) vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr, vms[area]->size, KASAN_VMALLOC_PROT_NORMAL); kfree(vas); return vms; recovery: /* * Remove previously allocated areas. There is no * need in removing these areas from the busy tree, * because they are inserted only on the final step * and when pcpu_get_vm_areas() is success. */ while (area--) { orig_start = vas[area]->va_start; orig_end = vas[area]->va_end; va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root, &free_vmap_area_list); if (va) kasan_release_vmalloc(orig_start, orig_end, va->va_start, va->va_end); vas[area] = NULL; } overflow: spin_unlock(&free_vmap_area_lock); if (!purged) { reclaim_and_purge_vmap_areas(); purged = true; /* Before "retry", check if we recover. */ for (area = 0; area < nr_vms; area++) { if (vas[area]) continue; vas[area] = kmem_cache_zalloc( vmap_area_cachep, GFP_KERNEL); if (!vas[area]) goto err_free; } goto retry; } err_free: for (area = 0; area < nr_vms; area++) { if (vas[area]) kmem_cache_free(vmap_area_cachep, vas[area]); kfree(vms[area]); } err_free2: kfree(vas); kfree(vms); return NULL; err_free_shadow: spin_lock(&free_vmap_area_lock); /* * We release all the vmalloc shadows, even the ones for regions that * hadn't been successfully added. This relies on kasan_release_vmalloc * being able to tolerate this case. */ for (area = 0; area < nr_vms; area++) { orig_start = vas[area]->va_start; orig_end = vas[area]->va_end; va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root, &free_vmap_area_list); if (va) kasan_release_vmalloc(orig_start, orig_end, va->va_start, va->va_end); vas[area] = NULL; kfree(vms[area]); } spin_unlock(&free_vmap_area_lock); kfree(vas); kfree(vms); return NULL; } /** * pcpu_free_vm_areas - free vmalloc areas for percpu allocator * @vms: vm_struct pointer array returned by pcpu_get_vm_areas() * @nr_vms: the number of allocated areas * * Free vm_structs and the array allocated by pcpu_get_vm_areas(). */ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) { int i; for (i = 0; i < nr_vms; i++) free_vm_area(vms[i]); kfree(vms); } #endif /* CONFIG_SMP */ #ifdef CONFIG_PRINTK bool vmalloc_dump_obj(void *object) { void *objp = (void *)PAGE_ALIGN((unsigned long)object); const void *caller; struct vm_struct *vm; struct vmap_area *va; unsigned long addr; unsigned int nr_pages; if (!spin_trylock(&vmap_area_lock)) return false; va = __find_vmap_area((unsigned long)objp, &vmap_area_root); if (!va) { spin_unlock(&vmap_area_lock); return false; } vm = va->vm; if (!vm) { spin_unlock(&vmap_area_lock); return false; } addr = (unsigned long)vm->addr; caller = vm->caller; nr_pages = vm->nr_pages; spin_unlock(&vmap_area_lock); pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n", nr_pages, addr, caller); return true; } #endif #ifdef CONFIG_PROC_FS static void *s_start(struct seq_file *m, loff_t *pos) __acquires(&vmap_purge_lock) __acquires(&vmap_area_lock) { mutex_lock(&vmap_purge_lock); spin_lock(&vmap_area_lock); return seq_list_start(&vmap_area_list, *pos); } static void *s_next(struct seq_file *m, void *p, loff_t *pos) { return seq_list_next(p, &vmap_area_list, pos); } static void s_stop(struct seq_file *m, void *p) __releases(&vmap_area_lock) __releases(&vmap_purge_lock) { spin_unlock(&vmap_area_lock); mutex_unlock(&vmap_purge_lock); } static void show_numa_info(struct seq_file *m, struct vm_struct *v) { if (IS_ENABLED(CONFIG_NUMA)) { unsigned int nr, *counters = m->private; unsigned int step = 1U << vm_area_page_order(v); if (!counters) return; if (v->flags & VM_UNINITIALIZED) return; /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ smp_rmb(); memset(counters, 0, nr_node_ids * sizeof(unsigned int)); for (nr = 0; nr < v->nr_pages; nr += step) counters[page_to_nid(v->pages[nr])] += step; for_each_node_state(nr, N_HIGH_MEMORY) if (counters[nr]) seq_printf(m, " N%u=%u", nr, counters[nr]); } } static void show_purge_info(struct seq_file *m) { struct vmap_area *va; spin_lock(&purge_vmap_area_lock); list_for_each_entry(va, &purge_vmap_area_list, list) { seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", (void *)va->va_start, (void *)va->va_end, va->va_end - va->va_start); } spin_unlock(&purge_vmap_area_lock); } static int s_show(struct seq_file *m, void *p) { struct vmap_area *va; struct vm_struct *v; va = list_entry(p, struct vmap_area, list); if (!va->vm) { if (va->flags & VMAP_RAM) seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", (void *)va->va_start, (void *)va->va_end, va->va_end - va->va_start); goto final; } v = va->vm; seq_printf(m, "0x%pK-0x%pK %7ld", v->addr, v->addr + v->size, v->size); if (v->caller) seq_printf(m, " %pS", v->caller); if (v->nr_pages) seq_printf(m, " pages=%d", v->nr_pages); if (v->phys_addr) seq_printf(m, " phys=%pa", &v->phys_addr); if (v->flags & VM_IOREMAP) seq_puts(m, " ioremap"); if (v->flags & VM_ALLOC) seq_puts(m, " vmalloc"); if (v->flags & VM_MAP) seq_puts(m, " vmap"); if (v->flags & VM_USERMAP) seq_puts(m, " user"); if (v->flags & VM_DMA_COHERENT) seq_puts(m, " dma-coherent"); if (is_vmalloc_addr(v->pages)) seq_puts(m, " vpages"); show_numa_info(m, v); seq_putc(m, '\n'); /* * As a final step, dump "unpurged" areas. */ final: if (list_is_last(&va->list, &vmap_area_list)) show_purge_info(m); return 0; } static const struct seq_operations vmalloc_op = { .start = s_start, .next = s_next, .stop = s_stop, .show = s_show, }; static int __init proc_vmalloc_init(void) { if (IS_ENABLED(CONFIG_NUMA)) proc_create_seq_private("vmallocinfo", 0400, NULL, &vmalloc_op, nr_node_ids * sizeof(unsigned int), NULL); else proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op); return 0; } module_init(proc_vmalloc_init); #endif void __init vmalloc_init(void) { struct vmap_area *va; struct vm_struct *tmp; int i; /* * Create the cache for vmap_area objects. */ vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC); for_each_possible_cpu(i) { struct vmap_block_queue *vbq; struct vfree_deferred *p; vbq = &per_cpu(vmap_block_queue, i); spin_lock_init(&vbq->lock); INIT_LIST_HEAD(&vbq->free); p = &per_cpu(vfree_deferred, i); init_llist_head(&p->list); INIT_WORK(&p->wq, delayed_vfree_work); xa_init(&vbq->vmap_blocks); } /* Import existing vmlist entries. */ for (tmp = vmlist; tmp; tmp = tmp->next) { va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); if (WARN_ON_ONCE(!va)) continue; va->va_start = (unsigned long)tmp->addr; va->va_end = va->va_start + tmp->size; va->vm = tmp; insert_vmap_area(va, &vmap_area_root, &vmap_area_list); } /* * Now we can initialize a free vmap space. */ vmap_init_free_space(); vmap_initialized = true; }
5703 1520 3115 3014 577 3142 1 1651 377 3146 3142 3094 3065 1021 3145 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM block #if !defined(_TRACE_BLOCK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_BLOCK_H #include <linux/blktrace_api.h> #include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/tracepoint.h> #define RWBS_LEN 8 #ifdef CONFIG_BUFFER_HEAD DECLARE_EVENT_CLASS(block_buffer, TP_PROTO(struct buffer_head *bh), TP_ARGS(bh), TP_STRUCT__entry ( __field( dev_t, dev ) __field( sector_t, sector ) __field( size_t, size ) ), TP_fast_assign( __entry->dev = bh->b_bdev->bd_dev; __entry->sector = bh->b_blocknr; __entry->size = bh->b_size; ), TP_printk("%d,%d sector=%llu size=%zu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->sector, __entry->size ) ); /** * block_touch_buffer - mark a buffer accessed * @bh: buffer_head being touched * * Called from touch_buffer(). */ DEFINE_EVENT(block_buffer, block_touch_buffer, TP_PROTO(struct buffer_head *bh), TP_ARGS(bh) ); /** * block_dirty_buffer - mark a buffer dirty * @bh: buffer_head being dirtied * * Called from mark_buffer_dirty(). */ DEFINE_EVENT(block_buffer, block_dirty_buffer, TP_PROTO(struct buffer_head *bh), TP_ARGS(bh) ); #endif /* CONFIG_BUFFER_HEAD */ /** * block_rq_requeue - place block IO request back on a queue * @rq: block IO operation request * * The block operation request @rq is being placed back into queue * @q. For some reason the request was not completed and needs to be * put back in the queue. */ TRACE_EVENT(block_rq_requeue, TP_PROTO(struct request *rq), TP_ARGS(rq), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __array( char, rwbs, RWBS_LEN ) __dynamic_array( char, cmd, 1 ) ), TP_fast_assign( __entry->dev = rq->q->disk ? disk_devt(rq->q->disk) : 0; __entry->sector = blk_rq_trace_sector(rq); __entry->nr_sector = blk_rq_trace_nr_sectors(rq); blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); __get_str(cmd)[0] = '\0'; ), TP_printk("%d,%d %s (%s) %llu + %u [%d]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __get_str(cmd), (unsigned long long)__entry->sector, __entry->nr_sector, 0) ); DECLARE_EVENT_CLASS(block_rq_completion, TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes), TP_ARGS(rq, error, nr_bytes), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __field( int , error ) __array( char, rwbs, RWBS_LEN ) __dynamic_array( char, cmd, 1 ) ), TP_fast_assign( __entry->dev = rq->q->disk ? disk_devt(rq->q->disk) : 0; __entry->sector = blk_rq_pos(rq); __entry->nr_sector = nr_bytes >> 9; __entry->error = blk_status_to_errno(error); blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); __get_str(cmd)[0] = '\0'; ), TP_printk("%d,%d %s (%s) %llu + %u [%d]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __get_str(cmd), (unsigned long long)__entry->sector, __entry->nr_sector, __entry->error) ); /** * block_rq_complete - block IO operation completed by device driver * @rq: block operations request * @error: status code * @nr_bytes: number of completed bytes * * The block_rq_complete tracepoint event indicates that some portion * of operation request has been completed by the device driver. If * the @rq->bio is %NULL, then there is absolutely no additional work to * do for the request. If @rq->bio is non-NULL then there is * additional work required to complete the request. */ DEFINE_EVENT(block_rq_completion, block_rq_complete, TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes), TP_ARGS(rq, error, nr_bytes) ); /** * block_rq_error - block IO operation error reported by device driver * @rq: block operations request * @error: status code * @nr_bytes: number of completed bytes * * The block_rq_error tracepoint event indicates that some portion * of operation request has failed as reported by the device driver. */ DEFINE_EVENT(block_rq_completion, block_rq_error, TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes), TP_ARGS(rq, error, nr_bytes) ); DECLARE_EVENT_CLASS(block_rq, TP_PROTO(struct request *rq), TP_ARGS(rq), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __field( unsigned int, bytes ) __array( char, rwbs, RWBS_LEN ) __array( char, comm, TASK_COMM_LEN ) __dynamic_array( char, cmd, 1 ) ), TP_fast_assign( __entry->dev = rq->q->disk ? disk_devt(rq->q->disk) : 0; __entry->sector = blk_rq_trace_sector(rq); __entry->nr_sector = blk_rq_trace_nr_sectors(rq); __entry->bytes = blk_rq_bytes(rq); blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); __get_str(cmd)[0] = '\0'; memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), TP_printk("%d,%d %s %u (%s) %llu + %u [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __entry->bytes, __get_str(cmd), (unsigned long long)__entry->sector, __entry->nr_sector, __entry->comm) ); /** * block_rq_insert - insert block operation request into queue * @rq: block IO operation request * * Called immediately before block operation request @rq is inserted * into queue @q. The fields in the operation request @rq struct can * be examined to determine which device and sectors the pending * operation would access. */ DEFINE_EVENT(block_rq, block_rq_insert, TP_PROTO(struct request *rq), TP_ARGS(rq) ); /** * block_rq_issue - issue pending block IO request operation to device driver * @rq: block IO operation request * * Called when block operation request @rq from queue @q is sent to a * device driver for processing. */ DEFINE_EVENT(block_rq, block_rq_issue, TP_PROTO(struct request *rq), TP_ARGS(rq) ); /** * block_rq_merge - merge request with another one in the elevator * @rq: block IO operation request * * Called when block operation request @rq from queue @q is merged to another * request queued in the elevator. */ DEFINE_EVENT(block_rq, block_rq_merge, TP_PROTO(struct request *rq), TP_ARGS(rq) ); /** * block_io_start - insert a request for execution * @rq: block IO operation request * * Called when block operation request @rq is queued for execution */ DEFINE_EVENT(block_rq, block_io_start, TP_PROTO(struct request *rq), TP_ARGS(rq) ); /** * block_io_done - block IO operation request completed * @rq: block IO operation request * * Called when block operation request @rq is completed */ DEFINE_EVENT(block_rq, block_io_done, TP_PROTO(struct request *rq), TP_ARGS(rq) ); /** * block_bio_complete - completed all work on the block operation * @q: queue holding the block operation * @bio: block operation completed * * This tracepoint indicates there is no further work to do on this * block IO operation @bio. */ TRACE_EVENT(block_bio_complete, TP_PROTO(struct request_queue *q, struct bio *bio), TP_ARGS(q, bio), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned, nr_sector ) __field( int, error ) __array( char, rwbs, RWBS_LEN) ), TP_fast_assign( __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); __entry->error = blk_status_to_errno(bio->bi_status); blk_fill_rwbs(__entry->rwbs, bio->bi_opf); ), TP_printk("%d,%d %s %llu + %u [%d]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, __entry->nr_sector, __entry->error) ); DECLARE_EVENT_CLASS(block_bio, TP_PROTO(struct bio *bio), TP_ARGS(bio), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __array( char, rwbs, RWBS_LEN ) __array( char, comm, TASK_COMM_LEN ) ), TP_fast_assign( __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); blk_fill_rwbs(__entry->rwbs, bio->bi_opf); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), TP_printk("%d,%d %s %llu + %u [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, __entry->nr_sector, __entry->comm) ); /** * block_bio_bounce - used bounce buffer when processing block operation * @bio: block operation * * A bounce buffer was used to handle the block operation @bio in @q. * This occurs when hardware limitations prevent a direct transfer of * data between the @bio data memory area and the IO device. Use of a * bounce buffer requires extra copying of data and decreases * performance. */ DEFINE_EVENT(block_bio, block_bio_bounce, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); /** * block_bio_backmerge - merging block operation to the end of an existing operation * @bio: new block operation to merge * * Merging block request @bio to the end of an existing block request. */ DEFINE_EVENT(block_bio, block_bio_backmerge, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); /** * block_bio_frontmerge - merging block operation to the beginning of an existing operation * @bio: new block operation to merge * * Merging block IO operation @bio to the beginning of an existing block request. */ DEFINE_EVENT(block_bio, block_bio_frontmerge, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); /** * block_bio_queue - putting new block IO operation in queue * @bio: new block operation * * About to place the block IO operation @bio into queue @q. */ DEFINE_EVENT(block_bio, block_bio_queue, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); /** * block_getrq - get a free request entry in queue for block IO operations * @bio: pending block IO operation (can be %NULL) * * A request struct has been allocated to handle the block IO operation @bio. */ DEFINE_EVENT(block_bio, block_getrq, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); /** * block_plug - keep operations requests in request queue * @q: request queue to plug * * Plug the request queue @q. Do not allow block operation requests * to be sent to the device driver. Instead, accumulate requests in * the queue to improve throughput performance of the block device. */ TRACE_EVENT(block_plug, TP_PROTO(struct request_queue *q), TP_ARGS(q), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) ), TP_fast_assign( memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), TP_printk("[%s]", __entry->comm) ); DECLARE_EVENT_CLASS(block_unplug, TP_PROTO(struct request_queue *q, unsigned int depth, bool explicit), TP_ARGS(q, depth, explicit), TP_STRUCT__entry( __field( int, nr_rq ) __array( char, comm, TASK_COMM_LEN ) ), TP_fast_assign( __entry->nr_rq = depth; memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), TP_printk("[%s] %d", __entry->comm, __entry->nr_rq) ); /** * block_unplug - release of operations requests in request queue * @q: request queue to unplug * @depth: number of requests just added to the queue * @explicit: whether this was an explicit unplug, or one from schedule() * * Unplug request queue @q because device driver is scheduled to work * on elements in the request queue. */ DEFINE_EVENT(block_unplug, block_unplug, TP_PROTO(struct request_queue *q, unsigned int depth, bool explicit), TP_ARGS(q, depth, explicit) ); /** * block_split - split a single bio struct into two bio structs * @bio: block operation being split * @new_sector: The starting sector for the new bio * * The bio request @bio needs to be split into two bio requests. The newly * created @bio request starts at @new_sector. This split may be required due to * hardware limitations such as operation crossing device boundaries in a RAID * system. */ TRACE_EVENT(block_split, TP_PROTO(struct bio *bio, unsigned int new_sector), TP_ARGS(bio, new_sector), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( sector_t, new_sector ) __array( char, rwbs, RWBS_LEN ) __array( char, comm, TASK_COMM_LEN ) ), TP_fast_assign( __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->new_sector = new_sector; blk_fill_rwbs(__entry->rwbs, bio->bi_opf); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), TP_printk("%d,%d %s %llu / %llu [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, (unsigned long long)__entry->new_sector, __entry->comm) ); /** * block_bio_remap - map request for a logical device to the raw device * @bio: revised operation * @dev: original device for the operation * @from: original sector for the operation * * An operation for a logical device has been mapped to the * raw block device. */ TRACE_EVENT(block_bio_remap, TP_PROTO(struct bio *bio, dev_t dev, sector_t from), TP_ARGS(bio, dev, from), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __field( dev_t, old_dev ) __field( sector_t, old_sector ) __array( char, rwbs, RWBS_LEN) ), TP_fast_assign( __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); __entry->old_dev = dev; __entry->old_sector = from; blk_fill_rwbs(__entry->rwbs, bio->bi_opf); ), TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, __entry->nr_sector, MAJOR(__entry->old_dev), MINOR(__entry->old_dev), (unsigned long long)__entry->old_sector) ); /** * block_rq_remap - map request for a block operation request * @rq: block IO operation request * @dev: device for the operation * @from: original sector for the operation * * The block operation request @rq in @q has been remapped. The block * operation request @rq holds the current information and @from hold * the original sector. */ TRACE_EVENT(block_rq_remap, TP_PROTO(struct request *rq, dev_t dev, sector_t from), TP_ARGS(rq, dev, from), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __field( dev_t, old_dev ) __field( sector_t, old_sector ) __field( unsigned int, nr_bios ) __array( char, rwbs, RWBS_LEN) ), TP_fast_assign( __entry->dev = disk_devt(rq->q->disk); __entry->sector = blk_rq_pos(rq); __entry->nr_sector = blk_rq_sectors(rq); __entry->old_dev = dev; __entry->old_sector = from; __entry->nr_bios = blk_rq_count_bios(rq); blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); ), TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, __entry->nr_sector, MAJOR(__entry->old_dev), MINOR(__entry->old_dev), (unsigned long long)__entry->old_sector, __entry->nr_bios) ); #endif /* _TRACE_BLOCK_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 // SPDX-License-Identifier: GPL-2.0-or-later /* Socket buffer accounting * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/net.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include "ar-internal.h" #define select_skb_count(skb) (&rxrpc_n_rx_skbs) /* * Note the allocation or reception of a socket buffer. */ void rxrpc_new_skb(struct sk_buff *skb, enum rxrpc_skb_trace why) { int n = atomic_inc_return(select_skb_count(skb)); trace_rxrpc_skb(skb, refcount_read(&skb->users), n, why); } /* * Note the re-emergence of a socket buffer from a queue or buffer. */ void rxrpc_see_skb(struct sk_buff *skb, enum rxrpc_skb_trace why) { if (skb) { int n = atomic_read(select_skb_count(skb)); trace_rxrpc_skb(skb, refcount_read(&skb->users), n, why); } } /* * Note the addition of a ref on a socket buffer. */ void rxrpc_get_skb(struct sk_buff *skb, enum rxrpc_skb_trace why) { int n = atomic_inc_return(select_skb_count(skb)); trace_rxrpc_skb(skb, refcount_read(&skb->users), n, why); skb_get(skb); } /* * Note the dropping of a ref on a socket buffer by the core. */ void rxrpc_eaten_skb(struct sk_buff *skb, enum rxrpc_skb_trace why) { int n = atomic_inc_return(&rxrpc_n_rx_skbs); trace_rxrpc_skb(skb, 0, n, why); } /* * Note the destruction of a socket buffer. */ void rxrpc_free_skb(struct sk_buff *skb, enum rxrpc_skb_trace why) { if (skb) { int n = atomic_dec_return(select_skb_count(skb)); trace_rxrpc_skb(skb, refcount_read(&skb->users), n, why); consume_skb(skb); } } /* * Clear a queue of socket buffers. */ void rxrpc_purge_queue(struct sk_buff_head *list) { struct sk_buff *skb; while ((skb = skb_dequeue((list))) != NULL) { int n = atomic_dec_return(select_skb_count(skb)); trace_rxrpc_skb(skb, refcount_read(&skb->users), n, rxrpc_skb_put_purge); consume_skb(skb); } }
106 498 498 496 498 497 496 496 496 479 480 112 112 112 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001 Intel Corp. * * This file is part of the SCTP kernel implementation * * This file contains sctp stream maniuplation primitives and helpers. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Xin Long <lucien.xin@gmail.com> */ #include <linux/list.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/stream_sched.h> static void sctp_stream_shrink_out(struct sctp_stream *stream, __u16 outcnt) { struct sctp_association *asoc; struct sctp_chunk *ch, *temp; struct sctp_outq *outq; asoc = container_of(stream, struct sctp_association, stream); outq = &asoc->outqueue; list_for_each_entry_safe(ch, temp, &outq->out_chunk_list, list) { __u16 sid = sctp_chunk_stream_no(ch); if (sid < outcnt) continue; sctp_sched_dequeue_common(outq, ch); /* No need to call dequeue_done here because * the chunks are not scheduled by now. */ /* Mark as failed send. */ sctp_chunk_fail(ch, (__force __u32)SCTP_ERROR_INV_STRM); if (asoc->peer.prsctp_capable && SCTP_PR_PRIO_ENABLED(ch->sinfo.sinfo_flags)) asoc->sent_cnt_removable--; sctp_chunk_free(ch); } } static void sctp_stream_free_ext(struct sctp_stream *stream, __u16 sid) { struct sctp_sched_ops *sched; if (!SCTP_SO(stream, sid)->ext) return; sched = sctp_sched_ops_from_stream(stream); sched->free_sid(stream, sid); kfree(SCTP_SO(stream, sid)->ext); SCTP_SO(stream, sid)->ext = NULL; } /* Migrates chunks from stream queues to new stream queues if needed, * but not across associations. Also, removes those chunks to streams * higher than the new max. */ static void sctp_stream_outq_migrate(struct sctp_stream *stream, struct sctp_stream *new, __u16 outcnt) { int i; if (stream->outcnt > outcnt) sctp_stream_shrink_out(stream, outcnt); if (new) { /* Here we actually move the old ext stuff into the new * buffer, because we want to keep it. Then * sctp_stream_update will swap ->out pointers. */ for (i = 0; i < outcnt; i++) { sctp_stream_free_ext(new, i); SCTP_SO(new, i)->ext = SCTP_SO(stream, i)->ext; SCTP_SO(stream, i)->ext = NULL; } } for (i = outcnt; i < stream->outcnt; i++) sctp_stream_free_ext(stream, i); } static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt, gfp_t gfp) { int ret; if (outcnt <= stream->outcnt) goto out; ret = genradix_prealloc(&stream->out, outcnt, gfp); if (ret) return ret; out: stream->outcnt = outcnt; return 0; } static int sctp_stream_alloc_in(struct sctp_stream *stream, __u16 incnt, gfp_t gfp) { int ret; if (incnt <= stream->incnt) goto out; ret = genradix_prealloc(&stream->in, incnt, gfp); if (ret) return ret; out: stream->incnt = incnt; return 0; } int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, gfp_t gfp) { struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); int i, ret = 0; gfp |= __GFP_NOWARN; /* Initial stream->out size may be very big, so free it and alloc * a new one with new outcnt to save memory if needed. */ if (outcnt == stream->outcnt) goto handle_in; /* Filter out chunks queued on streams that won't exist anymore */ sched->unsched_all(stream); sctp_stream_outq_migrate(stream, NULL, outcnt); sched->sched_all(stream); ret = sctp_stream_alloc_out(stream, outcnt, gfp); if (ret) return ret; for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; handle_in: sctp_stream_interleave_init(stream); if (!incnt) return 0; return sctp_stream_alloc_in(stream, incnt, gfp); } int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid) { struct sctp_stream_out_ext *soute; int ret; soute = kzalloc(sizeof(*soute), GFP_KERNEL); if (!soute) return -ENOMEM; SCTP_SO(stream, sid)->ext = soute; ret = sctp_sched_init_sid(stream, sid, GFP_KERNEL); if (ret) { kfree(SCTP_SO(stream, sid)->ext); SCTP_SO(stream, sid)->ext = NULL; } return ret; } void sctp_stream_free(struct sctp_stream *stream) { struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); int i; sched->unsched_all(stream); for (i = 0; i < stream->outcnt; i++) sctp_stream_free_ext(stream, i); genradix_free(&stream->out); genradix_free(&stream->in); } void sctp_stream_clear(struct sctp_stream *stream) { int i; for (i = 0; i < stream->outcnt; i++) { SCTP_SO(stream, i)->mid = 0; SCTP_SO(stream, i)->mid_uo = 0; } for (i = 0; i < stream->incnt; i++) SCTP_SI(stream, i)->mid = 0; } void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new) { struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); sched->unsched_all(stream); sctp_stream_outq_migrate(stream, new, new->outcnt); sctp_stream_free(stream); stream->out = new->out; stream->in = new->in; stream->outcnt = new->outcnt; stream->incnt = new->incnt; sched->sched_all(stream); new->out.tree.root = NULL; new->in.tree.root = NULL; new->outcnt = 0; new->incnt = 0; } static int sctp_send_reconf(struct sctp_association *asoc, struct sctp_chunk *chunk) { int retval = 0; retval = sctp_primitive_RECONF(asoc->base.net, asoc, chunk); if (retval) sctp_chunk_free(chunk); return retval; } static bool sctp_stream_outq_is_empty(struct sctp_stream *stream, __u16 str_nums, __be16 *str_list) { struct sctp_association *asoc; __u16 i; asoc = container_of(stream, struct sctp_association, stream); if (!asoc->outqueue.out_qlen) return true; if (!str_nums) return false; for (i = 0; i < str_nums; i++) { __u16 sid = ntohs(str_list[i]); if (SCTP_SO(stream, sid)->ext && !list_empty(&SCTP_SO(stream, sid)->ext->outq)) return false; } return true; } int sctp_send_reset_streams(struct sctp_association *asoc, struct sctp_reset_streams *params) { struct sctp_stream *stream = &asoc->stream; __u16 i, str_nums, *str_list; struct sctp_chunk *chunk; int retval = -EINVAL; __be16 *nstr_list; bool out, in; if (!asoc->peer.reconf_capable || !(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ)) { retval = -ENOPROTOOPT; goto out; } if (asoc->strreset_outstanding) { retval = -EINPROGRESS; goto out; } out = params->srs_flags & SCTP_STREAM_RESET_OUTGOING; in = params->srs_flags & SCTP_STREAM_RESET_INCOMING; if (!out && !in) goto out; str_nums = params->srs_number_streams; str_list = params->srs_stream_list; if (str_nums) { int param_len = 0; if (out) { for (i = 0; i < str_nums; i++) if (str_list[i] >= stream->outcnt) goto out; param_len = str_nums * sizeof(__u16) + sizeof(struct sctp_strreset_outreq); } if (in) { for (i = 0; i < str_nums; i++) if (str_list[i] >= stream->incnt) goto out; param_len += str_nums * sizeof(__u16) + sizeof(struct sctp_strreset_inreq); } if (param_len > SCTP_MAX_CHUNK_LEN - sizeof(struct sctp_reconf_chunk)) goto out; } nstr_list = kcalloc(str_nums, sizeof(__be16), GFP_KERNEL); if (!nstr_list) { retval = -ENOMEM; goto out; } for (i = 0; i < str_nums; i++) nstr_list[i] = htons(str_list[i]); if (out && !sctp_stream_outq_is_empty(stream, str_nums, nstr_list)) { kfree(nstr_list); retval = -EAGAIN; goto out; } chunk = sctp_make_strreset_req(asoc, str_nums, nstr_list, out, in); kfree(nstr_list); if (!chunk) { retval = -ENOMEM; goto out; } if (out) { if (str_nums) for (i = 0; i < str_nums; i++) SCTP_SO(stream, str_list[i])->state = SCTP_STREAM_CLOSED; else for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_CLOSED; } asoc->strreset_chunk = chunk; sctp_chunk_hold(asoc->strreset_chunk); retval = sctp_send_reconf(asoc, chunk); if (retval) { sctp_chunk_put(asoc->strreset_chunk); asoc->strreset_chunk = NULL; if (!out) goto out; if (str_nums) for (i = 0; i < str_nums; i++) SCTP_SO(stream, str_list[i])->state = SCTP_STREAM_OPEN; else for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; goto out; } asoc->strreset_outstanding = out + in; out: return retval; } int sctp_send_reset_assoc(struct sctp_association *asoc) { struct sctp_stream *stream = &asoc->stream; struct sctp_chunk *chunk = NULL; int retval; __u16 i; if (!asoc->peer.reconf_capable || !(asoc->strreset_enable & SCTP_ENABLE_RESET_ASSOC_REQ)) return -ENOPROTOOPT; if (asoc->strreset_outstanding) return -EINPROGRESS; if (!sctp_outq_is_empty(&asoc->outqueue)) return -EAGAIN; chunk = sctp_make_strreset_tsnreq(asoc); if (!chunk) return -ENOMEM; /* Block further xmit of data until this request is completed */ for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_CLOSED; asoc->strreset_chunk = chunk; sctp_chunk_hold(asoc->strreset_chunk); retval = sctp_send_reconf(asoc, chunk); if (retval) { sctp_chunk_put(asoc->strreset_chunk); asoc->strreset_chunk = NULL; for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; return retval; } asoc->strreset_outstanding = 1; return 0; } int sctp_send_add_streams(struct sctp_association *asoc, struct sctp_add_streams *params) { struct sctp_stream *stream = &asoc->stream; struct sctp_chunk *chunk = NULL; int retval; __u32 outcnt, incnt; __u16 out, in; if (!asoc->peer.reconf_capable || !(asoc->strreset_enable & SCTP_ENABLE_CHANGE_ASSOC_REQ)) { retval = -ENOPROTOOPT; goto out; } if (asoc->strreset_outstanding) { retval = -EINPROGRESS; goto out; } out = params->sas_outstrms; in = params->sas_instrms; outcnt = stream->outcnt + out; incnt = stream->incnt + in; if (outcnt > SCTP_MAX_STREAM || incnt > SCTP_MAX_STREAM || (!out && !in)) { retval = -EINVAL; goto out; } if (out) { retval = sctp_stream_alloc_out(stream, outcnt, GFP_KERNEL); if (retval) goto out; } chunk = sctp_make_strreset_addstrm(asoc, out, in); if (!chunk) { retval = -ENOMEM; goto out; } asoc->strreset_chunk = chunk; sctp_chunk_hold(asoc->strreset_chunk); retval = sctp_send_reconf(asoc, chunk); if (retval) { sctp_chunk_put(asoc->strreset_chunk); asoc->strreset_chunk = NULL; goto out; } asoc->strreset_outstanding = !!out + !!in; out: return retval; } static struct sctp_paramhdr *sctp_chunk_lookup_strreset_param( struct sctp_association *asoc, __be32 resp_seq, __be16 type) { struct sctp_chunk *chunk = asoc->strreset_chunk; struct sctp_reconf_chunk *hdr; union sctp_params param; if (!chunk) return NULL; hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr; sctp_walk_params(param, hdr) { /* sctp_strreset_tsnreq is actually the basic structure * of all stream reconf params, so it's safe to use it * to access request_seq. */ struct sctp_strreset_tsnreq *req = param.v; if ((!resp_seq || req->request_seq == resp_seq) && (!type || type == req->param_hdr.type)) return param.v; } return NULL; } static void sctp_update_strreset_result(struct sctp_association *asoc, __u32 result) { asoc->strreset_result[1] = asoc->strreset_result[0]; asoc->strreset_result[0] = result; } struct sctp_chunk *sctp_process_strreset_outreq( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp) { struct sctp_strreset_outreq *outreq = param.v; struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; __be16 *str_p = NULL; __u32 request_seq; __u16 i, nums; request_seq = ntohl(outreq->request_seq); if (ntohl(outreq->send_reset_at_tsn) > sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map)) { result = SCTP_STRRESET_IN_PROGRESS; goto err; } if (TSN_lt(asoc->strreset_inseq, request_seq) || TSN_lt(request_seq, asoc->strreset_inseq - 2)) { result = SCTP_STRRESET_ERR_BAD_SEQNO; goto err; } else if (TSN_lt(request_seq, asoc->strreset_inseq)) { i = asoc->strreset_inseq - request_seq - 1; result = asoc->strreset_result[i]; goto err; } asoc->strreset_inseq++; /* Check strreset_enable after inseq inc, as sender cannot tell * the peer doesn't enable strreset after receiving response with * result denied, as well as to keep consistent with bsd. */ if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ)) goto out; nums = (ntohs(param.p->length) - sizeof(*outreq)) / sizeof(__u16); str_p = outreq->list_of_streams; for (i = 0; i < nums; i++) { if (ntohs(str_p[i]) >= stream->incnt) { result = SCTP_STRRESET_ERR_WRONG_SSN; goto out; } } if (asoc->strreset_chunk) { if (!sctp_chunk_lookup_strreset_param( asoc, outreq->response_seq, SCTP_PARAM_RESET_IN_REQUEST)) { /* same process with outstanding isn't 0 */ result = SCTP_STRRESET_ERR_IN_PROGRESS; goto out; } asoc->strreset_outstanding--; asoc->strreset_outseq++; if (!asoc->strreset_outstanding) { struct sctp_transport *t; t = asoc->strreset_chunk->transport; if (del_timer(&t->reconf_timer)) sctp_transport_put(t); sctp_chunk_put(asoc->strreset_chunk); asoc->strreset_chunk = NULL; } } if (nums) for (i = 0; i < nums; i++) SCTP_SI(stream, ntohs(str_p[i]))->mid = 0; else for (i = 0; i < stream->incnt; i++) SCTP_SI(stream, i)->mid = 0; result = SCTP_STRRESET_PERFORMED; *evp = sctp_ulpevent_make_stream_reset_event(asoc, SCTP_STREAM_RESET_INCOMING_SSN, nums, str_p, GFP_ATOMIC); out: sctp_update_strreset_result(asoc, result); err: return sctp_make_strreset_resp(asoc, result, request_seq); } struct sctp_chunk *sctp_process_strreset_inreq( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp) { struct sctp_strreset_inreq *inreq = param.v; struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; struct sctp_chunk *chunk = NULL; __u32 request_seq; __u16 i, nums; __be16 *str_p; request_seq = ntohl(inreq->request_seq); if (TSN_lt(asoc->strreset_inseq, request_seq) || TSN_lt(request_seq, asoc->strreset_inseq - 2)) { result = SCTP_STRRESET_ERR_BAD_SEQNO; goto err; } else if (TSN_lt(request_seq, asoc->strreset_inseq)) { i = asoc->strreset_inseq - request_seq - 1; result = asoc->strreset_result[i]; if (result == SCTP_STRRESET_PERFORMED) return NULL; goto err; } asoc->strreset_inseq++; if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ)) goto out; if (asoc->strreset_outstanding) { result = SCTP_STRRESET_ERR_IN_PROGRESS; goto out; } nums = (ntohs(param.p->length) - sizeof(*inreq)) / sizeof(__u16); str_p = inreq->list_of_streams; for (i = 0; i < nums; i++) { if (ntohs(str_p[i]) >= stream->outcnt) { result = SCTP_STRRESET_ERR_WRONG_SSN; goto out; } } if (!sctp_stream_outq_is_empty(stream, nums, str_p)) { result = SCTP_STRRESET_IN_PROGRESS; asoc->strreset_inseq--; goto err; } chunk = sctp_make_strreset_req(asoc, nums, str_p, 1, 0); if (!chunk) goto out; if (nums) for (i = 0; i < nums; i++) SCTP_SO(stream, ntohs(str_p[i]))->state = SCTP_STREAM_CLOSED; else for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_CLOSED; asoc->strreset_chunk = chunk; asoc->strreset_outstanding = 1; sctp_chunk_hold(asoc->strreset_chunk); result = SCTP_STRRESET_PERFORMED; out: sctp_update_strreset_result(asoc, result); err: if (!chunk) chunk = sctp_make_strreset_resp(asoc, result, request_seq); return chunk; } struct sctp_chunk *sctp_process_strreset_tsnreq( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp) { __u32 init_tsn = 0, next_tsn = 0, max_tsn_seen; struct sctp_strreset_tsnreq *tsnreq = param.v; struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; __u32 request_seq; __u16 i; request_seq = ntohl(tsnreq->request_seq); if (TSN_lt(asoc->strreset_inseq, request_seq) || TSN_lt(request_seq, asoc->strreset_inseq - 2)) { result = SCTP_STRRESET_ERR_BAD_SEQNO; goto err; } else if (TSN_lt(request_seq, asoc->strreset_inseq)) { i = asoc->strreset_inseq - request_seq - 1; result = asoc->strreset_result[i]; if (result == SCTP_STRRESET_PERFORMED) { next_tsn = asoc->ctsn_ack_point + 1; init_tsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map) + 1; } goto err; } if (!sctp_outq_is_empty(&asoc->outqueue)) { result = SCTP_STRRESET_IN_PROGRESS; goto err; } asoc->strreset_inseq++; if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_ASSOC_REQ)) goto out; if (asoc->strreset_outstanding) { result = SCTP_STRRESET_ERR_IN_PROGRESS; goto out; } /* G4: The same processing as though a FWD-TSN chunk (as defined in * [RFC3758]) with all streams affected and a new cumulative TSN * ACK of the Receiver's Next TSN minus 1 were received MUST be * performed. */ max_tsn_seen = sctp_tsnmap_get_max_tsn_seen(&asoc->peer.tsn_map); asoc->stream.si->report_ftsn(&asoc->ulpq, max_tsn_seen); /* G1: Compute an appropriate value for the Receiver's Next TSN -- the * TSN that the peer should use to send the next DATA chunk. The * value SHOULD be the smallest TSN not acknowledged by the * receiver of the request plus 2^31. */ init_tsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map) + (1 << 31); sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL, init_tsn, GFP_ATOMIC); /* G3: The same processing as though a SACK chunk with no gap report * and a cumulative TSN ACK of the Sender's Next TSN minus 1 were * received MUST be performed. */ sctp_outq_free(&asoc->outqueue); /* G2: Compute an appropriate value for the local endpoint's next TSN, * i.e., the next TSN assigned by the receiver of the SSN/TSN reset * chunk. The value SHOULD be the highest TSN sent by the receiver * of the request plus 1. */ next_tsn = asoc->next_tsn; asoc->ctsn_ack_point = next_tsn - 1; asoc->adv_peer_ack_point = asoc->ctsn_ack_point; /* G5: The next expected and outgoing SSNs MUST be reset to 0 for all * incoming and outgoing streams. */ for (i = 0; i < stream->outcnt; i++) { SCTP_SO(stream, i)->mid = 0; SCTP_SO(stream, i)->mid_uo = 0; } for (i = 0; i < stream->incnt; i++) SCTP_SI(stream, i)->mid = 0; result = SCTP_STRRESET_PERFORMED; *evp = sctp_ulpevent_make_assoc_reset_event(asoc, 0, init_tsn, next_tsn, GFP_ATOMIC); out: sctp_update_strreset_result(asoc, result); err: return sctp_make_strreset_tsnresp(asoc, result, request_seq, next_tsn, init_tsn); } struct sctp_chunk *sctp_process_strreset_addstrm_out( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp) { struct sctp_strreset_addstrm *addstrm = param.v; struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; __u32 request_seq, incnt; __u16 in, i; request_seq = ntohl(addstrm->request_seq); if (TSN_lt(asoc->strreset_inseq, request_seq) || TSN_lt(request_seq, asoc->strreset_inseq - 2)) { result = SCTP_STRRESET_ERR_BAD_SEQNO; goto err; } else if (TSN_lt(request_seq, asoc->strreset_inseq)) { i = asoc->strreset_inseq - request_seq - 1; result = asoc->strreset_result[i]; goto err; } asoc->strreset_inseq++; if (!(asoc->strreset_enable & SCTP_ENABLE_CHANGE_ASSOC_REQ)) goto out; in = ntohs(addstrm->number_of_streams); incnt = stream->incnt + in; if (!in || incnt > SCTP_MAX_STREAM) goto out; if (sctp_stream_alloc_in(stream, incnt, GFP_ATOMIC)) goto out; if (asoc->strreset_chunk) { if (!sctp_chunk_lookup_strreset_param( asoc, 0, SCTP_PARAM_RESET_ADD_IN_STREAMS)) { /* same process with outstanding isn't 0 */ result = SCTP_STRRESET_ERR_IN_PROGRESS; goto out; } asoc->strreset_outstanding--; asoc->strreset_outseq++; if (!asoc->strreset_outstanding) { struct sctp_transport *t; t = asoc->strreset_chunk->transport; if (del_timer(&t->reconf_timer)) sctp_transport_put(t); sctp_chunk_put(asoc->strreset_chunk); asoc->strreset_chunk = NULL; } } stream->incnt = incnt; result = SCTP_STRRESET_PERFORMED; *evp = sctp_ulpevent_make_stream_change_event(asoc, 0, ntohs(addstrm->number_of_streams), 0, GFP_ATOMIC); out: sctp_update_strreset_result(asoc, result); err: return sctp_make_strreset_resp(asoc, result, request_seq); } struct sctp_chunk *sctp_process_strreset_addstrm_in( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp) { struct sctp_strreset_addstrm *addstrm = param.v; struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; struct sctp_chunk *chunk = NULL; __u32 request_seq, outcnt; __u16 out, i; int ret; request_seq = ntohl(addstrm->request_seq); if (TSN_lt(asoc->strreset_inseq, request_seq) || TSN_lt(request_seq, asoc->strreset_inseq - 2)) { result = SCTP_STRRESET_ERR_BAD_SEQNO; goto err; } else if (TSN_lt(request_seq, asoc->strreset_inseq)) { i = asoc->strreset_inseq - request_seq - 1; result = asoc->strreset_result[i]; if (result == SCTP_STRRESET_PERFORMED) return NULL; goto err; } asoc->strreset_inseq++; if (!(asoc->strreset_enable & SCTP_ENABLE_CHANGE_ASSOC_REQ)) goto out; if (asoc->strreset_outstanding) { result = SCTP_STRRESET_ERR_IN_PROGRESS; goto out; } out = ntohs(addstrm->number_of_streams); outcnt = stream->outcnt + out; if (!out || outcnt > SCTP_MAX_STREAM) goto out; ret = sctp_stream_alloc_out(stream, outcnt, GFP_ATOMIC); if (ret) goto out; chunk = sctp_make_strreset_addstrm(asoc, out, 0); if (!chunk) goto out; asoc->strreset_chunk = chunk; asoc->strreset_outstanding = 1; sctp_chunk_hold(asoc->strreset_chunk); stream->outcnt = outcnt; result = SCTP_STRRESET_PERFORMED; out: sctp_update_strreset_result(asoc, result); err: if (!chunk) chunk = sctp_make_strreset_resp(asoc, result, request_seq); return chunk; } struct sctp_chunk *sctp_process_strreset_resp( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp) { struct sctp_stream *stream = &asoc->stream; struct sctp_strreset_resp *resp = param.v; struct sctp_transport *t; __u16 i, nums, flags = 0; struct sctp_paramhdr *req; __u32 result; req = sctp_chunk_lookup_strreset_param(asoc, resp->response_seq, 0); if (!req) return NULL; result = ntohl(resp->result); if (result != SCTP_STRRESET_PERFORMED) { /* if in progress, do nothing but retransmit */ if (result == SCTP_STRRESET_IN_PROGRESS) return NULL; else if (result == SCTP_STRRESET_DENIED) flags = SCTP_STREAM_RESET_DENIED; else flags = SCTP_STREAM_RESET_FAILED; } if (req->type == SCTP_PARAM_RESET_OUT_REQUEST) { struct sctp_strreset_outreq *outreq; __be16 *str_p; outreq = (struct sctp_strreset_outreq *)req; str_p = outreq->list_of_streams; nums = (ntohs(outreq->param_hdr.length) - sizeof(*outreq)) / sizeof(__u16); if (result == SCTP_STRRESET_PERFORMED) { struct sctp_stream_out *sout; if (nums) { for (i = 0; i < nums; i++) { sout = SCTP_SO(stream, ntohs(str_p[i])); sout->mid = 0; sout->mid_uo = 0; } } else { for (i = 0; i < stream->outcnt; i++) { sout = SCTP_SO(stream, i); sout->mid = 0; sout->mid_uo = 0; } } } flags |= SCTP_STREAM_RESET_OUTGOING_SSN; for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; *evp = sctp_ulpevent_make_stream_reset_event(asoc, flags, nums, str_p, GFP_ATOMIC); } else if (req->type == SCTP_PARAM_RESET_IN_REQUEST) { struct sctp_strreset_inreq *inreq; __be16 *str_p; /* if the result is performed, it's impossible for inreq */ if (result == SCTP_STRRESET_PERFORMED) return NULL; inreq = (struct sctp_strreset_inreq *)req; str_p = inreq->list_of_streams; nums = (ntohs(inreq->param_hdr.length) - sizeof(*inreq)) / sizeof(__u16); flags |= SCTP_STREAM_RESET_INCOMING_SSN; *evp = sctp_ulpevent_make_stream_reset_event(asoc, flags, nums, str_p, GFP_ATOMIC); } else if (req->type == SCTP_PARAM_RESET_TSN_REQUEST) { struct sctp_strreset_resptsn *resptsn; __u32 stsn, rtsn; /* check for resptsn, as sctp_verify_reconf didn't do it*/ if (ntohs(param.p->length) != sizeof(*resptsn)) return NULL; resptsn = (struct sctp_strreset_resptsn *)resp; stsn = ntohl(resptsn->senders_next_tsn); rtsn = ntohl(resptsn->receivers_next_tsn); if (result == SCTP_STRRESET_PERFORMED) { __u32 mtsn = sctp_tsnmap_get_max_tsn_seen( &asoc->peer.tsn_map); LIST_HEAD(temp); asoc->stream.si->report_ftsn(&asoc->ulpq, mtsn); sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL, stsn, GFP_ATOMIC); /* Clean up sacked and abandoned queues only. As the * out_chunk_list may not be empty, splice it to temp, * then get it back after sctp_outq_free is done. */ list_splice_init(&asoc->outqueue.out_chunk_list, &temp); sctp_outq_free(&asoc->outqueue); list_splice_init(&temp, &asoc->outqueue.out_chunk_list); asoc->next_tsn = rtsn; asoc->ctsn_ack_point = asoc->next_tsn - 1; asoc->adv_peer_ack_point = asoc->ctsn_ack_point; for (i = 0; i < stream->outcnt; i++) { SCTP_SO(stream, i)->mid = 0; SCTP_SO(stream, i)->mid_uo = 0; } for (i = 0; i < stream->incnt; i++) SCTP_SI(stream, i)->mid = 0; } for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; *evp = sctp_ulpevent_make_assoc_reset_event(asoc, flags, stsn, rtsn, GFP_ATOMIC); } else if (req->type == SCTP_PARAM_RESET_ADD_OUT_STREAMS) { struct sctp_strreset_addstrm *addstrm; __u16 number; addstrm = (struct sctp_strreset_addstrm *)req; nums = ntohs(addstrm->number_of_streams); number = stream->outcnt - nums; if (result == SCTP_STRRESET_PERFORMED) { for (i = number; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; } else { sctp_stream_shrink_out(stream, number); stream->outcnt = number; } *evp = sctp_ulpevent_make_stream_change_event(asoc, flags, 0, nums, GFP_ATOMIC); } else if (req->type == SCTP_PARAM_RESET_ADD_IN_STREAMS) { struct sctp_strreset_addstrm *addstrm; /* if the result is performed, it's impossible for addstrm in * request. */ if (result == SCTP_STRRESET_PERFORMED) return NULL; addstrm = (struct sctp_strreset_addstrm *)req; nums = ntohs(addstrm->number_of_streams); *evp = sctp_ulpevent_make_stream_change_event(asoc, flags, nums, 0, GFP_ATOMIC); } asoc->strreset_outstanding--; asoc->strreset_outseq++; /* remove everything for this reconf request */ if (!asoc->strreset_outstanding) { t = asoc->strreset_chunk->transport; if (del_timer(&t->reconf_timer)) sctp_transport_put(t); sctp_chunk_put(asoc->strreset_chunk); asoc->strreset_chunk = NULL; } return NULL; }
2 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 #ifndef LLC_H #define LLC_H /* * Copyright (c) 1997 by Procom Technology, Inc. * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include <linux/if.h> #include <linux/if_ether.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/rculist_nulls.h> #include <linux/hash.h> #include <linux/jhash.h> #include <linux/atomic.h> struct net_device; struct packet_type; struct sk_buff; struct llc_addr { unsigned char lsap; unsigned char mac[IFHWADDRLEN]; }; #define LLC_SAP_STATE_INACTIVE 1 #define LLC_SAP_STATE_ACTIVE 2 #define LLC_SK_DEV_HASH_BITS 6 #define LLC_SK_DEV_HASH_ENTRIES (1<<LLC_SK_DEV_HASH_BITS) #define LLC_SK_LADDR_HASH_BITS 6 #define LLC_SK_LADDR_HASH_ENTRIES (1<<LLC_SK_LADDR_HASH_BITS) /** * struct llc_sap - Defines the SAP component * * @station - station this sap belongs to * @state - sap state * @p_bit - only lowest-order bit used * @f_bit - only lowest-order bit used * @laddr - SAP value in this 'lsap' * @node - entry in station sap_list * @sk_list - LLC sockets this one manages */ struct llc_sap { unsigned char state; unsigned char p_bit; unsigned char f_bit; refcount_t refcnt; int (*rcv_func)(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); struct llc_addr laddr; struct list_head node; spinlock_t sk_lock; int sk_count; struct hlist_nulls_head sk_laddr_hash[LLC_SK_LADDR_HASH_ENTRIES]; struct hlist_head sk_dev_hash[LLC_SK_DEV_HASH_ENTRIES]; struct rcu_head rcu; }; static inline struct hlist_head *llc_sk_dev_hash(struct llc_sap *sap, int ifindex) { u32 bucket = hash_32(ifindex, LLC_SK_DEV_HASH_BITS); return &sap->sk_dev_hash[bucket]; } static inline u32 llc_sk_laddr_hashfn(struct llc_sap *sap, const struct llc_addr *laddr) { return hash_32(jhash(laddr->mac, sizeof(laddr->mac), 0), LLC_SK_LADDR_HASH_BITS); } static inline struct hlist_nulls_head *llc_sk_laddr_hash(struct llc_sap *sap, const struct llc_addr *laddr) { return &sap->sk_laddr_hash[llc_sk_laddr_hashfn(sap, laddr)]; } #define LLC_DEST_INVALID 0 /* Invalid LLC PDU type */ #define LLC_DEST_SAP 1 /* Type 1 goes here */ #define LLC_DEST_CONN 2 /* Type 2 goes here */ extern struct list_head llc_sap_list; int llc_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); int llc_mac_hdr_init(struct sk_buff *skb, const unsigned char *sa, const unsigned char *da); void llc_add_pack(int type, void (*handler)(struct llc_sap *sap, struct sk_buff *skb)); void llc_remove_pack(int type); void llc_set_station_handler(void (*handler)(struct sk_buff *skb)); struct llc_sap *llc_sap_open(unsigned char lsap, int (*rcv)(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)); static inline void llc_sap_hold(struct llc_sap *sap) { refcount_inc(&sap->refcnt); } static inline bool llc_sap_hold_safe(struct llc_sap *sap) { return refcount_inc_not_zero(&sap->refcnt); } void llc_sap_close(struct llc_sap *sap); static inline void llc_sap_put(struct llc_sap *sap) { if (refcount_dec_and_test(&sap->refcnt)) llc_sap_close(sap); } struct llc_sap *llc_sap_find(unsigned char sap_value); int llc_build_and_send_ui_pkt(struct llc_sap *sap, struct sk_buff *skb, const unsigned char *dmac, unsigned char dsap); void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb); void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb); void llc_station_init(void); void llc_station_exit(void); #ifdef CONFIG_PROC_FS int llc_proc_init(void); void llc_proc_exit(void); #else #define llc_proc_init() (0) #define llc_proc_exit() do { } while(0) #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_SYSCTL int llc_sysctl_init(void); void llc_sysctl_exit(void); extern int sysctl_llc2_ack_timeout; extern int sysctl_llc2_busy_timeout; extern int sysctl_llc2_p_timeout; extern int sysctl_llc2_rej_timeout; #else #define llc_sysctl_init() (0) #define llc_sysctl_exit() do { } while(0) #endif /* CONFIG_SYSCTL */ #endif /* LLC_H */
1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 // SPDX-License-Identifier: GPL-2.0-or-later /* * CALIPSO - Common Architecture Label IPv6 Security Option * * This is an implementation of the CALIPSO protocol as specified in * RFC 5570. * * Authors: Paul Moore <paul.moore@hp.com> * Huw Davies <huw@codeweavers.com> */ /* (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008 * (c) Copyright Huw Davies <huw@codeweavers.com>, 2015 */ #include <linux/init.h> #include <linux/types.h> #include <linux/rcupdate.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/string.h> #include <linux/jhash.h> #include <linux/audit.h> #include <linux/slab.h> #include <net/ip.h> #include <net/icmp.h> #include <net/tcp.h> #include <net/netlabel.h> #include <net/calipso.h> #include <linux/atomic.h> #include <linux/bug.h> #include <asm/unaligned.h> #include <linux/crc-ccitt.h> /* Maximium size of the calipso option including * the two-byte TLV header. */ #define CALIPSO_OPT_LEN_MAX (2 + 252) /* Size of the minimum calipso option including * the two-byte TLV header. */ #define CALIPSO_HDR_LEN (2 + 8) /* Maximium size of the calipso option including * the two-byte TLV header and upto 3 bytes of * leading pad and 7 bytes of trailing pad. */ #define CALIPSO_OPT_LEN_MAX_WITH_PAD (3 + CALIPSO_OPT_LEN_MAX + 7) /* Maximium size of u32 aligned buffer required to hold calipso * option. Max of 3 initial pad bytes starting from buffer + 3. * i.e. the worst case is when the previous tlv finishes on 4n + 3. */ #define CALIPSO_MAX_BUFFER (6 + CALIPSO_OPT_LEN_MAX) /* List of available DOI definitions */ static DEFINE_SPINLOCK(calipso_doi_list_lock); static LIST_HEAD(calipso_doi_list); /* Label mapping cache */ int calipso_cache_enabled = 1; int calipso_cache_bucketsize = 10; #define CALIPSO_CACHE_BUCKETBITS 7 #define CALIPSO_CACHE_BUCKETS BIT(CALIPSO_CACHE_BUCKETBITS) #define CALIPSO_CACHE_REORDERLIMIT 10 struct calipso_map_cache_bkt { spinlock_t lock; u32 size; struct list_head list; }; struct calipso_map_cache_entry { u32 hash; unsigned char *key; size_t key_len; struct netlbl_lsm_cache *lsm_data; u32 activity; struct list_head list; }; static struct calipso_map_cache_bkt *calipso_cache; static void calipso_cache_invalidate(void); static void calipso_doi_putdef(struct calipso_doi *doi_def); /* Label Mapping Cache Functions */ /** * calipso_cache_entry_free - Frees a cache entry * @entry: the entry to free * * Description: * This function frees the memory associated with a cache entry including the * LSM cache data if there are no longer any users, i.e. reference count == 0. * */ static void calipso_cache_entry_free(struct calipso_map_cache_entry *entry) { if (entry->lsm_data) netlbl_secattr_cache_free(entry->lsm_data); kfree(entry->key); kfree(entry); } /** * calipso_map_cache_hash - Hashing function for the CALIPSO cache * @key: the hash key * @key_len: the length of the key in bytes * * Description: * The CALIPSO tag hashing function. Returns a 32-bit hash value. * */ static u32 calipso_map_cache_hash(const unsigned char *key, u32 key_len) { return jhash(key, key_len, 0); } /** * calipso_cache_init - Initialize the CALIPSO cache * * Description: * Initializes the CALIPSO label mapping cache, this function should be called * before any of the other functions defined in this file. Returns zero on * success, negative values on error. * */ static int __init calipso_cache_init(void) { u32 iter; calipso_cache = kcalloc(CALIPSO_CACHE_BUCKETS, sizeof(struct calipso_map_cache_bkt), GFP_KERNEL); if (!calipso_cache) return -ENOMEM; for (iter = 0; iter < CALIPSO_CACHE_BUCKETS; iter++) { spin_lock_init(&calipso_cache[iter].lock); calipso_cache[iter].size = 0; INIT_LIST_HEAD(&calipso_cache[iter].list); } return 0; } /** * calipso_cache_invalidate - Invalidates the current CALIPSO cache * * Description: * Invalidates and frees any entries in the CALIPSO cache. Returns zero on * success and negative values on failure. * */ static void calipso_cache_invalidate(void) { struct calipso_map_cache_entry *entry, *tmp_entry; u32 iter; for (iter = 0; iter < CALIPSO_CACHE_BUCKETS; iter++) { spin_lock_bh(&calipso_cache[iter].lock); list_for_each_entry_safe(entry, tmp_entry, &calipso_cache[iter].list, list) { list_del(&entry->list); calipso_cache_entry_free(entry); } calipso_cache[iter].size = 0; spin_unlock_bh(&calipso_cache[iter].lock); } } /** * calipso_cache_check - Check the CALIPSO cache for a label mapping * @key: the buffer to check * @key_len: buffer length in bytes * @secattr: the security attribute struct to use * * Description: * This function checks the cache to see if a label mapping already exists for * the given key. If there is a match then the cache is adjusted and the * @secattr struct is populated with the correct LSM security attributes. The * cache is adjusted in the following manner if the entry is not already the * first in the cache bucket: * * 1. The cache entry's activity counter is incremented * 2. The previous (higher ranking) entry's activity counter is decremented * 3. If the difference between the two activity counters is geater than * CALIPSO_CACHE_REORDERLIMIT the two entries are swapped * * Returns zero on success, -ENOENT for a cache miss, and other negative values * on error. * */ static int calipso_cache_check(const unsigned char *key, u32 key_len, struct netlbl_lsm_secattr *secattr) { u32 bkt; struct calipso_map_cache_entry *entry; struct calipso_map_cache_entry *prev_entry = NULL; u32 hash; if (!calipso_cache_enabled) return -ENOENT; hash = calipso_map_cache_hash(key, key_len); bkt = hash & (CALIPSO_CACHE_BUCKETS - 1); spin_lock_bh(&calipso_cache[bkt].lock); list_for_each_entry(entry, &calipso_cache[bkt].list, list) { if (entry->hash == hash && entry->key_len == key_len && memcmp(entry->key, key, key_len) == 0) { entry->activity += 1; refcount_inc(&entry->lsm_data->refcount); secattr->cache = entry->lsm_data; secattr->flags |= NETLBL_SECATTR_CACHE; secattr->type = NETLBL_NLTYPE_CALIPSO; if (!prev_entry) { spin_unlock_bh(&calipso_cache[bkt].lock); return 0; } if (prev_entry->activity > 0) prev_entry->activity -= 1; if (entry->activity > prev_entry->activity && entry->activity - prev_entry->activity > CALIPSO_CACHE_REORDERLIMIT) { __list_del(entry->list.prev, entry->list.next); __list_add(&entry->list, prev_entry->list.prev, &prev_entry->list); } spin_unlock_bh(&calipso_cache[bkt].lock); return 0; } prev_entry = entry; } spin_unlock_bh(&calipso_cache[bkt].lock); return -ENOENT; } /** * calipso_cache_add - Add an entry to the CALIPSO cache * @calipso_ptr: the CALIPSO option * @secattr: the packet's security attributes * * Description: * Add a new entry into the CALIPSO label mapping cache. Add the new entry to * head of the cache bucket's list, if the cache bucket is out of room remove * the last entry in the list first. It is important to note that there is * currently no checking for duplicate keys. Returns zero on success, * negative values on failure. The key stored starts at calipso_ptr + 2, * i.e. the type and length bytes are not stored, this corresponds to * calipso_ptr[1] bytes of data. * */ static int calipso_cache_add(const unsigned char *calipso_ptr, const struct netlbl_lsm_secattr *secattr) { int ret_val = -EPERM; u32 bkt; struct calipso_map_cache_entry *entry = NULL; struct calipso_map_cache_entry *old_entry = NULL; u32 calipso_ptr_len; if (!calipso_cache_enabled || calipso_cache_bucketsize <= 0) return 0; calipso_ptr_len = calipso_ptr[1]; entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) return -ENOMEM; entry->key = kmemdup(calipso_ptr + 2, calipso_ptr_len, GFP_ATOMIC); if (!entry->key) { ret_val = -ENOMEM; goto cache_add_failure; } entry->key_len = calipso_ptr_len; entry->hash = calipso_map_cache_hash(calipso_ptr, calipso_ptr_len); refcount_inc(&secattr->cache->refcount); entry->lsm_data = secattr->cache; bkt = entry->hash & (CALIPSO_CACHE_BUCKETS - 1); spin_lock_bh(&calipso_cache[bkt].lock); if (calipso_cache[bkt].size < calipso_cache_bucketsize) { list_add(&entry->list, &calipso_cache[bkt].list); calipso_cache[bkt].size += 1; } else { old_entry = list_entry(calipso_cache[bkt].list.prev, struct calipso_map_cache_entry, list); list_del(&old_entry->list); list_add(&entry->list, &calipso_cache[bkt].list); calipso_cache_entry_free(old_entry); } spin_unlock_bh(&calipso_cache[bkt].lock); return 0; cache_add_failure: if (entry) calipso_cache_entry_free(entry); return ret_val; } /* DOI List Functions */ /** * calipso_doi_search - Searches for a DOI definition * @doi: the DOI to search for * * Description: * Search the DOI definition list for a DOI definition with a DOI value that * matches @doi. The caller is responsible for calling rcu_read_[un]lock(). * Returns a pointer to the DOI definition on success and NULL on failure. */ static struct calipso_doi *calipso_doi_search(u32 doi) { struct calipso_doi *iter; list_for_each_entry_rcu(iter, &calipso_doi_list, list) if (iter->doi == doi && refcount_read(&iter->refcount)) return iter; return NULL; } /** * calipso_doi_add - Add a new DOI to the CALIPSO protocol engine * @doi_def: the DOI structure * @audit_info: NetLabel audit information * * Description: * The caller defines a new DOI for use by the CALIPSO engine and calls this * function to add it to the list of acceptable domains. The caller must * ensure that the mapping table specified in @doi_def->map meets all of the * requirements of the mapping type (see calipso.h for details). Returns * zero on success and non-zero on failure. * */ static int calipso_doi_add(struct calipso_doi *doi_def, struct netlbl_audit *audit_info) { int ret_val = -EINVAL; u32 doi; u32 doi_type; struct audit_buffer *audit_buf; doi = doi_def->doi; doi_type = doi_def->type; if (doi_def->doi == CALIPSO_DOI_UNKNOWN) goto doi_add_return; refcount_set(&doi_def->refcount, 1); spin_lock(&calipso_doi_list_lock); if (calipso_doi_search(doi_def->doi)) { spin_unlock(&calipso_doi_list_lock); ret_val = -EEXIST; goto doi_add_return; } list_add_tail_rcu(&doi_def->list, &calipso_doi_list); spin_unlock(&calipso_doi_list_lock); ret_val = 0; doi_add_return: audit_buf = netlbl_audit_start(AUDIT_MAC_CALIPSO_ADD, audit_info); if (audit_buf) { const char *type_str; switch (doi_type) { case CALIPSO_MAP_PASS: type_str = "pass"; break; default: type_str = "(unknown)"; } audit_log_format(audit_buf, " calipso_doi=%u calipso_type=%s res=%u", doi, type_str, ret_val == 0 ? 1 : 0); audit_log_end(audit_buf); } return ret_val; } /** * calipso_doi_free - Frees a DOI definition * @doi_def: the DOI definition * * Description: * This function frees all of the memory associated with a DOI definition. * */ static void calipso_doi_free(struct calipso_doi *doi_def) { kfree(doi_def); } /** * calipso_doi_free_rcu - Frees a DOI definition via the RCU pointer * @entry: the entry's RCU field * * Description: * This function is designed to be used as a callback to the call_rcu() * function so that the memory allocated to the DOI definition can be released * safely. * */ static void calipso_doi_free_rcu(struct rcu_head *entry) { struct calipso_doi *doi_def; doi_def = container_of(entry, struct calipso_doi, rcu); calipso_doi_free(doi_def); } /** * calipso_doi_remove - Remove an existing DOI from the CALIPSO protocol engine * @doi: the DOI value * @audit_info: NetLabel audit information * * Description: * Removes a DOI definition from the CALIPSO engine. The NetLabel routines will * be called to release their own LSM domain mappings as well as our own * domain list. Returns zero on success and negative values on failure. * */ static int calipso_doi_remove(u32 doi, struct netlbl_audit *audit_info) { int ret_val; struct calipso_doi *doi_def; struct audit_buffer *audit_buf; spin_lock(&calipso_doi_list_lock); doi_def = calipso_doi_search(doi); if (!doi_def) { spin_unlock(&calipso_doi_list_lock); ret_val = -ENOENT; goto doi_remove_return; } list_del_rcu(&doi_def->list); spin_unlock(&calipso_doi_list_lock); calipso_doi_putdef(doi_def); ret_val = 0; doi_remove_return: audit_buf = netlbl_audit_start(AUDIT_MAC_CALIPSO_DEL, audit_info); if (audit_buf) { audit_log_format(audit_buf, " calipso_doi=%u res=%u", doi, ret_val == 0 ? 1 : 0); audit_log_end(audit_buf); } return ret_val; } /** * calipso_doi_getdef - Returns a reference to a valid DOI definition * @doi: the DOI value * * Description: * Searches for a valid DOI definition and if one is found it is returned to * the caller. Otherwise NULL is returned. The caller must ensure that * calipso_doi_putdef() is called when the caller is done. * */ static struct calipso_doi *calipso_doi_getdef(u32 doi) { struct calipso_doi *doi_def; rcu_read_lock(); doi_def = calipso_doi_search(doi); if (!doi_def) goto doi_getdef_return; if (!refcount_inc_not_zero(&doi_def->refcount)) doi_def = NULL; doi_getdef_return: rcu_read_unlock(); return doi_def; } /** * calipso_doi_putdef - Releases a reference for the given DOI definition * @doi_def: the DOI definition * * Description: * Releases a DOI definition reference obtained from calipso_doi_getdef(). * */ static void calipso_doi_putdef(struct calipso_doi *doi_def) { if (!doi_def) return; if (!refcount_dec_and_test(&doi_def->refcount)) return; calipso_cache_invalidate(); call_rcu(&doi_def->rcu, calipso_doi_free_rcu); } /** * calipso_doi_walk - Iterate through the DOI definitions * @skip_cnt: skip past this number of DOI definitions, updated * @callback: callback for each DOI definition * @cb_arg: argument for the callback function * * Description: * Iterate over the DOI definition list, skipping the first @skip_cnt entries. * For each entry call @callback, if @callback returns a negative value stop * 'walking' through the list and return. Updates the value in @skip_cnt upon * return. Returns zero on success, negative values on failure. * */ static int calipso_doi_walk(u32 *skip_cnt, int (*callback)(struct calipso_doi *doi_def, void *arg), void *cb_arg) { int ret_val = -ENOENT; u32 doi_cnt = 0; struct calipso_doi *iter_doi; rcu_read_lock(); list_for_each_entry_rcu(iter_doi, &calipso_doi_list, list) if (refcount_read(&iter_doi->refcount) > 0) { if (doi_cnt++ < *skip_cnt) continue; ret_val = callback(iter_doi, cb_arg); if (ret_val < 0) { doi_cnt--; goto doi_walk_return; } } doi_walk_return: rcu_read_unlock(); *skip_cnt = doi_cnt; return ret_val; } /** * calipso_validate - Validate a CALIPSO option * @skb: the packet * @option: the start of the option * * Description: * This routine is called to validate a CALIPSO option. * If the option is valid then %true is returned, otherwise * %false is returned. * * The caller should have already checked that the length of the * option (including the TLV header) is >= 10 and that the catmap * length is consistent with the option length. * * We leave checks on the level and categories to the socket layer. */ bool calipso_validate(const struct sk_buff *skb, const unsigned char *option) { struct calipso_doi *doi_def; bool ret_val; u16 crc, len = option[1] + 2; static const u8 zero[2]; /* The original CRC runs over the option including the TLV header * with the CRC-16 field (at offset 8) zeroed out. */ crc = crc_ccitt(0xffff, option, 8); crc = crc_ccitt(crc, zero, sizeof(zero)); if (len > 10) crc = crc_ccitt(crc, option + 10, len - 10); crc = ~crc; if (option[8] != (crc & 0xff) || option[9] != ((crc >> 8) & 0xff)) return false; rcu_read_lock(); doi_def = calipso_doi_search(get_unaligned_be32(option + 2)); ret_val = !!doi_def; rcu_read_unlock(); return ret_val; } /** * calipso_map_cat_hton - Perform a category mapping from host to network * @doi_def: the DOI definition * @secattr: the security attributes * @net_cat: the zero'd out category bitmap in network/CALIPSO format * @net_cat_len: the length of the CALIPSO bitmap in bytes * * Description: * Perform a label mapping to translate a local MLS category bitmap to the * correct CALIPSO bitmap using the given DOI definition. Returns the minimum * size in bytes of the network bitmap on success, negative values otherwise. * */ static int calipso_map_cat_hton(const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *net_cat, u32 net_cat_len) { int spot = -1; u32 net_spot_max = 0; u32 net_clen_bits = net_cat_len * 8; for (;;) { spot = netlbl_catmap_walk(secattr->attr.mls.cat, spot + 1); if (spot < 0) break; if (spot >= net_clen_bits) return -ENOSPC; netlbl_bitmap_setbit(net_cat, spot, 1); if (spot > net_spot_max) net_spot_max = spot; } return (net_spot_max / 32 + 1) * 4; } /** * calipso_map_cat_ntoh - Perform a category mapping from network to host * @doi_def: the DOI definition * @net_cat: the category bitmap in network/CALIPSO format * @net_cat_len: the length of the CALIPSO bitmap in bytes * @secattr: the security attributes * * Description: * Perform a label mapping to translate a CALIPSO bitmap to the correct local * MLS category bitmap using the given DOI definition. Returns zero on * success, negative values on failure. * */ static int calipso_map_cat_ntoh(const struct calipso_doi *doi_def, const unsigned char *net_cat, u32 net_cat_len, struct netlbl_lsm_secattr *secattr) { int ret_val; int spot = -1; u32 net_clen_bits = net_cat_len * 8; for (;;) { spot = netlbl_bitmap_walk(net_cat, net_clen_bits, spot + 1, 1); if (spot < 0) { if (spot == -2) return -EFAULT; return 0; } ret_val = netlbl_catmap_setbit(&secattr->attr.mls.cat, spot, GFP_ATOMIC); if (ret_val != 0) return ret_val; } return -EINVAL; } /** * calipso_pad_write - Writes pad bytes in TLV format * @buf: the buffer * @offset: offset from start of buffer to write padding * @count: number of pad bytes to write * * Description: * Write @count bytes of TLV padding into @buffer starting at offset @offset. * @count should be less than 8 - see RFC 4942. * */ static int calipso_pad_write(unsigned char *buf, unsigned int offset, unsigned int count) { if (WARN_ON_ONCE(count >= 8)) return -EINVAL; switch (count) { case 0: break; case 1: buf[offset] = IPV6_TLV_PAD1; break; default: buf[offset] = IPV6_TLV_PADN; buf[offset + 1] = count - 2; if (count > 2) memset(buf + offset + 2, 0, count - 2); break; } return 0; } /** * calipso_genopt - Generate a CALIPSO option * @buf: the option buffer * @start: offset from which to write * @buf_len: the size of opt_buf * @doi_def: the CALIPSO DOI to use * @secattr: the security attributes * * Description: * Generate a CALIPSO option using the DOI definition and security attributes * passed to the function. This also generates upto three bytes of leading * padding that ensures that the option is 4n + 2 aligned. It returns the * number of bytes written (including any initial padding). */ static int calipso_genopt(unsigned char *buf, u32 start, u32 buf_len, const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val; u32 len, pad; u16 crc; static const unsigned char padding[4] = {2, 1, 0, 3}; unsigned char *calipso; /* CALIPSO has 4n + 2 alignment */ pad = padding[start & 3]; if (buf_len <= start + pad + CALIPSO_HDR_LEN) return -ENOSPC; if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0) return -EPERM; len = CALIPSO_HDR_LEN; if (secattr->flags & NETLBL_SECATTR_MLS_CAT) { ret_val = calipso_map_cat_hton(doi_def, secattr, buf + start + pad + len, buf_len - start - pad - len); if (ret_val < 0) return ret_val; len += ret_val; } calipso_pad_write(buf, start, pad); calipso = buf + start + pad; calipso[0] = IPV6_TLV_CALIPSO; calipso[1] = len - 2; *(__be32 *)(calipso + 2) = htonl(doi_def->doi); calipso[6] = (len - CALIPSO_HDR_LEN) / 4; calipso[7] = secattr->attr.mls.lvl; crc = ~crc_ccitt(0xffff, calipso, len); calipso[8] = crc & 0xff; calipso[9] = (crc >> 8) & 0xff; return pad + len; } /* Hop-by-hop hdr helper functions */ /** * calipso_opt_update - Replaces socket's hop options with a new set * @sk: the socket * @hop: new hop options * * Description: * Replaces @sk's hop options with @hop. @hop may be NULL to leave * the socket with no hop options. * */ static int calipso_opt_update(struct sock *sk, struct ipv6_opt_hdr *hop) { struct ipv6_txoptions *old = txopt_get(inet6_sk(sk)), *txopts; txopts = ipv6_renew_options(sk, old, IPV6_HOPOPTS, hop); txopt_put(old); if (IS_ERR(txopts)) return PTR_ERR(txopts); txopts = ipv6_update_options(sk, txopts); if (txopts) { atomic_sub(txopts->tot_len, &sk->sk_omem_alloc); txopt_put(txopts); } return 0; } /** * calipso_tlv_len - Returns the length of the TLV * @opt: the option header * @offset: offset of the TLV within the header * * Description: * Returns the length of the TLV option at offset @offset within * the option header @opt. Checks that the entire TLV fits inside * the option header, returns a negative value if this is not the case. */ static int calipso_tlv_len(struct ipv6_opt_hdr *opt, unsigned int offset) { unsigned char *tlv = (unsigned char *)opt; unsigned int opt_len = ipv6_optlen(opt), tlv_len; if (offset < sizeof(*opt) || offset >= opt_len) return -EINVAL; if (tlv[offset] == IPV6_TLV_PAD1) return 1; if (offset + 1 >= opt_len) return -EINVAL; tlv_len = tlv[offset + 1] + 2; if (offset + tlv_len > opt_len) return -EINVAL; return tlv_len; } /** * calipso_opt_find - Finds the CALIPSO option in an IPv6 hop options header * @hop: the hop options header * @start: on return holds the offset of any leading padding * @end: on return holds the offset of the first non-pad TLV after CALIPSO * * Description: * Finds the space occupied by a CALIPSO option (including any leading and * trailing padding). * * If a CALIPSO option exists set @start and @end to the * offsets within @hop of the start of padding before the first * CALIPSO option and the end of padding after the first CALIPSO * option. In this case the function returns 0. * * In the absence of a CALIPSO option, @start and @end will be * set to the start and end of any trailing padding in the header. * This is useful when appending a new option, as the caller may want * to overwrite some of this padding. In this case the function will * return -ENOENT. */ static int calipso_opt_find(struct ipv6_opt_hdr *hop, unsigned int *start, unsigned int *end) { int ret_val = -ENOENT, tlv_len; unsigned int opt_len, offset, offset_s = 0, offset_e = 0; unsigned char *opt = (unsigned char *)hop; opt_len = ipv6_optlen(hop); offset = sizeof(*hop); while (offset < opt_len) { tlv_len = calipso_tlv_len(hop, offset); if (tlv_len < 0) return tlv_len; switch (opt[offset]) { case IPV6_TLV_PAD1: case IPV6_TLV_PADN: if (offset_e) offset_e = offset; break; case IPV6_TLV_CALIPSO: ret_val = 0; offset_e = offset; break; default: if (offset_e == 0) offset_s = offset; else goto out; } offset += tlv_len; } out: if (offset_s) *start = offset_s + calipso_tlv_len(hop, offset_s); else *start = sizeof(*hop); if (offset_e) *end = offset_e + calipso_tlv_len(hop, offset_e); else *end = opt_len; return ret_val; } /** * calipso_opt_insert - Inserts a CALIPSO option into an IPv6 hop opt hdr * @hop: the original hop options header * @doi_def: the CALIPSO DOI to use * @secattr: the specific security attributes of the socket * * Description: * Creates a new hop options header based on @hop with a * CALIPSO option added to it. If @hop already contains a CALIPSO * option this is overwritten, otherwise the new option is appended * after any existing options. If @hop is NULL then the new header * will contain just the CALIPSO option and any needed padding. * */ static struct ipv6_opt_hdr * calipso_opt_insert(struct ipv6_opt_hdr *hop, const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { unsigned int start, end, buf_len, pad, hop_len; struct ipv6_opt_hdr *new; int ret_val; if (hop) { hop_len = ipv6_optlen(hop); ret_val = calipso_opt_find(hop, &start, &end); if (ret_val && ret_val != -ENOENT) return ERR_PTR(ret_val); } else { hop_len = 0; start = sizeof(*hop); end = 0; } buf_len = hop_len + start - end + CALIPSO_OPT_LEN_MAX_WITH_PAD; new = kzalloc(buf_len, GFP_ATOMIC); if (!new) return ERR_PTR(-ENOMEM); if (start > sizeof(*hop)) memcpy(new, hop, start); ret_val = calipso_genopt((unsigned char *)new, start, buf_len, doi_def, secattr); if (ret_val < 0) { kfree(new); return ERR_PTR(ret_val); } buf_len = start + ret_val; /* At this point buf_len aligns to 4n, so (buf_len & 4) pads to 8n */ pad = ((buf_len & 4) + (end & 7)) & 7; calipso_pad_write((unsigned char *)new, buf_len, pad); buf_len += pad; if (end != hop_len) { memcpy((char *)new + buf_len, (char *)hop + end, hop_len - end); buf_len += hop_len - end; } new->nexthdr = 0; new->hdrlen = buf_len / 8 - 1; return new; } /** * calipso_opt_del - Removes the CALIPSO option from an option header * @hop: the original header * @new: the new header * * Description: * Creates a new header based on @hop without any CALIPSO option. If @hop * doesn't contain a CALIPSO option it returns -ENOENT. If @hop contains * no other non-padding options, it returns zero with @new set to NULL. * Otherwise it returns zero, creates a new header without the CALIPSO * option (and removing as much padding as possible) and returns with * @new set to that header. * */ static int calipso_opt_del(struct ipv6_opt_hdr *hop, struct ipv6_opt_hdr **new) { int ret_val; unsigned int start, end, delta, pad, hop_len; ret_val = calipso_opt_find(hop, &start, &end); if (ret_val) return ret_val; hop_len = ipv6_optlen(hop); if (start == sizeof(*hop) && end == hop_len) { /* There's no other option in the header so return NULL */ *new = NULL; return 0; } delta = (end - start) & ~7; *new = kzalloc(hop_len - delta, GFP_ATOMIC); if (!*new) return -ENOMEM; memcpy(*new, hop, start); (*new)->hdrlen -= delta / 8; pad = (end - start) & 7; calipso_pad_write((unsigned char *)*new, start, pad); if (end != hop_len) memcpy((char *)*new + start + pad, (char *)hop + end, hop_len - end); return 0; } /** * calipso_opt_getattr - Get the security attributes from a memory block * @calipso: the CALIPSO option * @secattr: the security attributes * * Description: * Inspect @calipso and return the security attributes in @secattr. * Returns zero on success and negative values on failure. * */ static int calipso_opt_getattr(const unsigned char *calipso, struct netlbl_lsm_secattr *secattr) { int ret_val = -ENOMSG; u32 doi, len = calipso[1], cat_len = calipso[6] * 4; struct calipso_doi *doi_def; if (cat_len + 8 > len) return -EINVAL; if (calipso_cache_check(calipso + 2, calipso[1], secattr) == 0) return 0; doi = get_unaligned_be32(calipso + 2); rcu_read_lock(); doi_def = calipso_doi_search(doi); if (!doi_def) goto getattr_return; secattr->attr.mls.lvl = calipso[7]; secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (cat_len) { ret_val = calipso_map_cat_ntoh(doi_def, calipso + 10, cat_len, secattr); if (ret_val != 0) { netlbl_catmap_free(secattr->attr.mls.cat); goto getattr_return; } if (secattr->attr.mls.cat) secattr->flags |= NETLBL_SECATTR_MLS_CAT; } secattr->type = NETLBL_NLTYPE_CALIPSO; getattr_return: rcu_read_unlock(); return ret_val; } /* sock functions. */ /** * calipso_sock_getattr - Get the security attributes from a sock * @sk: the sock * @secattr: the security attributes * * Description: * Query @sk to see if there is a CALIPSO option attached to the sock and if * there is return the CALIPSO security attributes in @secattr. This function * requires that @sk be locked, or privately held, but it does not do any * locking itself. Returns zero on success and negative values on failure. * */ static int calipso_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) { struct ipv6_opt_hdr *hop; int opt_len, len, ret_val = -ENOMSG, offset; unsigned char *opt; struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk)); if (!txopts || !txopts->hopopt) goto done; hop = txopts->hopopt; opt = (unsigned char *)hop; opt_len = ipv6_optlen(hop); offset = sizeof(*hop); while (offset < opt_len) { len = calipso_tlv_len(hop, offset); if (len < 0) { ret_val = len; goto done; } switch (opt[offset]) { case IPV6_TLV_CALIPSO: if (len < CALIPSO_HDR_LEN) ret_val = -EINVAL; else ret_val = calipso_opt_getattr(&opt[offset], secattr); goto done; default: offset += len; break; } } done: txopt_put(txopts); return ret_val; } /** * calipso_sock_setattr - Add a CALIPSO option to a socket * @sk: the socket * @doi_def: the CALIPSO DOI to use * @secattr: the specific security attributes of the socket * * Description: * Set the CALIPSO option on the given socket using the DOI definition and * security attributes passed to the function. This function requires * exclusive access to @sk, which means it either needs to be in the * process of being created or locked. Returns zero on success and negative * values on failure. * */ static int calipso_sock_setattr(struct sock *sk, const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val; struct ipv6_opt_hdr *old, *new; struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk)); old = NULL; if (txopts) old = txopts->hopopt; new = calipso_opt_insert(old, doi_def, secattr); txopt_put(txopts); if (IS_ERR(new)) return PTR_ERR(new); ret_val = calipso_opt_update(sk, new); kfree(new); return ret_val; } /** * calipso_sock_delattr - Delete the CALIPSO option from a socket * @sk: the socket * * Description: * Removes the CALIPSO option from a socket, if present. * */ static void calipso_sock_delattr(struct sock *sk) { struct ipv6_opt_hdr *new_hop; struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk)); if (!txopts || !txopts->hopopt) goto done; if (calipso_opt_del(txopts->hopopt, &new_hop)) goto done; calipso_opt_update(sk, new_hop); kfree(new_hop); done: txopt_put(txopts); } /* request sock functions. */ /** * calipso_req_setattr - Add a CALIPSO option to a connection request socket * @req: the connection request socket * @doi_def: the CALIPSO DOI to use * @secattr: the specific security attributes of the socket * * Description: * Set the CALIPSO option on the given socket using the DOI definition and * security attributes passed to the function. Returns zero on success and * negative values on failure. * */ static int calipso_req_setattr(struct request_sock *req, const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { struct ipv6_txoptions *txopts; struct inet_request_sock *req_inet = inet_rsk(req); struct ipv6_opt_hdr *old, *new; struct sock *sk = sk_to_full_sk(req_to_sk(req)); if (req_inet->ipv6_opt && req_inet->ipv6_opt->hopopt) old = req_inet->ipv6_opt->hopopt; else old = NULL; new = calipso_opt_insert(old, doi_def, secattr); if (IS_ERR(new)) return PTR_ERR(new); txopts = ipv6_renew_options(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, new); kfree(new); if (IS_ERR(txopts)) return PTR_ERR(txopts); txopts = xchg(&req_inet->ipv6_opt, txopts); if (txopts) { atomic_sub(txopts->tot_len, &sk->sk_omem_alloc); txopt_put(txopts); } return 0; } /** * calipso_req_delattr - Delete the CALIPSO option from a request socket * @req: the request socket * * Description: * Removes the CALIPSO option from a request socket, if present. * */ static void calipso_req_delattr(struct request_sock *req) { struct inet_request_sock *req_inet = inet_rsk(req); struct ipv6_opt_hdr *new; struct ipv6_txoptions *txopts; struct sock *sk = sk_to_full_sk(req_to_sk(req)); if (!req_inet->ipv6_opt || !req_inet->ipv6_opt->hopopt) return; if (calipso_opt_del(req_inet->ipv6_opt->hopopt, &new)) return; /* Nothing to do */ txopts = ipv6_renew_options(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, new); if (!IS_ERR(txopts)) { txopts = xchg(&req_inet->ipv6_opt, txopts); if (txopts) { atomic_sub(txopts->tot_len, &sk->sk_omem_alloc); txopt_put(txopts); } } kfree(new); } /* skbuff functions. */ /** * calipso_skbuff_optptr - Find the CALIPSO option in the packet * @skb: the packet * * Description: * Parse the packet's IP header looking for a CALIPSO option. Returns a pointer * to the start of the CALIPSO option on success, NULL if one if not found. * */ static unsigned char *calipso_skbuff_optptr(const struct sk_buff *skb) { const struct ipv6hdr *ip6_hdr = ipv6_hdr(skb); int offset; if (ip6_hdr->nexthdr != NEXTHDR_HOP) return NULL; offset = ipv6_find_tlv(skb, sizeof(*ip6_hdr), IPV6_TLV_CALIPSO); if (offset >= 0) return (unsigned char *)ip6_hdr + offset; return NULL; } /** * calipso_skbuff_setattr - Set the CALIPSO option on a packet * @skb: the packet * @doi_def: the CALIPSO DOI to use * @secattr: the security attributes * * Description: * Set the CALIPSO option on the given packet based on the security attributes. * Returns a pointer to the IP header on success and NULL on failure. * */ static int calipso_skbuff_setattr(struct sk_buff *skb, const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val; struct ipv6hdr *ip6_hdr; struct ipv6_opt_hdr *hop; unsigned char buf[CALIPSO_MAX_BUFFER]; int len_delta, new_end, pad, payload; unsigned int start, end; ip6_hdr = ipv6_hdr(skb); if (ip6_hdr->nexthdr == NEXTHDR_HOP) { hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1); ret_val = calipso_opt_find(hop, &start, &end); if (ret_val && ret_val != -ENOENT) return ret_val; } else { start = 0; end = 0; } memset(buf, 0, sizeof(buf)); ret_val = calipso_genopt(buf, start & 3, sizeof(buf), doi_def, secattr); if (ret_val < 0) return ret_val; new_end = start + ret_val; /* At this point new_end aligns to 4n, so (new_end & 4) pads to 8n */ pad = ((new_end & 4) + (end & 7)) & 7; len_delta = new_end - (int)end + pad; ret_val = skb_cow(skb, skb_headroom(skb) + len_delta); if (ret_val < 0) return ret_val; ip6_hdr = ipv6_hdr(skb); /* Reset as skb_cow() may have moved it */ if (len_delta) { if (len_delta > 0) skb_push(skb, len_delta); else skb_pull(skb, -len_delta); memmove((char *)ip6_hdr - len_delta, ip6_hdr, sizeof(*ip6_hdr) + start); skb_reset_network_header(skb); ip6_hdr = ipv6_hdr(skb); payload = ntohs(ip6_hdr->payload_len); ip6_hdr->payload_len = htons(payload + len_delta); } hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1); if (start == 0) { struct ipv6_opt_hdr *new_hop = (struct ipv6_opt_hdr *)buf; new_hop->nexthdr = ip6_hdr->nexthdr; new_hop->hdrlen = len_delta / 8 - 1; ip6_hdr->nexthdr = NEXTHDR_HOP; } else { hop->hdrlen += len_delta / 8; } memcpy((char *)hop + start, buf + (start & 3), new_end - start); calipso_pad_write((unsigned char *)hop, new_end, pad); return 0; } /** * calipso_skbuff_delattr - Delete any CALIPSO options from a packet * @skb: the packet * * Description: * Removes any and all CALIPSO options from the given packet. Returns zero on * success, negative values on failure. * */ static int calipso_skbuff_delattr(struct sk_buff *skb) { int ret_val; struct ipv6hdr *ip6_hdr; struct ipv6_opt_hdr *old_hop; u32 old_hop_len, start = 0, end = 0, delta, size, pad; if (!calipso_skbuff_optptr(skb)) return 0; /* since we are changing the packet we should make a copy */ ret_val = skb_cow(skb, skb_headroom(skb)); if (ret_val < 0) return ret_val; ip6_hdr = ipv6_hdr(skb); old_hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1); old_hop_len = ipv6_optlen(old_hop); ret_val = calipso_opt_find(old_hop, &start, &end); if (ret_val) return ret_val; if (start == sizeof(*old_hop) && end == old_hop_len) { /* There's no other option in the header so we delete * the whole thing. */ delta = old_hop_len; size = sizeof(*ip6_hdr); ip6_hdr->nexthdr = old_hop->nexthdr; } else { delta = (end - start) & ~7; if (delta) old_hop->hdrlen -= delta / 8; pad = (end - start) & 7; size = sizeof(*ip6_hdr) + start + pad; calipso_pad_write((unsigned char *)old_hop, start, pad); } if (delta) { skb_pull(skb, delta); memmove((char *)ip6_hdr + delta, ip6_hdr, size); skb_reset_network_header(skb); } return 0; } static const struct netlbl_calipso_ops ops = { .doi_add = calipso_doi_add, .doi_free = calipso_doi_free, .doi_remove = calipso_doi_remove, .doi_getdef = calipso_doi_getdef, .doi_putdef = calipso_doi_putdef, .doi_walk = calipso_doi_walk, .sock_getattr = calipso_sock_getattr, .sock_setattr = calipso_sock_setattr, .sock_delattr = calipso_sock_delattr, .req_setattr = calipso_req_setattr, .req_delattr = calipso_req_delattr, .opt_getattr = calipso_opt_getattr, .skbuff_optptr = calipso_skbuff_optptr, .skbuff_setattr = calipso_skbuff_setattr, .skbuff_delattr = calipso_skbuff_delattr, .cache_invalidate = calipso_cache_invalidate, .cache_add = calipso_cache_add }; /** * calipso_init - Initialize the CALIPSO module * * Description: * Initialize the CALIPSO module and prepare it for use. Returns zero on * success and negative values on failure. * */ int __init calipso_init(void) { int ret_val; ret_val = calipso_cache_init(); if (!ret_val) netlbl_calipso_ops_register(&ops); return ret_val; } void calipso_exit(void) { netlbl_calipso_ops_register(NULL); calipso_cache_invalidate(); kfree(calipso_cache); }
442 442 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 // SPDX-License-Identifier: LGPL-2.1 /* * Copyright (c) 2012 Taobao. * Written by Tao Ma <boyu.mt@taobao.com> */ #include <linux/iomap.h> #include <linux/fiemap.h> #include <linux/namei.h> #include <linux/iversion.h> #include <linux/sched/mm.h> #include "ext4_jbd2.h" #include "ext4.h" #include "xattr.h" #include "truncate.h" #define EXT4_XATTR_SYSTEM_DATA "data" #define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) #define EXT4_INLINE_DOTDOT_OFFSET 2 #define EXT4_INLINE_DOTDOT_SIZE 4 static int ext4_get_inline_size(struct inode *inode) { if (EXT4_I(inode)->i_inline_off) return EXT4_I(inode)->i_inline_size; return 0; } static int get_max_inline_xattr_value_size(struct inode *inode, struct ext4_iloc *iloc) { struct ext4_xattr_ibody_header *header; struct ext4_xattr_entry *entry; struct ext4_inode *raw_inode; void *end; int free, min_offs; if (!EXT4_INODE_HAS_XATTR_SPACE(inode)) return 0; min_offs = EXT4_SB(inode->i_sb)->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE - EXT4_I(inode)->i_extra_isize - sizeof(struct ext4_xattr_ibody_header); /* * We need to subtract another sizeof(__u32) since an in-inode xattr * needs an empty 4 bytes to indicate the gap between the xattr entry * and the name/value pair. */ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) return EXT4_XATTR_SIZE(min_offs - EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)) - EXT4_XATTR_ROUND - sizeof(__u32)); raw_inode = ext4_raw_inode(iloc); header = IHDR(inode, raw_inode); entry = IFIRST(header); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; /* Compute min_offs. */ while (!IS_LAST_ENTRY(entry)) { void *next = EXT4_XATTR_NEXT(entry); if (next >= end) { EXT4_ERROR_INODE(inode, "corrupt xattr in inline inode"); return 0; } if (!entry->e_value_inum && entry->e_value_size) { size_t offs = le16_to_cpu(entry->e_value_offs); if (offs < min_offs) min_offs = offs; } entry = next; } free = min_offs - ((void *)entry - (void *)IFIRST(header)) - sizeof(__u32); if (EXT4_I(inode)->i_inline_off) { entry = (struct ext4_xattr_entry *) ((void *)raw_inode + EXT4_I(inode)->i_inline_off); free += EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)); goto out; } free -= EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)); if (free > EXT4_XATTR_ROUND) free = EXT4_XATTR_SIZE(free - EXT4_XATTR_ROUND); else free = 0; out: return free; } /* * Get the maximum size we now can store in an inode. * If we can't find the space for a xattr entry, don't use the space * of the extents since we have no space to indicate the inline data. */ int ext4_get_max_inline_size(struct inode *inode) { int error, max_inline_size; struct ext4_iloc iloc; if (EXT4_I(inode)->i_extra_isize == 0) return 0; error = ext4_get_inode_loc(inode, &iloc); if (error) { ext4_error_inode_err(inode, __func__, __LINE__, 0, -error, "can't get inode location %lu", inode->i_ino); return 0; } down_read(&EXT4_I(inode)->xattr_sem); max_inline_size = get_max_inline_xattr_value_size(inode, &iloc); up_read(&EXT4_I(inode)->xattr_sem); brelse(iloc.bh); if (!max_inline_size) return 0; return max_inline_size + EXT4_MIN_INLINE_DATA_SIZE; } /* * this function does not take xattr_sem, which is OK because it is * currently only used in a code path coming form ext4_iget, before * the new inode has been unlocked */ int ext4_find_inline_data_nolock(struct inode *inode) { struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, }; struct ext4_xattr_info i = { .name_index = EXT4_XATTR_INDEX_SYSTEM, .name = EXT4_XATTR_SYSTEM_DATA, }; int error; if (EXT4_I(inode)->i_extra_isize == 0) return 0; error = ext4_get_inode_loc(inode, &is.iloc); if (error) return error; error = ext4_xattr_ibody_find(inode, &i, &is); if (error) goto out; if (!is.s.not_found) { if (is.s.here->e_value_inum) { EXT4_ERROR_INODE(inode, "inline data xattr refers " "to an external xattr inode"); error = -EFSCORRUPTED; goto out; } EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - (void *)ext4_raw_inode(&is.iloc)); EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE + le32_to_cpu(is.s.here->e_value_size); } out: brelse(is.iloc.bh); return error; } static int ext4_read_inline_data(struct inode *inode, void *buffer, unsigned int len, struct ext4_iloc *iloc) { struct ext4_xattr_entry *entry; struct ext4_xattr_ibody_header *header; int cp_len = 0; struct ext4_inode *raw_inode; if (!len) return 0; BUG_ON(len > EXT4_I(inode)->i_inline_size); cp_len = min_t(unsigned int, len, EXT4_MIN_INLINE_DATA_SIZE); raw_inode = ext4_raw_inode(iloc); memcpy(buffer, (void *)(raw_inode->i_block), cp_len); len -= cp_len; buffer += cp_len; if (!len) goto out; header = IHDR(inode, raw_inode); entry = (struct ext4_xattr_entry *)((void *)raw_inode + EXT4_I(inode)->i_inline_off); len = min_t(unsigned int, len, (unsigned int)le32_to_cpu(entry->e_value_size)); memcpy(buffer, (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs), len); cp_len += len; out: return cp_len; } /* * write the buffer to the inline inode. * If 'create' is set, we don't need to do the extra copy in the xattr * value since it is already handled by ext4_xattr_ibody_set. * That saves us one memcpy. */ static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, void *buffer, loff_t pos, unsigned int len) { struct ext4_xattr_entry *entry; struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; int cp_len = 0; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return; BUG_ON(!EXT4_I(inode)->i_inline_off); BUG_ON(pos + len > EXT4_I(inode)->i_inline_size); raw_inode = ext4_raw_inode(iloc); buffer += pos; if (pos < EXT4_MIN_INLINE_DATA_SIZE) { cp_len = pos + len > EXT4_MIN_INLINE_DATA_SIZE ? EXT4_MIN_INLINE_DATA_SIZE - pos : len; memcpy((void *)raw_inode->i_block + pos, buffer, cp_len); len -= cp_len; buffer += cp_len; pos += cp_len; } if (!len) return; pos -= EXT4_MIN_INLINE_DATA_SIZE; header = IHDR(inode, raw_inode); entry = (struct ext4_xattr_entry *)((void *)raw_inode + EXT4_I(inode)->i_inline_off); memcpy((void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs) + pos, buffer, len); } static int ext4_create_inline_data(handle_t *handle, struct inode *inode, unsigned len) { int error; void *value = NULL; struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, }; struct ext4_xattr_info i = { .name_index = EXT4_XATTR_INDEX_SYSTEM, .name = EXT4_XATTR_SYSTEM_DATA, }; error = ext4_get_inode_loc(inode, &is.iloc); if (error) return error; BUFFER_TRACE(is.iloc.bh, "get_write_access"); error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh, EXT4_JTR_NONE); if (error) goto out; if (len > EXT4_MIN_INLINE_DATA_SIZE) { value = EXT4_ZERO_XATTR_VALUE; len -= EXT4_MIN_INLINE_DATA_SIZE; } else { value = ""; len = 0; } /* Insert the xttr entry. */ i.value = value; i.value_len = len; error = ext4_xattr_ibody_find(inode, &i, &is); if (error) goto out; BUG_ON(!is.s.not_found); error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (error) { if (error == -ENOSPC) ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); goto out; } memset((void *)ext4_raw_inode(&is.iloc)->i_block, 0, EXT4_MIN_INLINE_DATA_SIZE); EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - (void *)ext4_raw_inode(&is.iloc)); EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE; ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA); get_bh(is.iloc.bh); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); out: brelse(is.iloc.bh); return error; } static int ext4_update_inline_data(handle_t *handle, struct inode *inode, unsigned int len) { int error; void *value = NULL; struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, }; struct ext4_xattr_info i = { .name_index = EXT4_XATTR_INDEX_SYSTEM, .name = EXT4_XATTR_SYSTEM_DATA, }; /* If the old space is ok, write the data directly. */ if (len <= EXT4_I(inode)->i_inline_size) return 0; error = ext4_get_inode_loc(inode, &is.iloc); if (error) return error; error = ext4_xattr_ibody_find(inode, &i, &is); if (error) goto out; BUG_ON(is.s.not_found); len -= EXT4_MIN_INLINE_DATA_SIZE; value = kzalloc(len, GFP_NOFS); if (!value) { error = -ENOMEM; goto out; } error = ext4_xattr_ibody_get(inode, i.name_index, i.name, value, len); if (error < 0) goto out; BUFFER_TRACE(is.iloc.bh, "get_write_access"); error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh, EXT4_JTR_NONE); if (error) goto out; /* Update the xattr entry. */ i.value = value; i.value_len = len; error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (error) goto out; EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - (void *)ext4_raw_inode(&is.iloc)); EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE + le32_to_cpu(is.s.here->e_value_size); ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); get_bh(is.iloc.bh); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); out: kfree(value); brelse(is.iloc.bh); return error; } static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, unsigned int len) { int ret, size, no_expand; struct ext4_inode_info *ei = EXT4_I(inode); if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) return -ENOSPC; size = ext4_get_max_inline_size(inode); if (size < len) return -ENOSPC; ext4_write_lock_xattr(inode, &no_expand); if (ei->i_inline_off) ret = ext4_update_inline_data(handle, inode, len); else ret = ext4_create_inline_data(handle, inode, len); ext4_write_unlock_xattr(inode, &no_expand); return ret; } static int ext4_destroy_inline_data_nolock(handle_t *handle, struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_xattr_ibody_find is = { .s = { .not_found = 0, }, }; struct ext4_xattr_info i = { .name_index = EXT4_XATTR_INDEX_SYSTEM, .name = EXT4_XATTR_SYSTEM_DATA, .value = NULL, .value_len = 0, }; int error; if (!ei->i_inline_off) return 0; error = ext4_get_inode_loc(inode, &is.iloc); if (error) return error; error = ext4_xattr_ibody_find(inode, &i, &is); if (error) goto out; BUFFER_TRACE(is.iloc.bh, "get_write_access"); error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh, EXT4_JTR_NONE); if (error) goto out; error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (error) goto out; memset((void *)ext4_raw_inode(&is.iloc)->i_block, 0, EXT4_MIN_INLINE_DATA_SIZE); memset(ei->i_data, 0, EXT4_MIN_INLINE_DATA_SIZE); if (ext4_has_feature_extents(inode->i_sb)) { if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) { ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); ext4_ext_tree_init(handle, inode); } } ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA); get_bh(is.iloc.bh); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); EXT4_I(inode)->i_inline_off = 0; EXT4_I(inode)->i_inline_size = 0; ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); out: brelse(is.iloc.bh); if (error == -ENODATA) error = 0; return error; } static int ext4_read_inline_folio(struct inode *inode, struct folio *folio) { void *kaddr; int ret = 0; size_t len; struct ext4_iloc iloc; BUG_ON(!folio_test_locked(folio)); BUG_ON(!ext4_has_inline_data(inode)); BUG_ON(folio->index); if (!EXT4_I(inode)->i_inline_off) { ext4_warning(inode->i_sb, "inode %lu doesn't have inline data.", inode->i_ino); goto out; } ret = ext4_get_inode_loc(inode, &iloc); if (ret) goto out; len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode)); BUG_ON(len > PAGE_SIZE); kaddr = kmap_local_folio(folio, 0); ret = ext4_read_inline_data(inode, kaddr, len, &iloc); flush_dcache_folio(folio); kunmap_local(kaddr); folio_zero_segment(folio, len, folio_size(folio)); folio_mark_uptodate(folio); brelse(iloc.bh); out: return ret; } int ext4_readpage_inline(struct inode *inode, struct folio *folio) { int ret = 0; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { up_read(&EXT4_I(inode)->xattr_sem); return -EAGAIN; } /* * Current inline data can only exist in the 1st page, * So for all the other pages, just set them uptodate. */ if (!folio->index) ret = ext4_read_inline_folio(inode, folio); else if (!folio_test_uptodate(folio)) { folio_zero_segment(folio, 0, folio_size(folio)); folio_mark_uptodate(folio); } up_read(&EXT4_I(inode)->xattr_sem); folio_unlock(folio); return ret >= 0 ? 0 : ret; } static int ext4_convert_inline_data_to_extent(struct address_space *mapping, struct inode *inode) { int ret, needed_blocks, no_expand; handle_t *handle = NULL; int retries = 0, sem_held = 0; struct folio *folio = NULL; unsigned from, to; struct ext4_iloc iloc; if (!ext4_has_inline_data(inode)) { /* * clear the flag so that no new write * will trap here again. */ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); return 0; } needed_blocks = ext4_writepage_trans_blocks(inode); ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; retry: handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); if (IS_ERR(handle)) { ret = PTR_ERR(handle); handle = NULL; goto out; } /* We cannot recurse into the filesystem as the transaction is already * started */ folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) { ret = PTR_ERR(folio); goto out_nofolio; } ext4_write_lock_xattr(inode, &no_expand); sem_held = 1; /* If some one has already done this for us, just exit. */ if (!ext4_has_inline_data(inode)) { ret = 0; goto out; } from = 0; to = ext4_get_inline_size(inode); if (!folio_test_uptodate(folio)) { ret = ext4_read_inline_folio(inode, folio); if (ret < 0) goto out; } ret = ext4_destroy_inline_data_nolock(handle, inode); if (ret) goto out; if (ext4_should_dioread_nolock(inode)) { ret = __block_write_begin(&folio->page, from, to, ext4_get_block_unwritten); } else ret = __block_write_begin(&folio->page, from, to, ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, folio_buffers(folio), from, to, NULL, do_journal_get_write_access); } if (ret) { folio_unlock(folio); folio_put(folio); folio = NULL; ext4_orphan_add(handle, inode); ext4_write_unlock_xattr(inode, &no_expand); sem_held = 0; ext4_journal_stop(handle); handle = NULL; ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might * still be on the orphan list; we need to * make sure the inode is removed from the * orphan list in that case. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; if (folio) block_commit_write(&folio->page, from, to); out: if (folio) { folio_unlock(folio); folio_put(folio); } out_nofolio: if (sem_held) ext4_write_unlock_xattr(inode, &no_expand); if (handle) ext4_journal_stop(handle); brelse(iloc.bh); return ret; } /* * Try to write data in the inode. * If the inode has inline data, check whether the new write can be * in the inode also. If not, create the page the handle, move the data * to the page make it update and let the later codes create extent for it. */ int ext4_try_to_write_inline_data(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, struct page **pagep) { int ret; handle_t *handle; struct folio *folio; struct ext4_iloc iloc; if (pos + len > ext4_get_max_inline_size(inode)) goto convert; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; /* * The possible write could happen in the inode, * so try to reserve the space in inode first. */ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); handle = NULL; goto out; } ret = ext4_prepare_inline_data(handle, inode, pos + len); if (ret && ret != -ENOSPC) goto out; /* We don't have space in inline inode, so convert it to extent. */ if (ret == -ENOSPC) { ext4_journal_stop(handle); brelse(iloc.bh); goto convert; } ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, EXT4_JTR_NONE); if (ret) goto out; folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) { ret = PTR_ERR(folio); goto out; } *pagep = &folio->page; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { ret = 0; folio_unlock(folio); folio_put(folio); goto out_up_read; } if (!folio_test_uptodate(folio)) { ret = ext4_read_inline_folio(inode, folio); if (ret < 0) { folio_unlock(folio); folio_put(folio); goto out_up_read; } } ret = 1; handle = NULL; out_up_read: up_read(&EXT4_I(inode)->xattr_sem); out: if (handle && (ret != 1)) ext4_journal_stop(handle); brelse(iloc.bh); return ret; convert: return ext4_convert_inline_data_to_extent(mapping, inode); } int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct folio *folio) { handle_t *handle = ext4_journal_current_handle(); int no_expand; void *kaddr; struct ext4_iloc iloc; int ret = 0, ret2; if (unlikely(copied < len) && !folio_test_uptodate(folio)) copied = 0; if (likely(copied)) { ret = ext4_get_inode_loc(inode, &iloc); if (ret) { folio_unlock(folio); folio_put(folio); ext4_std_error(inode->i_sb, ret); goto out; } ext4_write_lock_xattr(inode, &no_expand); BUG_ON(!ext4_has_inline_data(inode)); /* * ei->i_inline_off may have changed since * ext4_write_begin() called * ext4_try_to_write_inline_data() */ (void) ext4_find_inline_data_nolock(inode); kaddr = kmap_local_folio(folio, 0); ext4_write_inline_data(inode, &iloc, kaddr, pos, copied); kunmap_local(kaddr); folio_mark_uptodate(folio); /* clear dirty flag so that writepages wouldn't work for us. */ folio_clear_dirty(folio); ext4_write_unlock_xattr(inode, &no_expand); brelse(iloc.bh); /* * It's important to update i_size while still holding folio * lock: page writeout could otherwise come in and zero * beyond i_size. */ ext4_update_inode_size(inode, pos + copied); } folio_unlock(folio); folio_put(folio); /* * Don't mark the inode dirty under folio lock. First, it unnecessarily * makes the holding time of folio lock longer. Second, it forces lock * ordering of folio lock and transaction start for journaling * filesystems. */ if (likely(copied)) mark_inode_dirty(inode); out: /* * If we didn't copy as much data as expected, we need to trim back * size of xattr containing inline data. */ if (pos + len > inode->i_size && ext4_can_truncate(inode)) ext4_orphan_add(handle, inode); ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; if (pos + len > inode->i_size) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode * is removed from the orphan list in that case. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } return ret ? ret : copied; } /* * Try to make the page cache and handle ready for the inline data case. * We can call this function in 2 cases: * 1. The inode is created and the first write exceeds inline size. We can * clear the inode state safely. * 2. The inode has inline data, then we need to read the data, make it * update and dirty so that ext4_da_writepages can handle it. We don't * need to start the journal since the file's metadata isn't changed now. */ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, struct inode *inode, void **fsdata) { int ret = 0, inline_size; struct folio *folio; folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); goto out; } inline_size = ext4_get_inline_size(inode); if (!folio_test_uptodate(folio)) { ret = ext4_read_inline_folio(inode, folio); if (ret < 0) goto out; } ret = __block_write_begin(&folio->page, 0, inline_size, ext4_da_get_block_prep); if (ret) { up_read(&EXT4_I(inode)->xattr_sem); folio_unlock(folio); folio_put(folio); ext4_truncate_failed_write(inode); return ret; } folio_mark_dirty(folio); folio_mark_uptodate(folio); ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); *fsdata = (void *)CONVERT_INLINE_DATA; out: up_read(&EXT4_I(inode)->xattr_sem); if (folio) { folio_unlock(folio); folio_put(folio); } return ret; } /* * Prepare the write for the inline data. * If the data can be written into the inode, we just read * the page and make it uptodate, and start the journal. * Otherwise read the page, makes it dirty so that it can be * handle in writepages(the i_disksize update is left to the * normal ext4_da_write_end). */ int ext4_da_write_inline_data_begin(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, struct page **pagep, void **fsdata) { int ret; handle_t *handle; struct folio *folio; struct ext4_iloc iloc; int retries = 0; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; retry_journal: handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; } ret = ext4_prepare_inline_data(handle, inode, pos + len); if (ret && ret != -ENOSPC) goto out_journal; if (ret == -ENOSPC) { ext4_journal_stop(handle); ret = ext4_da_convert_inline_data_to_extent(mapping, inode, fsdata); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_journal; goto out; } /* * We cannot recurse into the filesystem as the transaction * is already started. */ folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) { ret = PTR_ERR(folio); goto out_journal; } down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { ret = 0; goto out_release_page; } if (!folio_test_uptodate(folio)) { ret = ext4_read_inline_folio(inode, folio); if (ret < 0) goto out_release_page; } ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, EXT4_JTR_NONE); if (ret) goto out_release_page; up_read(&EXT4_I(inode)->xattr_sem); *pagep = &folio->page; brelse(iloc.bh); return 1; out_release_page: up_read(&EXT4_I(inode)->xattr_sem); folio_unlock(folio); folio_put(folio); out_journal: ext4_journal_stop(handle); out: brelse(iloc.bh); return ret; } #ifdef INLINE_DIR_DEBUG void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, void *inline_start, int inline_size) { int offset; unsigned short de_len; struct ext4_dir_entry_2 *de = inline_start; void *dlimit = inline_start + inline_size; trace_printk("inode %lu\n", dir->i_ino); offset = 0; while ((void *)de < dlimit) { de_len = ext4_rec_len_from_disk(de->rec_len, inline_size); trace_printk("de: off %u rlen %u name %.*s nlen %u ino %u\n", offset, de_len, de->name_len, de->name, de->name_len, le32_to_cpu(de->inode)); if (ext4_check_dir_entry(dir, NULL, de, bh, inline_start, inline_size, offset)) BUG(); offset += de_len; de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); } } #else #define ext4_show_inline_dir(dir, bh, inline_start, inline_size) #endif /* * Add a new entry into a inline dir. * It will return -ENOSPC if no space is available, and -EIO * and -EEXIST if directory entry already exists. */ static int ext4_add_dirent_to_inline(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode, struct ext4_iloc *iloc, void *inline_start, int inline_size) { int err; struct ext4_dir_entry_2 *de; err = ext4_find_dest_de(dir, inode, iloc->bh, inline_start, inline_size, fname, &de); if (err) return err; BUFFER_TRACE(iloc->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, dir->i_sb, iloc->bh, EXT4_JTR_NONE); if (err) return err; ext4_insert_dentry(dir, inode, de, inline_size, fname); ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); /* * XXX shouldn't update any times until successful * completion of syscall, but too many callers depend * on this. * * XXX similarly, too many callers depend on * ext4_new_inode() setting the times, but error * recovery deletes the inode, so the worst that can * happen is that the times are slightly out of date * and/or different from the directory change time. */ dir->i_mtime = inode_set_ctime_current(dir); ext4_update_dx_flag(dir); inode_inc_iversion(dir); return 1; } static void *ext4_get_inline_xattr_pos(struct inode *inode, struct ext4_iloc *iloc) { struct ext4_xattr_entry *entry; struct ext4_xattr_ibody_header *header; BUG_ON(!EXT4_I(inode)->i_inline_off); header = IHDR(inode, ext4_raw_inode(iloc)); entry = (struct ext4_xattr_entry *)((void *)ext4_raw_inode(iloc) + EXT4_I(inode)->i_inline_off); return (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs); } /* Set the final de to cover the whole block. */ static void ext4_update_final_de(void *de_buf, int old_size, int new_size) { struct ext4_dir_entry_2 *de, *prev_de; void *limit; int de_len; de = de_buf; if (old_size) { limit = de_buf + old_size; do { prev_de = de; de_len = ext4_rec_len_from_disk(de->rec_len, old_size); de_buf += de_len; de = de_buf; } while (de_buf < limit); prev_de->rec_len = ext4_rec_len_to_disk(de_len + new_size - old_size, new_size); } else { /* this is just created, so create an empty entry. */ de->inode = 0; de->rec_len = ext4_rec_len_to_disk(new_size, new_size); } } static int ext4_update_inline_dir(handle_t *handle, struct inode *dir, struct ext4_iloc *iloc) { int ret; int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE; int new_size = get_max_inline_xattr_value_size(dir, iloc); if (new_size - old_size <= ext4_dir_rec_len(1, NULL)) return -ENOSPC; ret = ext4_update_inline_data(handle, dir, new_size + EXT4_MIN_INLINE_DATA_SIZE); if (ret) return ret; ext4_update_final_de(ext4_get_inline_xattr_pos(dir, iloc), old_size, EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE); dir->i_size = EXT4_I(dir)->i_disksize = EXT4_I(dir)->i_inline_size; return 0; } static void ext4_restore_inline_data(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc, void *buf, int inline_size) { int ret; ret = ext4_create_inline_data(handle, inode, inline_size); if (ret) { ext4_msg(inode->i_sb, KERN_EMERG, "error restoring inline_data for inode -- potential data loss! (inode %lu, error %d)", inode->i_ino, ret); return; } ext4_write_inline_data(inode, iloc, buf, 0, inline_size); ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); } static int ext4_finish_convert_inline_dir(handle_t *handle, struct inode *inode, struct buffer_head *dir_block, void *buf, int inline_size) { int err, csum_size = 0, header_size = 0; struct ext4_dir_entry_2 *de; void *target = dir_block->b_data; /* * First create "." and ".." and then copy the dir information * back to the block. */ de = target; de = ext4_init_dot_dotdot(inode, de, inode->i_sb->s_blocksize, csum_size, le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1); header_size = (void *)de - target; memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE, inline_size - EXT4_INLINE_DOTDOT_SIZE); if (ext4_has_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); inode->i_size = inode->i_sb->s_blocksize; i_size_write(inode, inode->i_sb->s_blocksize); EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; ext4_update_final_de(dir_block->b_data, inline_size - EXT4_INLINE_DOTDOT_SIZE + header_size, inode->i_sb->s_blocksize - csum_size); if (csum_size) ext4_initialize_dirent_tail(dir_block, inode->i_sb->s_blocksize); set_buffer_uptodate(dir_block); unlock_buffer(dir_block); err = ext4_handle_dirty_dirblock(handle, inode, dir_block); if (err) return err; set_buffer_verified(dir_block); return ext4_mark_inode_dirty(handle, inode); } static int ext4_convert_inline_data_nolock(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc) { int error; void *buf = NULL; struct buffer_head *data_bh = NULL; struct ext4_map_blocks map; int inline_size; inline_size = ext4_get_inline_size(inode); buf = kmalloc(inline_size, GFP_NOFS); if (!buf) { error = -ENOMEM; goto out; } error = ext4_read_inline_data(inode, buf, inline_size, iloc); if (error < 0) goto out; /* * Make sure the inline directory entries pass checks before we try to * convert them, so that we avoid touching stuff that needs fsck. */ if (S_ISDIR(inode->i_mode)) { error = ext4_check_all_de(inode, iloc->bh, buf + EXT4_INLINE_DOTDOT_SIZE, inline_size - EXT4_INLINE_DOTDOT_SIZE); if (error) goto out; } error = ext4_destroy_inline_data_nolock(handle, inode); if (error) goto out; map.m_lblk = 0; map.m_len = 1; map.m_flags = 0; error = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_CREATE); if (error < 0) goto out_restore; if (!(map.m_flags & EXT4_MAP_MAPPED)) { error = -EIO; goto out_restore; } data_bh = sb_getblk(inode->i_sb, map.m_pblk); if (!data_bh) { error = -ENOMEM; goto out_restore; } lock_buffer(data_bh); error = ext4_journal_get_create_access(handle, inode->i_sb, data_bh, EXT4_JTR_NONE); if (error) { unlock_buffer(data_bh); error = -EIO; goto out_restore; } memset(data_bh->b_data, 0, inode->i_sb->s_blocksize); if (!S_ISDIR(inode->i_mode)) { memcpy(data_bh->b_data, buf, inline_size); set_buffer_uptodate(data_bh); unlock_buffer(data_bh); error = ext4_handle_dirty_metadata(handle, inode, data_bh); } else { error = ext4_finish_convert_inline_dir(handle, inode, data_bh, buf, inline_size); } out_restore: if (error) ext4_restore_inline_data(handle, inode, iloc, buf, inline_size); out: brelse(data_bh); kfree(buf); return error; } /* * Try to add the new entry to the inline data. * If succeeds, return 0. If not, extended the inline dir and copied data to * the new created block. */ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode) { int ret, ret2, inline_size, no_expand; void *inline_start; struct ext4_iloc iloc; ret = ext4_get_inode_loc(dir, &iloc); if (ret) return ret; ext4_write_lock_xattr(dir, &no_expand); if (!ext4_has_inline_data(dir)) goto out; inline_start = (void *)ext4_raw_inode(&iloc)->i_block + EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; ret = ext4_add_dirent_to_inline(handle, fname, dir, inode, &iloc, inline_start, inline_size); if (ret != -ENOSPC) goto out; /* check whether it can be inserted to inline xattr space. */ inline_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE; if (!inline_size) { /* Try to use the xattr space.*/ ret = ext4_update_inline_dir(handle, dir, &iloc); if (ret && ret != -ENOSPC) goto out; inline_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE; } if (inline_size) { inline_start = ext4_get_inline_xattr_pos(dir, &iloc); ret = ext4_add_dirent_to_inline(handle, fname, dir, inode, &iloc, inline_start, inline_size); if (ret != -ENOSPC) goto out; } /* * The inline space is filled up, so create a new block for it. * As the extent tree will be created, we have to save the inline * dir first. */ ret = ext4_convert_inline_data_nolock(handle, dir, &iloc); out: ext4_write_unlock_xattr(dir, &no_expand); ret2 = ext4_mark_inode_dirty(handle, dir); if (unlikely(ret2 && !ret)) ret = ret2; brelse(iloc.bh); return ret; } /* * This function fills a red-black tree with information from an * inlined dir. It returns the number directory entries loaded * into the tree. If there is an error it is returned in err. */ int ext4_inlinedir_to_tree(struct file *dir_file, struct inode *dir, ext4_lblk_t block, struct dx_hash_info *hinfo, __u32 start_hash, __u32 start_minor_hash, int *has_inline_data) { int err = 0, count = 0; unsigned int parent_ino; int pos; struct ext4_dir_entry_2 *de; struct inode *inode = file_inode(dir_file); int ret, inline_size = 0; struct ext4_iloc iloc; void *dir_buf = NULL; struct ext4_dir_entry_2 fake; struct fscrypt_str tmp_str; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { up_read(&EXT4_I(inode)->xattr_sem); *has_inline_data = 0; goto out; } inline_size = ext4_get_inline_size(inode); dir_buf = kmalloc(inline_size, GFP_NOFS); if (!dir_buf) { ret = -ENOMEM; up_read(&EXT4_I(inode)->xattr_sem); goto out; } ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc); up_read(&EXT4_I(inode)->xattr_sem); if (ret < 0) goto out; pos = 0; parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); while (pos < inline_size) { /* * As inlined dir doesn't store any information about '.' and * only the inode number of '..' is stored, we have to handle * them differently. */ if (pos == 0) { fake.inode = cpu_to_le32(inode->i_ino); fake.name_len = 1; strcpy(fake.name, "."); fake.rec_len = ext4_rec_len_to_disk( ext4_dir_rec_len(fake.name_len, NULL), inline_size); ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); de = &fake; pos = EXT4_INLINE_DOTDOT_OFFSET; } else if (pos == EXT4_INLINE_DOTDOT_OFFSET) { fake.inode = cpu_to_le32(parent_ino); fake.name_len = 2; strcpy(fake.name, ".."); fake.rec_len = ext4_rec_len_to_disk( ext4_dir_rec_len(fake.name_len, NULL), inline_size); ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); de = &fake; pos = EXT4_INLINE_DOTDOT_SIZE; } else { de = (struct ext4_dir_entry_2 *)(dir_buf + pos); pos += ext4_rec_len_from_disk(de->rec_len, inline_size); if (ext4_check_dir_entry(inode, dir_file, de, iloc.bh, dir_buf, inline_size, pos)) { ret = count; goto out; } } if (ext4_hash_in_dirent(dir)) { hinfo->hash = EXT4_DIRENT_HASH(de); hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de); } else { ext4fs_dirhash(dir, de->name, de->name_len, hinfo); } if ((hinfo->hash < start_hash) || ((hinfo->hash == start_hash) && (hinfo->minor_hash < start_minor_hash))) continue; if (de->inode == 0) continue; tmp_str.name = de->name; tmp_str.len = de->name_len; err = ext4_htree_store_dirent(dir_file, hinfo->hash, hinfo->minor_hash, de, &tmp_str); if (err) { ret = err; goto out; } count++; } ret = count; out: kfree(dir_buf); brelse(iloc.bh); return ret; } /* * So this function is called when the volume is mkfsed with * dir_index disabled. In order to keep f_pos persistent * after we convert from an inlined dir to a blocked based, * we just pretend that we are a normal dir and return the * offset as if '.' and '..' really take place. * */ int ext4_read_inline_dir(struct file *file, struct dir_context *ctx, int *has_inline_data) { unsigned int offset, parent_ino; int i; struct ext4_dir_entry_2 *de; struct super_block *sb; struct inode *inode = file_inode(file); int ret, inline_size = 0; struct ext4_iloc iloc; void *dir_buf = NULL; int dotdot_offset, dotdot_size, extra_offset, extra_size; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { up_read(&EXT4_I(inode)->xattr_sem); *has_inline_data = 0; goto out; } inline_size = ext4_get_inline_size(inode); dir_buf = kmalloc(inline_size, GFP_NOFS); if (!dir_buf) { ret = -ENOMEM; up_read(&EXT4_I(inode)->xattr_sem); goto out; } ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc); up_read(&EXT4_I(inode)->xattr_sem); if (ret < 0) goto out; ret = 0; sb = inode->i_sb; parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); offset = ctx->pos; /* * dotdot_offset and dotdot_size is the real offset and * size for ".." and "." if the dir is block based while * the real size for them are only EXT4_INLINE_DOTDOT_SIZE. * So we will use extra_offset and extra_size to indicate them * during the inline dir iteration. */ dotdot_offset = ext4_dir_rec_len(1, NULL); dotdot_size = dotdot_offset + ext4_dir_rec_len(2, NULL); extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE; extra_size = extra_offset + inline_size; /* * If the version has changed since the last call to * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the inline * dir to make sure. */ if (!inode_eq_iversion(inode, file->f_version)) { for (i = 0; i < extra_size && i < offset;) { /* * "." is with offset 0 and * ".." is dotdot_offset. */ if (!i) { i = dotdot_offset; continue; } else if (i == dotdot_offset) { i = dotdot_size; continue; } /* for other entry, the real offset in * the buf has to be tuned accordingly. */ de = (struct ext4_dir_entry_2 *) (dir_buf + i - extra_offset); /* It's too expensive to do a full * dirent test each time round this * loop, but we do have to test at * least that it is non-zero. A * failure will be detected in the * dirent test below. */ if (ext4_rec_len_from_disk(de->rec_len, extra_size) < ext4_dir_rec_len(1, NULL)) break; i += ext4_rec_len_from_disk(de->rec_len, extra_size); } offset = i; ctx->pos = offset; file->f_version = inode_query_iversion(inode); } while (ctx->pos < extra_size) { if (ctx->pos == 0) { if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR)) goto out; ctx->pos = dotdot_offset; continue; } if (ctx->pos == dotdot_offset) { if (!dir_emit(ctx, "..", 2, parent_ino, DT_DIR)) goto out; ctx->pos = dotdot_size; continue; } de = (struct ext4_dir_entry_2 *) (dir_buf + ctx->pos - extra_offset); if (ext4_check_dir_entry(inode, file, de, iloc.bh, dir_buf, extra_size, ctx->pos)) goto out; if (le32_to_cpu(de->inode)) { if (!dir_emit(ctx, de->name, de->name_len, le32_to_cpu(de->inode), get_dtype(sb, de->file_type))) goto out; } ctx->pos += ext4_rec_len_from_disk(de->rec_len, extra_size); } out: kfree(dir_buf); brelse(iloc.bh); return ret; } void *ext4_read_inline_link(struct inode *inode) { struct ext4_iloc iloc; int ret, inline_size; void *link; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ERR_PTR(ret); ret = -ENOMEM; inline_size = ext4_get_inline_size(inode); link = kmalloc(inline_size + 1, GFP_NOFS); if (!link) goto out; ret = ext4_read_inline_data(inode, link, inline_size, &iloc); if (ret < 0) { kfree(link); goto out; } nd_terminate_link(link, inode->i_size, ret); out: if (ret < 0) link = ERR_PTR(ret); brelse(iloc.bh); return link; } struct buffer_head *ext4_get_first_inline_block(struct inode *inode, struct ext4_dir_entry_2 **parent_de, int *retval) { struct ext4_iloc iloc; *retval = ext4_get_inode_loc(inode, &iloc); if (*retval) return NULL; *parent_de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; return iloc.bh; } /* * Try to create the inline data for the new dir. * If it succeeds, return 0, otherwise return the error. * In case of ENOSPC, the caller should create the normal disk layout dir. */ int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent, struct inode *inode) { int ret, inline_size = EXT4_MIN_INLINE_DATA_SIZE; struct ext4_iloc iloc; struct ext4_dir_entry_2 *de; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; ret = ext4_prepare_inline_data(handle, inode, inline_size); if (ret) goto out; /* * For inline dir, we only save the inode information for the ".." * and create a fake dentry to cover the left space. */ de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; de->inode = cpu_to_le32(parent->i_ino); de = (struct ext4_dir_entry_2 *)((void *)de + EXT4_INLINE_DOTDOT_SIZE); de->inode = 0; de->rec_len = ext4_rec_len_to_disk( inline_size - EXT4_INLINE_DOTDOT_SIZE, inline_size); set_nlink(inode, 2); inode->i_size = EXT4_I(inode)->i_disksize = inline_size; out: brelse(iloc.bh); return ret; } struct buffer_head *ext4_find_inline_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir, int *has_inline_data) { int ret; struct ext4_iloc iloc; void *inline_start; int inline_size; if (ext4_get_inode_loc(dir, &iloc)) return NULL; down_read(&EXT4_I(dir)->xattr_sem); if (!ext4_has_inline_data(dir)) { *has_inline_data = 0; goto out; } inline_start = (void *)ext4_raw_inode(&iloc)->i_block + EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; ret = ext4_search_dir(iloc.bh, inline_start, inline_size, dir, fname, 0, res_dir); if (ret == 1) goto out_find; if (ret < 0) goto out; if (ext4_get_inline_size(dir) == EXT4_MIN_INLINE_DATA_SIZE) goto out; inline_start = ext4_get_inline_xattr_pos(dir, &iloc); inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE; ret = ext4_search_dir(iloc.bh, inline_start, inline_size, dir, fname, 0, res_dir); if (ret == 1) goto out_find; out: brelse(iloc.bh); iloc.bh = NULL; out_find: up_read(&EXT4_I(dir)->xattr_sem); return iloc.bh; } int ext4_delete_inline_entry(handle_t *handle, struct inode *dir, struct ext4_dir_entry_2 *de_del, struct buffer_head *bh, int *has_inline_data) { int err, inline_size, no_expand; struct ext4_iloc iloc; void *inline_start; err = ext4_get_inode_loc(dir, &iloc); if (err) return err; ext4_write_lock_xattr(dir, &no_expand); if (!ext4_has_inline_data(dir)) { *has_inline_data = 0; goto out; } if ((void *)de_del - ((void *)ext4_raw_inode(&iloc)->i_block) < EXT4_MIN_INLINE_DATA_SIZE) { inline_start = (void *)ext4_raw_inode(&iloc)->i_block + EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; } else { inline_start = ext4_get_inline_xattr_pos(dir, &iloc); inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE; } BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, dir->i_sb, bh, EXT4_JTR_NONE); if (err) goto out; err = ext4_generic_delete_entry(dir, de_del, bh, inline_start, inline_size, 0); if (err) goto out; ext4_show_inline_dir(dir, iloc.bh, inline_start, inline_size); out: ext4_write_unlock_xattr(dir, &no_expand); if (likely(err == 0)) err = ext4_mark_inode_dirty(handle, dir); brelse(iloc.bh); if (err != -ENOENT) ext4_std_error(dir->i_sb, err); return err; } /* * Get the inline dentry at offset. */ static inline struct ext4_dir_entry_2 * ext4_get_inline_entry(struct inode *inode, struct ext4_iloc *iloc, unsigned int offset, void **inline_start, int *inline_size) { void *inline_pos; BUG_ON(offset > ext4_get_inline_size(inode)); if (offset < EXT4_MIN_INLINE_DATA_SIZE) { inline_pos = (void *)ext4_raw_inode(iloc)->i_block; *inline_size = EXT4_MIN_INLINE_DATA_SIZE; } else { inline_pos = ext4_get_inline_xattr_pos(inode, iloc); offset -= EXT4_MIN_INLINE_DATA_SIZE; *inline_size = ext4_get_inline_size(inode) - EXT4_MIN_INLINE_DATA_SIZE; } if (inline_start) *inline_start = inline_pos; return (struct ext4_dir_entry_2 *)(inline_pos + offset); } bool empty_inline_dir(struct inode *dir, int *has_inline_data) { int err, inline_size; struct ext4_iloc iloc; size_t inline_len; void *inline_pos; unsigned int offset; struct ext4_dir_entry_2 *de; bool ret = false; err = ext4_get_inode_loc(dir, &iloc); if (err) { EXT4_ERROR_INODE_ERR(dir, -err, "error %d getting inode %lu block", err, dir->i_ino); return false; } down_read(&EXT4_I(dir)->xattr_sem); if (!ext4_has_inline_data(dir)) { *has_inline_data = 0; ret = true; goto out; } de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; if (!le32_to_cpu(de->inode)) { ext4_warning(dir->i_sb, "bad inline directory (dir #%lu) - no `..'", dir->i_ino); goto out; } inline_len = ext4_get_inline_size(dir); offset = EXT4_INLINE_DOTDOT_SIZE; while (offset < inline_len) { de = ext4_get_inline_entry(dir, &iloc, offset, &inline_pos, &inline_size); if (ext4_check_dir_entry(dir, NULL, de, iloc.bh, inline_pos, inline_size, offset)) { ext4_warning(dir->i_sb, "bad inline directory (dir #%lu) - " "inode %u, rec_len %u, name_len %d" "inline size %d", dir->i_ino, le32_to_cpu(de->inode), le16_to_cpu(de->rec_len), de->name_len, inline_size); goto out; } if (le32_to_cpu(de->inode)) { goto out; } offset += ext4_rec_len_from_disk(de->rec_len, inline_size); } ret = true; out: up_read(&EXT4_I(dir)->xattr_sem); brelse(iloc.bh); return ret; } int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) { int ret, no_expand; ext4_write_lock_xattr(inode, &no_expand); ret = ext4_destroy_inline_data_nolock(handle, inode); ext4_write_unlock_xattr(inode, &no_expand); return ret; } int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap) { __u64 addr; int error = -EAGAIN; struct ext4_iloc iloc; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) goto out; error = ext4_get_inode_loc(inode, &iloc); if (error) goto out; addr = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits; addr += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data; addr += offsetof(struct ext4_inode, i_block); brelse(iloc.bh); iomap->addr = addr; iomap->offset = 0; iomap->length = min_t(loff_t, ext4_get_inline_size(inode), i_size_read(inode)); iomap->type = IOMAP_INLINE; iomap->flags = 0; out: up_read(&EXT4_I(inode)->xattr_sem); return error; } int ext4_inline_data_truncate(struct inode *inode, int *has_inline) { handle_t *handle; int inline_size, value_len, needed_blocks, no_expand, err = 0; size_t i_size; void *value = NULL; struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, }; struct ext4_xattr_info i = { .name_index = EXT4_XATTR_INDEX_SYSTEM, .name = EXT4_XATTR_SYSTEM_DATA, }; needed_blocks = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, EXT4_HT_INODE, needed_blocks); if (IS_ERR(handle)) return PTR_ERR(handle); ext4_write_lock_xattr(inode, &no_expand); if (!ext4_has_inline_data(inode)) { ext4_write_unlock_xattr(inode, &no_expand); *has_inline = 0; ext4_journal_stop(handle); return 0; } if ((err = ext4_orphan_add(handle, inode)) != 0) goto out; if ((err = ext4_get_inode_loc(inode, &is.iloc)) != 0) goto out; down_write(&EXT4_I(inode)->i_data_sem); i_size = inode->i_size; inline_size = ext4_get_inline_size(inode); EXT4_I(inode)->i_disksize = i_size; if (i_size < inline_size) { /* * if there's inline data to truncate and this file was * converted to extents after that inline data was written, * the extent status cache must be cleared to avoid leaving * behind stale delayed allocated extent entries */ if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); /* Clear the content in the xattr space. */ if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) { if ((err = ext4_xattr_ibody_find(inode, &i, &is)) != 0) goto out_error; BUG_ON(is.s.not_found); value_len = le32_to_cpu(is.s.here->e_value_size); value = kmalloc(value_len, GFP_NOFS); if (!value) { err = -ENOMEM; goto out_error; } err = ext4_xattr_ibody_get(inode, i.name_index, i.name, value, value_len); if (err <= 0) goto out_error; i.value = value; i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ? i_size - EXT4_MIN_INLINE_DATA_SIZE : 0; err = ext4_xattr_ibody_set(handle, inode, &i, &is); if (err) goto out_error; } /* Clear the content within i_blocks. */ if (i_size < EXT4_MIN_INLINE_DATA_SIZE) { void *p = (void *) ext4_raw_inode(&is.iloc)->i_block; memset(p + i_size, 0, EXT4_MIN_INLINE_DATA_SIZE - i_size); } EXT4_I(inode)->i_inline_size = i_size < EXT4_MIN_INLINE_DATA_SIZE ? EXT4_MIN_INLINE_DATA_SIZE : i_size; } out_error: up_write(&EXT4_I(inode)->i_data_sem); out: brelse(is.iloc.bh); ext4_write_unlock_xattr(inode, &no_expand); kfree(value); if (inode->i_nlink) ext4_orphan_del(handle, inode); if (err == 0) { inode->i_mtime = inode_set_ctime_current(inode); err = ext4_mark_inode_dirty(handle, inode); if (IS_SYNC(inode)) ext4_handle_sync(handle); } ext4_journal_stop(handle); return err; } int ext4_convert_inline_data(struct inode *inode) { int error, needed_blocks, no_expand; handle_t *handle; struct ext4_iloc iloc; if (!ext4_has_inline_data(inode)) { ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); return 0; } else if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { /* * Inode has inline data but EXT4_STATE_MAY_INLINE_DATA is * cleared. This means we are in the middle of moving of * inline data to delay allocated block. Just force writeout * here to finish conversion. */ error = filemap_flush(inode->i_mapping); if (error) return error; if (!ext4_has_inline_data(inode)) return 0; } needed_blocks = ext4_writepage_trans_blocks(inode); iloc.bh = NULL; error = ext4_get_inode_loc(inode, &iloc); if (error) return error; handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto out_free; } ext4_write_lock_xattr(inode, &no_expand); if (ext4_has_inline_data(inode)) error = ext4_convert_inline_data_nolock(handle, inode, &iloc); ext4_write_unlock_xattr(inode, &no_expand); ext4_journal_stop(handle); out_free: brelse(iloc.bh); return error; }
2 1 1 2 1 14 10 2 14 2 2 1 2 1 2 8 7 7 6 5 4 3 6 4 2 1 4 11 16 9 8 6 5 5 1 11 11 1 10 5 181 98 84 4 78 180 35 143 180 4 3 2 2 1 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 1 1 4 3 3 1 3 1 3 2 2 1 2 1 2 1 3 3 2 6 5 2 1 16 3 2 1 1 9 5 4 3 2 5 4 2 1 1 4 3 2 2 3 2 1 3 2 1 1 5 4 3 2 1 1 1 2 18 17 15 5 5 4 4 31 2 1 4 2 2 1 2 1 3 2 1 3 2 1 4 3 2 2 2 26 26 4 3 4 3 2 1 2 1 1 1 179 180 37 181 181 179 178 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 BSD socket options interface * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Based on linux/net/ipv4/ip_sockglue.c * * FIXME: Make the setsockopt code POSIX compliant: That is * * o Truncate getsockopt returns * o Return an optlen of the truncated length if need be * * Changes: * David L Stevens <dlstevens@us.ibm.com>: * - added multicast source filtering API for MLDv2 */ #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/mroute6.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/init.h> #include <linux/sysctl.h> #include <linux/netfilter.h> #include <linux/slab.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <net/protocol.h> #include <net/transp_v6.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/inet_common.h> #include <net/tcp.h> #include <net/udp.h> #include <net/udplite.h> #include <net/xfrm.h> #include <net/compat.h> #include <net/seg6.h> #include <linux/uaccess.h> struct ip6_ra_chain *ip6_ra_chain; DEFINE_RWLOCK(ip6_ra_lock); DEFINE_STATIC_KEY_FALSE(ip6_min_hopcount); int ip6_ra_control(struct sock *sk, int sel) { struct ip6_ra_chain *ra, *new_ra, **rap; /* RA packet may be delivered ONLY to IPPROTO_RAW socket */ if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_RAW) return -ENOPROTOOPT; new_ra = (sel >= 0) ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; if (sel >= 0 && !new_ra) return -ENOMEM; write_lock_bh(&ip6_ra_lock); for (rap = &ip6_ra_chain; (ra = *rap) != NULL; rap = &ra->next) { if (ra->sk == sk) { if (sel >= 0) { write_unlock_bh(&ip6_ra_lock); kfree(new_ra); return -EADDRINUSE; } *rap = ra->next; write_unlock_bh(&ip6_ra_lock); sock_put(sk); kfree(ra); return 0; } } if (!new_ra) { write_unlock_bh(&ip6_ra_lock); return -ENOBUFS; } new_ra->sk = sk; new_ra->sel = sel; new_ra->next = ra; *rap = new_ra; sock_hold(sk); write_unlock_bh(&ip6_ra_lock); return 0; } struct ipv6_txoptions *ipv6_update_options(struct sock *sk, struct ipv6_txoptions *opt) { if (inet_test_bit(IS_ICSK, sk)) { if (opt && !((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && inet_sk(sk)->inet_daddr != LOOPBACK4_IPV6) { struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen; icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); } } opt = xchg((__force struct ipv6_txoptions **)&inet6_sk(sk)->opt, opt); sk_dst_reset(sk); return opt; } static bool setsockopt_needs_rtnl(int optname) { switch (optname) { case IPV6_ADDRFORM: case IPV6_ADD_MEMBERSHIP: case IPV6_DROP_MEMBERSHIP: case IPV6_JOIN_ANYCAST: case IPV6_LEAVE_ANYCAST: case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: case MCAST_MSFILTER: return true; } return false; } static int copy_group_source_from_sockptr(struct group_source_req *greqs, sockptr_t optval, int optlen) { if (in_compat_syscall()) { struct compat_group_source_req gr32; if (optlen < sizeof(gr32)) return -EINVAL; if (copy_from_sockptr(&gr32, optval, sizeof(gr32))) return -EFAULT; greqs->gsr_interface = gr32.gsr_interface; greqs->gsr_group = gr32.gsr_group; greqs->gsr_source = gr32.gsr_source; } else { if (optlen < sizeof(*greqs)) return -EINVAL; if (copy_from_sockptr(greqs, optval, sizeof(*greqs))) return -EFAULT; } return 0; } static int do_ipv6_mcast_group_source(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct group_source_req greqs; int omode, add; int ret; ret = copy_group_source_from_sockptr(&greqs, optval, optlen); if (ret) return ret; if (greqs.gsr_group.ss_family != AF_INET6 || greqs.gsr_source.ss_family != AF_INET6) return -EADDRNOTAVAIL; if (optname == MCAST_BLOCK_SOURCE) { omode = MCAST_EXCLUDE; add = 1; } else if (optname == MCAST_UNBLOCK_SOURCE) { omode = MCAST_EXCLUDE; add = 0; } else if (optname == MCAST_JOIN_SOURCE_GROUP) { struct sockaddr_in6 *psin6; int retv; psin6 = (struct sockaddr_in6 *)&greqs.gsr_group; retv = ipv6_sock_mc_join_ssm(sk, greqs.gsr_interface, &psin6->sin6_addr, MCAST_INCLUDE); /* prior join w/ different source is ok */ if (retv && retv != -EADDRINUSE) return retv; omode = MCAST_INCLUDE; add = 1; } else /* MCAST_LEAVE_SOURCE_GROUP */ { omode = MCAST_INCLUDE; add = 0; } return ip6_mc_source(add, omode, sk, &greqs); } static int ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) { struct group_filter *gsf; int ret; if (optlen < GROUP_FILTER_SIZE(0)) return -EINVAL; if (optlen > READ_ONCE(sysctl_optmem_max)) return -ENOBUFS; gsf = memdup_sockptr(optval, optlen); if (IS_ERR(gsf)) return PTR_ERR(gsf); /* numsrc >= (4G-140)/128 overflow in 32 bits */ ret = -ENOBUFS; if (gsf->gf_numsrc >= 0x1ffffffU || gsf->gf_numsrc > sysctl_mld_max_msf) goto out_free_gsf; ret = -EINVAL; if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) goto out_free_gsf; ret = ip6_mc_msfilter(sk, gsf, gsf->gf_slist_flex); out_free_gsf: kfree(gsf); return ret; } static int compat_ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) { const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); struct compat_group_filter *gf32; void *p; int ret; int n; if (optlen < size0) return -EINVAL; if (optlen > READ_ONCE(sysctl_optmem_max) - 4) return -ENOBUFS; p = kmalloc(optlen + 4, GFP_KERNEL); if (!p) return -ENOMEM; gf32 = p + 4; /* we want ->gf_group and ->gf_slist_flex aligned */ ret = -EFAULT; if (copy_from_sockptr(gf32, optval, optlen)) goto out_free_p; /* numsrc >= (4G-140)/128 overflow in 32 bits */ ret = -ENOBUFS; n = gf32->gf_numsrc; if (n >= 0x1ffffffU || n > sysctl_mld_max_msf) goto out_free_p; ret = -EINVAL; if (offsetof(struct compat_group_filter, gf_slist_flex[n]) > optlen) goto out_free_p; ret = ip6_mc_msfilter(sk, &(struct group_filter){ .gf_interface = gf32->gf_interface, .gf_group = gf32->gf_group, .gf_fmode = gf32->gf_fmode, .gf_numsrc = gf32->gf_numsrc}, gf32->gf_slist_flex); out_free_p: kfree(p); return ret; } static int ipv6_mcast_join_leave(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct sockaddr_in6 *psin6; struct group_req greq; if (optlen < sizeof(greq)) return -EINVAL; if (copy_from_sockptr(&greq, optval, sizeof(greq))) return -EFAULT; if (greq.gr_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; psin6 = (struct sockaddr_in6 *)&greq.gr_group; if (optname == MCAST_JOIN_GROUP) return ipv6_sock_mc_join(sk, greq.gr_interface, &psin6->sin6_addr); return ipv6_sock_mc_drop(sk, greq.gr_interface, &psin6->sin6_addr); } static int compat_ipv6_mcast_join_leave(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct compat_group_req gr32; struct sockaddr_in6 *psin6; if (optlen < sizeof(gr32)) return -EINVAL; if (copy_from_sockptr(&gr32, optval, sizeof(gr32))) return -EFAULT; if (gr32.gr_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; psin6 = (struct sockaddr_in6 *)&gr32.gr_group; if (optname == MCAST_JOIN_GROUP) return ipv6_sock_mc_join(sk, gr32.gr_interface, &psin6->sin6_addr); return ipv6_sock_mc_drop(sk, gr32.gr_interface, &psin6->sin6_addr); } static int ipv6_set_opt_hdr(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_opt_hdr *new = NULL; struct net *net = sock_net(sk); struct ipv6_txoptions *opt; int err; /* hop-by-hop / destination options are privileged option */ if (optname != IPV6_RTHDR && !sockopt_ns_capable(net->user_ns, CAP_NET_RAW)) return -EPERM; /* remove any sticky options header with a zero option * length, per RFC3542. */ if (optlen > 0) { if (sockptr_is_null(optval)) return -EINVAL; if (optlen < sizeof(struct ipv6_opt_hdr) || optlen & 0x7 || optlen > 8 * 255) return -EINVAL; new = memdup_sockptr(optval, optlen); if (IS_ERR(new)) return PTR_ERR(new); if (unlikely(ipv6_optlen(new) > optlen)) { kfree(new); return -EINVAL; } } opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); opt = ipv6_renew_options(sk, opt, optname, new); kfree(new); if (IS_ERR(opt)) return PTR_ERR(opt); /* routing header option needs extra check */ err = -EINVAL; if (optname == IPV6_RTHDR && opt && opt->srcrt) { struct ipv6_rt_hdr *rthdr = opt->srcrt; switch (rthdr->type) { #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: if (rthdr->hdrlen != 2 || rthdr->segments_left != 1) goto sticky_done; break; #endif case IPV6_SRCRT_TYPE_4: { struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)opt->srcrt; if (!seg6_validate_srh(srh, optlen, false)) goto sticky_done; break; } default: goto sticky_done; } } err = 0; opt = ipv6_update_options(sk, opt); sticky_done: if (opt) { atomic_sub(opt->tot_len, &sk->sk_omem_alloc); txopt_put(opt); } return err; } int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); int val, valbool; int retv = -ENOPROTOOPT; bool needs_rtnl = setsockopt_needs_rtnl(optname); if (sockptr_is_null(optval)) val = 0; else { if (optlen >= sizeof(int)) { if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; } else val = 0; } valbool = (val != 0); if (ip6_mroute_opt(optname)) return ip6_mroute_setsockopt(sk, optname, optval, optlen); if (needs_rtnl) rtnl_lock(); sockopt_lock_sock(sk); /* Another thread has converted the socket into IPv4 with * IPV6_ADDRFORM concurrently. */ if (unlikely(sk->sk_family != AF_INET6)) goto unlock; switch (optname) { case IPV6_ADDRFORM: if (optlen < sizeof(int)) goto e_inval; if (val == PF_INET) { if (sk->sk_type == SOCK_RAW) break; if (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_UDPLITE) { struct udp_sock *up = udp_sk(sk); if (up->pending == AF_INET6) { retv = -EBUSY; break; } } else if (sk->sk_protocol == IPPROTO_TCP) { if (sk->sk_prot != &tcpv6_prot) { retv = -EBUSY; break; } } else { break; } if (sk->sk_state != TCP_ESTABLISHED) { retv = -ENOTCONN; break; } if (ipv6_only_sock(sk) || !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) { retv = -EADDRNOTAVAIL; break; } __ipv6_sock_mc_close(sk); __ipv6_sock_ac_close(sk); if (sk->sk_protocol == IPPROTO_TCP) { struct inet_connection_sock *icsk = inet_csk(sk); sock_prot_inuse_add(net, sk->sk_prot, -1); sock_prot_inuse_add(net, &tcp_prot, 1); /* Paired with READ_ONCE(sk->sk_prot) in inet6_stream_ops */ WRITE_ONCE(sk->sk_prot, &tcp_prot); /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */ WRITE_ONCE(icsk->icsk_af_ops, &ipv4_specific); WRITE_ONCE(sk->sk_socket->ops, &inet_stream_ops); WRITE_ONCE(sk->sk_family, PF_INET); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } else { struct proto *prot = &udp_prot; if (sk->sk_protocol == IPPROTO_UDPLITE) prot = &udplite_prot; sock_prot_inuse_add(net, sk->sk_prot, -1); sock_prot_inuse_add(net, prot, 1); /* Paired with READ_ONCE(sk->sk_prot) in inet6_dgram_ops */ WRITE_ONCE(sk->sk_prot, prot); WRITE_ONCE(sk->sk_socket->ops, &inet_dgram_ops); WRITE_ONCE(sk->sk_family, PF_INET); } /* Disable all options not to allocate memory anymore, * but there is still a race. See the lockless path * in udpv6_sendmsg() and ipv6_local_rxpmtu(). */ np->rxopt.all = 0; inet6_cleanup_sock(sk); module_put(THIS_MODULE); retv = 0; break; } goto e_inval; case IPV6_V6ONLY: if (optlen < sizeof(int) || inet_sk(sk)->inet_num) goto e_inval; sk->sk_ipv6only = valbool; retv = 0; break; case IPV6_RECVPKTINFO: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxinfo = valbool; retv = 0; break; case IPV6_2292PKTINFO: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxoinfo = valbool; retv = 0; break; case IPV6_RECVHOPLIMIT: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxhlim = valbool; retv = 0; break; case IPV6_2292HOPLIMIT: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxohlim = valbool; retv = 0; break; case IPV6_RECVRTHDR: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.srcrt = valbool; retv = 0; break; case IPV6_2292RTHDR: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.osrcrt = valbool; retv = 0; break; case IPV6_RECVHOPOPTS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.hopopts = valbool; retv = 0; break; case IPV6_2292HOPOPTS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.ohopopts = valbool; retv = 0; break; case IPV6_RECVDSTOPTS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.dstopts = valbool; retv = 0; break; case IPV6_2292DSTOPTS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.odstopts = valbool; retv = 0; break; case IPV6_TCLASS: if (optlen < sizeof(int)) goto e_inval; if (val < -1 || val > 0xff) goto e_inval; /* RFC 3542, 6.5: default traffic class of 0x0 */ if (val == -1) val = 0; if (sk->sk_type == SOCK_STREAM) { val &= ~INET_ECN_MASK; val |= np->tclass & INET_ECN_MASK; } if (np->tclass != val) { np->tclass = val; sk_dst_reset(sk); } retv = 0; break; case IPV6_RECVTCLASS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxtclass = valbool; retv = 0; break; case IPV6_FLOWINFO: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxflow = valbool; retv = 0; break; case IPV6_RECVPATHMTU: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxpmtu = valbool; retv = 0; break; case IPV6_TRANSPARENT: if (valbool && !sockopt_ns_capable(net->user_ns, CAP_NET_RAW) && !sockopt_ns_capable(net->user_ns, CAP_NET_ADMIN)) { retv = -EPERM; break; } if (optlen < sizeof(int)) goto e_inval; /* we don't have a separate transparent bit for IPV6 we use the one in the IPv4 socket */ inet_assign_bit(TRANSPARENT, sk, valbool); retv = 0; break; case IPV6_FREEBIND: if (optlen < sizeof(int)) goto e_inval; /* we also don't have a separate freebind bit for IPV6 */ inet_assign_bit(FREEBIND, sk, valbool); retv = 0; break; case IPV6_RECVORIGDSTADDR: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxorigdstaddr = valbool; retv = 0; break; case IPV6_HOPOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: retv = ipv6_set_opt_hdr(sk, optname, optval, optlen); break; case IPV6_PKTINFO: { struct in6_pktinfo pkt; if (optlen == 0) goto e_inval; else if (optlen < sizeof(struct in6_pktinfo) || sockptr_is_null(optval)) goto e_inval; if (copy_from_sockptr(&pkt, optval, sizeof(pkt))) { retv = -EFAULT; break; } if (!sk_dev_equal_l3scope(sk, pkt.ipi6_ifindex)) goto e_inval; np->sticky_pktinfo.ipi6_ifindex = pkt.ipi6_ifindex; np->sticky_pktinfo.ipi6_addr = pkt.ipi6_addr; retv = 0; break; } case IPV6_2292PKTOPTIONS: { struct ipv6_txoptions *opt = NULL; struct msghdr msg; struct flowi6 fl6; struct ipcm6_cookie ipc6; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.flowi6_mark = sk->sk_mark; if (optlen == 0) goto update; /* 1K is probably excessive * 1K is surely not enough, 2K per standard header is 16K. */ retv = -EINVAL; if (optlen > 64*1024) break; opt = sock_kmalloc(sk, sizeof(*opt) + optlen, GFP_KERNEL); retv = -ENOBUFS; if (!opt) break; memset(opt, 0, sizeof(*opt)); refcount_set(&opt->refcnt, 1); opt->tot_len = sizeof(*opt) + optlen; retv = -EFAULT; if (copy_from_sockptr(opt + 1, optval, optlen)) goto done; msg.msg_controllen = optlen; msg.msg_control_is_user = false; msg.msg_control = (void *)(opt+1); ipc6.opt = opt; retv = ip6_datagram_send_ctl(net, sk, &msg, &fl6, &ipc6); if (retv) goto done; update: retv = 0; opt = ipv6_update_options(sk, opt); done: if (opt) { atomic_sub(opt->tot_len, &sk->sk_omem_alloc); txopt_put(opt); } break; } case IPV6_UNICAST_HOPS: if (optlen < sizeof(int)) goto e_inval; if (val > 255 || val < -1) goto e_inval; np->hop_limit = val; retv = 0; break; case IPV6_MULTICAST_HOPS: if (sk->sk_type == SOCK_STREAM) break; if (optlen < sizeof(int)) goto e_inval; if (val > 255 || val < -1) goto e_inval; np->mcast_hops = (val == -1 ? IPV6_DEFAULT_MCASTHOPS : val); retv = 0; break; case IPV6_MULTICAST_LOOP: if (optlen < sizeof(int)) goto e_inval; if (val != valbool) goto e_inval; np->mc_loop = valbool; retv = 0; break; case IPV6_UNICAST_IF: { struct net_device *dev = NULL; int ifindex; if (optlen != sizeof(int)) goto e_inval; ifindex = (__force int)ntohl((__force __be32)val); if (ifindex == 0) { np->ucast_oif = 0; retv = 0; break; } dev = dev_get_by_index(net, ifindex); retv = -EADDRNOTAVAIL; if (!dev) break; dev_put(dev); retv = -EINVAL; if (sk->sk_bound_dev_if) break; np->ucast_oif = ifindex; retv = 0; break; } case IPV6_MULTICAST_IF: if (sk->sk_type == SOCK_STREAM) break; if (optlen < sizeof(int)) goto e_inval; if (val) { struct net_device *dev; int midx; rcu_read_lock(); dev = dev_get_by_index_rcu(net, val); if (!dev) { rcu_read_unlock(); retv = -ENODEV; break; } midx = l3mdev_master_ifindex_rcu(dev); rcu_read_unlock(); if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != val && (!midx || midx != sk->sk_bound_dev_if)) goto e_inval; } np->mcast_oif = val; retv = 0; break; case IPV6_ADD_MEMBERSHIP: case IPV6_DROP_MEMBERSHIP: { struct ipv6_mreq mreq; if (optlen < sizeof(struct ipv6_mreq)) goto e_inval; retv = -EPROTO; if (inet_test_bit(IS_ICSK, sk)) break; retv = -EFAULT; if (copy_from_sockptr(&mreq, optval, sizeof(struct ipv6_mreq))) break; if (optname == IPV6_ADD_MEMBERSHIP) retv = ipv6_sock_mc_join(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_multiaddr); else retv = ipv6_sock_mc_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_multiaddr); break; } case IPV6_JOIN_ANYCAST: case IPV6_LEAVE_ANYCAST: { struct ipv6_mreq mreq; if (optlen < sizeof(struct ipv6_mreq)) goto e_inval; retv = -EFAULT; if (copy_from_sockptr(&mreq, optval, sizeof(struct ipv6_mreq))) break; if (optname == IPV6_JOIN_ANYCAST) retv = ipv6_sock_ac_join(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr); else retv = ipv6_sock_ac_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr); break; } case IPV6_MULTICAST_ALL: if (optlen < sizeof(int)) goto e_inval; np->mc_all = valbool; retv = 0; break; case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: if (in_compat_syscall()) retv = compat_ipv6_mcast_join_leave(sk, optname, optval, optlen); else retv = ipv6_mcast_join_leave(sk, optname, optval, optlen); break; case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: retv = do_ipv6_mcast_group_source(sk, optname, optval, optlen); break; case MCAST_MSFILTER: if (in_compat_syscall()) retv = compat_ipv6_set_mcast_msfilter(sk, optval, optlen); else retv = ipv6_set_mcast_msfilter(sk, optval, optlen); break; case IPV6_ROUTER_ALERT: if (optlen < sizeof(int)) goto e_inval; retv = ip6_ra_control(sk, val); break; case IPV6_ROUTER_ALERT_ISOLATE: if (optlen < sizeof(int)) goto e_inval; np->rtalert_isolate = valbool; retv = 0; break; case IPV6_MTU_DISCOVER: if (optlen < sizeof(int)) goto e_inval; if (val < IPV6_PMTUDISC_DONT || val > IPV6_PMTUDISC_OMIT) goto e_inval; np->pmtudisc = val; retv = 0; break; case IPV6_MTU: if (optlen < sizeof(int)) goto e_inval; if (val && val < IPV6_MIN_MTU) goto e_inval; np->frag_size = val; retv = 0; break; case IPV6_RECVERR: if (optlen < sizeof(int)) goto e_inval; np->recverr = valbool; if (!val) skb_errqueue_purge(&sk->sk_error_queue); retv = 0; break; case IPV6_FLOWINFO_SEND: if (optlen < sizeof(int)) goto e_inval; np->sndflow = valbool; retv = 0; break; case IPV6_FLOWLABEL_MGR: retv = ipv6_flowlabel_opt(sk, optval, optlen); break; case IPV6_IPSEC_POLICY: case IPV6_XFRM_POLICY: retv = -EPERM; if (!sockopt_ns_capable(net->user_ns, CAP_NET_ADMIN)) break; retv = xfrm_user_policy(sk, optname, optval, optlen); break; case IPV6_ADDR_PREFERENCES: if (optlen < sizeof(int)) goto e_inval; retv = __ip6_sock_set_addr_preferences(sk, val); break; case IPV6_MINHOPCOUNT: if (optlen < sizeof(int)) goto e_inval; if (val < 0 || val > 255) goto e_inval; if (val) static_branch_enable(&ip6_min_hopcount); /* tcp_v6_err() and tcp_v6_rcv() might read min_hopcount * while we are changing it. */ WRITE_ONCE(np->min_hopcount, val); retv = 0; break; case IPV6_DONTFRAG: np->dontfrag = valbool; retv = 0; break; case IPV6_AUTOFLOWLABEL: np->autoflowlabel = valbool; np->autoflowlabel_set = 1; retv = 0; break; case IPV6_RECVFRAGSIZE: np->rxopt.bits.recvfragsize = valbool; retv = 0; break; case IPV6_RECVERR_RFC4884: if (optlen < sizeof(int)) goto e_inval; if (val < 0 || val > 1) goto e_inval; np->recverr_rfc4884 = valbool; retv = 0; break; } unlock: sockopt_release_sock(sk); if (needs_rtnl) rtnl_unlock(); return retv; e_inval: retv = -EINVAL; goto unlock; } int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { int err; if (level == SOL_IP && sk->sk_type != SOCK_RAW) return udp_prot.setsockopt(sk, level, optname, optval, optlen); if (level != SOL_IPV6) return -ENOPROTOOPT; err = do_ipv6_setsockopt(sk, level, optname, optval, optlen); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IPV6_IPSEC_POLICY && optname != IPV6_XFRM_POLICY) err = nf_setsockopt(sk, PF_INET6, optname, optval, optlen); #endif return err; } EXPORT_SYMBOL(ipv6_setsockopt); static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt, int optname, sockptr_t optval, int len) { struct ipv6_opt_hdr *hdr; if (!opt) return 0; switch (optname) { case IPV6_HOPOPTS: hdr = opt->hopopt; break; case IPV6_RTHDRDSTOPTS: hdr = opt->dst0opt; break; case IPV6_RTHDR: hdr = (struct ipv6_opt_hdr *)opt->srcrt; break; case IPV6_DSTOPTS: hdr = opt->dst1opt; break; default: return -EINVAL; /* should not happen */ } if (!hdr) return 0; len = min_t(unsigned int, len, ipv6_optlen(hdr)); if (copy_to_sockptr(optval, hdr, len)) return -EFAULT; return len; } static int ipv6_get_msfilter(struct sock *sk, sockptr_t optval, sockptr_t optlen, int len) { const int size0 = offsetof(struct group_filter, gf_slist_flex); struct group_filter gsf; int num; int err; if (len < size0) return -EINVAL; if (copy_from_sockptr(&gsf, optval, size0)) return -EFAULT; if (gsf.gf_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; num = gsf.gf_numsrc; sockopt_lock_sock(sk); err = ip6_mc_msfget(sk, &gsf, optval, size0); if (!err) { if (num > gsf.gf_numsrc) num = gsf.gf_numsrc; len = GROUP_FILTER_SIZE(num); if (copy_to_sockptr(optlen, &len, sizeof(int)) || copy_to_sockptr(optval, &gsf, size0)) err = -EFAULT; } sockopt_release_sock(sk); return err; } static int compat_ipv6_get_msfilter(struct sock *sk, sockptr_t optval, sockptr_t optlen, int len) { const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); struct compat_group_filter gf32; struct group_filter gf; int err; int num; if (len < size0) return -EINVAL; if (copy_from_sockptr(&gf32, optval, size0)) return -EFAULT; gf.gf_interface = gf32.gf_interface; gf.gf_fmode = gf32.gf_fmode; num = gf.gf_numsrc = gf32.gf_numsrc; gf.gf_group = gf32.gf_group; if (gf.gf_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; sockopt_lock_sock(sk); err = ip6_mc_msfget(sk, &gf, optval, size0); sockopt_release_sock(sk); if (err) return err; if (num > gf.gf_numsrc) num = gf.gf_numsrc; len = GROUP_FILTER_SIZE(num) - (sizeof(gf)-sizeof(gf32)); if (copy_to_sockptr(optlen, &len, sizeof(int)) || copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_fmode), &gf.gf_fmode, sizeof(gf32.gf_fmode)) || copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_numsrc), &gf.gf_numsrc, sizeof(gf32.gf_numsrc))) return -EFAULT; return 0; } int do_ipv6_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen) { struct ipv6_pinfo *np = inet6_sk(sk); int len; int val; if (ip6_mroute_opt(optname)) return ip6_mroute_getsockopt(sk, optname, optval, optlen); if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; switch (optname) { case IPV6_ADDRFORM: if (sk->sk_protocol != IPPROTO_UDP && sk->sk_protocol != IPPROTO_UDPLITE && sk->sk_protocol != IPPROTO_TCP) return -ENOPROTOOPT; if (sk->sk_state != TCP_ESTABLISHED) return -ENOTCONN; val = sk->sk_family; break; case MCAST_MSFILTER: if (in_compat_syscall()) return compat_ipv6_get_msfilter(sk, optval, optlen, len); return ipv6_get_msfilter(sk, optval, optlen, len); case IPV6_2292PKTOPTIONS: { struct msghdr msg; struct sk_buff *skb; if (sk->sk_type != SOCK_STREAM) return -ENOPROTOOPT; if (optval.is_kernel) { msg.msg_control_is_user = false; msg.msg_control = optval.kernel; } else { msg.msg_control_is_user = true; msg.msg_control_user = optval.user; } msg.msg_controllen = len; msg.msg_flags = 0; sockopt_lock_sock(sk); skb = np->pktoptions; if (skb) ip6_datagram_recv_ctl(sk, &msg, skb); sockopt_release_sock(sk); if (!skb) { if (np->rxopt.bits.rxinfo) { struct in6_pktinfo src_info; src_info.ipi6_ifindex = np->mcast_oif ? np->mcast_oif : np->sticky_pktinfo.ipi6_ifindex; src_info.ipi6_addr = np->mcast_oif ? sk->sk_v6_daddr : np->sticky_pktinfo.ipi6_addr; put_cmsg(&msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxhlim) { int hlim = np->mcast_hops; put_cmsg(&msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim); } if (np->rxopt.bits.rxtclass) { int tclass = (int)ip6_tclass(np->rcv_flowinfo); put_cmsg(&msg, SOL_IPV6, IPV6_TCLASS, sizeof(tclass), &tclass); } if (np->rxopt.bits.rxoinfo) { struct in6_pktinfo src_info; src_info.ipi6_ifindex = np->mcast_oif ? np->mcast_oif : np->sticky_pktinfo.ipi6_ifindex; src_info.ipi6_addr = np->mcast_oif ? sk->sk_v6_daddr : np->sticky_pktinfo.ipi6_addr; put_cmsg(&msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxohlim) { int hlim = np->mcast_hops; put_cmsg(&msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim); } if (np->rxopt.bits.rxflow) { __be32 flowinfo = np->rcv_flowinfo; put_cmsg(&msg, SOL_IPV6, IPV6_FLOWINFO, sizeof(flowinfo), &flowinfo); } } len -= msg.msg_controllen; return copy_to_sockptr(optlen, &len, sizeof(int)); } case IPV6_MTU: { struct dst_entry *dst; val = 0; rcu_read_lock(); dst = __sk_dst_get(sk); if (dst) val = dst_mtu(dst); rcu_read_unlock(); if (!val) return -ENOTCONN; break; } case IPV6_V6ONLY: val = sk->sk_ipv6only; break; case IPV6_RECVPKTINFO: val = np->rxopt.bits.rxinfo; break; case IPV6_2292PKTINFO: val = np->rxopt.bits.rxoinfo; break; case IPV6_RECVHOPLIMIT: val = np->rxopt.bits.rxhlim; break; case IPV6_2292HOPLIMIT: val = np->rxopt.bits.rxohlim; break; case IPV6_RECVRTHDR: val = np->rxopt.bits.srcrt; break; case IPV6_2292RTHDR: val = np->rxopt.bits.osrcrt; break; case IPV6_HOPOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: { struct ipv6_txoptions *opt; sockopt_lock_sock(sk); opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); len = ipv6_getsockopt_sticky(sk, opt, optname, optval, len); sockopt_release_sock(sk); /* check if ipv6_getsockopt_sticky() returns err code */ if (len < 0) return len; return copy_to_sockptr(optlen, &len, sizeof(int)); } case IPV6_RECVHOPOPTS: val = np->rxopt.bits.hopopts; break; case IPV6_2292HOPOPTS: val = np->rxopt.bits.ohopopts; break; case IPV6_RECVDSTOPTS: val = np->rxopt.bits.dstopts; break; case IPV6_2292DSTOPTS: val = np->rxopt.bits.odstopts; break; case IPV6_TCLASS: val = np->tclass; break; case IPV6_RECVTCLASS: val = np->rxopt.bits.rxtclass; break; case IPV6_FLOWINFO: val = np->rxopt.bits.rxflow; break; case IPV6_RECVPATHMTU: val = np->rxopt.bits.rxpmtu; break; case IPV6_PATHMTU: { struct dst_entry *dst; struct ip6_mtuinfo mtuinfo; if (len < sizeof(mtuinfo)) return -EINVAL; len = sizeof(mtuinfo); memset(&mtuinfo, 0, sizeof(mtuinfo)); rcu_read_lock(); dst = __sk_dst_get(sk); if (dst) mtuinfo.ip6m_mtu = dst_mtu(dst); rcu_read_unlock(); if (!mtuinfo.ip6m_mtu) return -ENOTCONN; if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &mtuinfo, len)) return -EFAULT; return 0; } case IPV6_TRANSPARENT: val = inet_test_bit(TRANSPARENT, sk); break; case IPV6_FREEBIND: val = inet_test_bit(FREEBIND, sk); break; case IPV6_RECVORIGDSTADDR: val = np->rxopt.bits.rxorigdstaddr; break; case IPV6_UNICAST_HOPS: case IPV6_MULTICAST_HOPS: { struct dst_entry *dst; if (optname == IPV6_UNICAST_HOPS) val = np->hop_limit; else val = np->mcast_hops; if (val < 0) { rcu_read_lock(); dst = __sk_dst_get(sk); if (dst) val = ip6_dst_hoplimit(dst); rcu_read_unlock(); } if (val < 0) val = sock_net(sk)->ipv6.devconf_all->hop_limit; break; } case IPV6_MULTICAST_LOOP: val = np->mc_loop; break; case IPV6_MULTICAST_IF: val = np->mcast_oif; break; case IPV6_MULTICAST_ALL: val = np->mc_all; break; case IPV6_UNICAST_IF: val = (__force int)htonl((__u32) np->ucast_oif); break; case IPV6_MTU_DISCOVER: val = np->pmtudisc; break; case IPV6_RECVERR: val = np->recverr; break; case IPV6_FLOWINFO_SEND: val = np->sndflow; break; case IPV6_FLOWLABEL_MGR: { struct in6_flowlabel_req freq; int flags; if (len < sizeof(freq)) return -EINVAL; if (copy_from_sockptr(&freq, optval, sizeof(freq))) return -EFAULT; if (freq.flr_action != IPV6_FL_A_GET) return -EINVAL; len = sizeof(freq); flags = freq.flr_flags; memset(&freq, 0, sizeof(freq)); val = ipv6_flowlabel_opt_get(sk, &freq, flags); if (val < 0) return val; if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &freq, len)) return -EFAULT; return 0; } case IPV6_ADDR_PREFERENCES: val = 0; if (np->srcprefs & IPV6_PREFER_SRC_TMP) val |= IPV6_PREFER_SRC_TMP; else if (np->srcprefs & IPV6_PREFER_SRC_PUBLIC) val |= IPV6_PREFER_SRC_PUBLIC; else { /* XXX: should we return system default? */ val |= IPV6_PREFER_SRC_PUBTMP_DEFAULT; } if (np->srcprefs & IPV6_PREFER_SRC_COA) val |= IPV6_PREFER_SRC_COA; else val |= IPV6_PREFER_SRC_HOME; break; case IPV6_MINHOPCOUNT: val = np->min_hopcount; break; case IPV6_DONTFRAG: val = np->dontfrag; break; case IPV6_AUTOFLOWLABEL: val = ip6_autoflowlabel(sock_net(sk), np); break; case IPV6_RECVFRAGSIZE: val = np->rxopt.bits.recvfragsize; break; case IPV6_ROUTER_ALERT_ISOLATE: val = np->rtalert_isolate; break; case IPV6_RECVERR_RFC4884: val = np->recverr_rfc4884; break; default: return -ENOPROTOOPT; } len = min_t(unsigned int, sizeof(int), len); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &val, len)) return -EFAULT; return 0; } int ipv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { int err; if (level == SOL_IP && sk->sk_type != SOCK_RAW) return udp_prot.getsockopt(sk, level, optname, optval, optlen); if (level != SOL_IPV6) return -ENOPROTOOPT; err = do_ipv6_getsockopt(sk, level, optname, USER_SOCKPTR(optval), USER_SOCKPTR(optlen)); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IPV6_2292PKTOPTIONS) { int len; if (get_user(len, optlen)) return -EFAULT; err = nf_getsockopt(sk, PF_INET6, optname, optval, &len); if (err >= 0) err = put_user(len, optlen); } #endif return err; } EXPORT_SYMBOL(ipv6_getsockopt);
2216 2216 2219 2223 2223 2223 2223 2223 2223 2223 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/dir.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/dir.c * * Copyright (C) 1991, 1992 Linus Torvalds * * ext4 directory handling functions * * Big-endian to little-endian byte-swapping/bitmaps by * David S. Miller (davem@caip.rutgers.edu), 1995 * * Hash Tree Directory indexing (c) 2001 Daniel Phillips * */ #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/slab.h> #include <linux/iversion.h> #include <linux/unicode.h> #include "ext4.h" #include "xattr.h" static int ext4_dx_readdir(struct file *, struct dir_context *); /** * is_dx_dir() - check if a directory is using htree indexing * @inode: directory inode * * Check if the given dir-inode refers to an htree-indexed directory * (or a directory which could potentially get converted to use htree * indexing). * * Return 1 if it is a dx dir, 0 if not */ static int is_dx_dir(struct inode *inode) { struct super_block *sb = inode->i_sb; if (ext4_has_feature_dir_index(inode->i_sb) && ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) || ((inode->i_size >> sb->s_blocksize_bits) == 1) || ext4_has_inline_data(inode))) return 1; return 0; } static bool is_fake_dir_entry(struct ext4_dir_entry_2 *de) { /* Check if . or .. , or skip if namelen is 0 */ if ((de->name_len > 0) && (de->name_len <= 2) && (de->name[0] == '.') && (de->name[1] == '.' || de->name[1] == '\0')) return true; /* Check if this is a csum entry */ if (de->file_type == EXT4_FT_DIR_CSUM) return true; return false; } /* * Return 0 if the directory entry is OK, and 1 if there is a problem * * Note: this is the opposite of what ext2 and ext3 historically returned... * * bh passed here can be an inode block or a dir data block, depending * on the inode inline data flag. */ int __ext4_check_dir_entry(const char *function, unsigned int line, struct inode *dir, struct file *filp, struct ext4_dir_entry_2 *de, struct buffer_head *bh, char *buf, int size, unsigned int offset) { const char *error_msg = NULL; const int rlen = ext4_rec_len_from_disk(de->rec_len, dir->i_sb->s_blocksize); const int next_offset = ((char *) de - buf) + rlen; bool fake = is_fake_dir_entry(de); bool has_csum = ext4_has_metadata_csum(dir->i_sb); if (unlikely(rlen < ext4_dir_rec_len(1, fake ? NULL : dir))) error_msg = "rec_len is smaller than minimal"; else if (unlikely(rlen % 4 != 0)) error_msg = "rec_len % 4 != 0"; else if (unlikely(rlen < ext4_dir_rec_len(de->name_len, fake ? NULL : dir))) error_msg = "rec_len is too small for name_len"; else if (unlikely(next_offset > size)) error_msg = "directory entry overrun"; else if (unlikely(next_offset > size - ext4_dir_rec_len(1, has_csum ? NULL : dir) && next_offset != size)) error_msg = "directory entry too close to block end"; else if (unlikely(le32_to_cpu(de->inode) > le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) error_msg = "inode out of bounds"; else return 0; if (filp) ext4_error_file(filp, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u, " "inode=%u, rec_len=%d, size=%d fake=%d", error_msg, offset, le32_to_cpu(de->inode), rlen, size, fake); else ext4_error_inode(dir, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u, " "inode=%u, rec_len=%d, size=%d fake=%d", error_msg, offset, le32_to_cpu(de->inode), rlen, size, fake); return 1; } static int ext4_readdir(struct file *file, struct dir_context *ctx) { unsigned int offset; int i; struct ext4_dir_entry_2 *de; int err; struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct buffer_head *bh = NULL; struct fscrypt_str fstr = FSTR_INIT(NULL, 0); err = fscrypt_prepare_readdir(inode); if (err) return err; if (is_dx_dir(inode)) { err = ext4_dx_readdir(file, ctx); if (err != ERR_BAD_DX_DIR) return err; /* Can we just clear INDEX flag to ignore htree information? */ if (!ext4_has_metadata_csum(sb)) { /* * We don't set the inode dirty flag since it's not * critical that it gets flushed back to the disk. */ ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); } } if (ext4_has_inline_data(inode)) { int has_inline_data = 1; err = ext4_read_inline_dir(file, ctx, &has_inline_data); if (has_inline_data) return err; } if (IS_ENCRYPTED(inode)) { err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN, &fstr); if (err < 0) return err; } while (ctx->pos < inode->i_size) { struct ext4_map_blocks map; if (fatal_signal_pending(current)) { err = -ERESTARTSYS; goto errout; } cond_resched(); offset = ctx->pos & (sb->s_blocksize - 1); map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb); map.m_len = 1; err = ext4_map_blocks(NULL, inode, &map, 0); if (err == 0) { /* m_len should never be zero but let's avoid * an infinite loop if it somehow is */ if (map.m_len == 0) map.m_len = 1; ctx->pos += map.m_len * sb->s_blocksize; continue; } if (err > 0) { pgoff_t index = map.m_pblk >> (PAGE_SHIFT - inode->i_blkbits); if (!ra_has_index(&file->f_ra, index)) page_cache_sync_readahead( sb->s_bdev->bd_inode->i_mapping, &file->f_ra, file, index, 1); file->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT; bh = ext4_bread(NULL, inode, map.m_lblk, 0); if (IS_ERR(bh)) { err = PTR_ERR(bh); bh = NULL; goto errout; } } if (!bh) { /* corrupt size? Maybe no more blocks to read */ if (ctx->pos > inode->i_blocks << 9) break; ctx->pos += sb->s_blocksize - offset; continue; } /* Check the checksum */ if (!buffer_verified(bh) && !ext4_dirblock_csum_verify(inode, bh)) { EXT4_ERROR_FILE(file, 0, "directory fails checksum " "at offset %llu", (unsigned long long)ctx->pos); ctx->pos += sb->s_blocksize - offset; brelse(bh); bh = NULL; continue; } set_buffer_verified(bh); /* If the dir block has changed since the last call to * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the block * to make sure. */ if (!inode_eq_iversion(inode, file->f_version)) { for (i = 0; i < sb->s_blocksize && i < offset; ) { de = (struct ext4_dir_entry_2 *) (bh->b_data + i); /* It's too expensive to do a full * dirent test each time round this * loop, but we do have to test at * least that it is non-zero. A * failure will be detected in the * dirent test below. */ if (ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) < ext4_dir_rec_len(1, inode)) break; i += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); } offset = i; ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1)) | offset; file->f_version = inode_query_iversion(inode); } while (ctx->pos < inode->i_size && offset < sb->s_blocksize) { de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); if (ext4_check_dir_entry(inode, file, de, bh, bh->b_data, bh->b_size, offset)) { /* * On error, skip to the next block */ ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1; break; } offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); if (le32_to_cpu(de->inode)) { if (!IS_ENCRYPTED(inode)) { if (!dir_emit(ctx, de->name, de->name_len, le32_to_cpu(de->inode), get_dtype(sb, de->file_type))) goto done; } else { int save_len = fstr.len; struct fscrypt_str de_name = FSTR_INIT(de->name, de->name_len); /* Directory is encrypted */ err = fscrypt_fname_disk_to_usr(inode, EXT4_DIRENT_HASH(de), EXT4_DIRENT_MINOR_HASH(de), &de_name, &fstr); de_name = fstr; fstr.len = save_len; if (err) goto errout; if (!dir_emit(ctx, de_name.name, de_name.len, le32_to_cpu(de->inode), get_dtype(sb, de->file_type))) goto done; } } ctx->pos += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); } if ((ctx->pos < inode->i_size) && !dir_relax_shared(inode)) goto done; brelse(bh); bh = NULL; } done: err = 0; errout: fscrypt_fname_free_buffer(&fstr); brelse(bh); return err; } static inline int is_32bit_api(void) { #ifdef CONFIG_COMPAT return in_compat_syscall(); #else return (BITS_PER_LONG == 32); #endif } /* * These functions convert from the major/minor hash to an f_pos * value for dx directories * * Upper layer (for example NFS) should specify FMODE_32BITHASH or * FMODE_64BITHASH explicitly. On the other hand, we allow ext4 to be mounted * directly on both 32-bit and 64-bit nodes, under such case, neither * FMODE_32BITHASH nor FMODE_64BITHASH is specified. */ static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor) { if ((filp->f_mode & FMODE_32BITHASH) || (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) return major >> 1; else return ((__u64)(major >> 1) << 32) | (__u64)minor; } static inline __u32 pos2maj_hash(struct file *filp, loff_t pos) { if ((filp->f_mode & FMODE_32BITHASH) || (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) return (pos << 1) & 0xffffffff; else return ((pos >> 32) << 1) & 0xffffffff; } static inline __u32 pos2min_hash(struct file *filp, loff_t pos) { if ((filp->f_mode & FMODE_32BITHASH) || (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) return 0; else return pos & 0xffffffff; } /* * Return 32- or 64-bit end-of-file for dx directories */ static inline loff_t ext4_get_htree_eof(struct file *filp) { if ((filp->f_mode & FMODE_32BITHASH) || (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) return EXT4_HTREE_EOF_32BIT; else return EXT4_HTREE_EOF_64BIT; } /* * ext4_dir_llseek() calls generic_file_llseek_size to handle htree * directories, where the "offset" is in terms of the filename hash * value instead of the byte offset. * * Because we may return a 64-bit hash that is well beyond offset limits, * we need to pass the max hash as the maximum allowable offset in * the htree directory case. * * For non-htree, ext4_llseek already chooses the proper max offset. */ static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; int dx_dir = is_dx_dir(inode); loff_t ret, htree_max = ext4_get_htree_eof(file); if (likely(dx_dir)) ret = generic_file_llseek_size(file, offset, whence, htree_max, htree_max); else ret = ext4_llseek(file, offset, whence); file->f_version = inode_peek_iversion(inode) - 1; return ret; } /* * This structure holds the nodes of the red-black tree used to store * the directory entry in hash order. */ struct fname { __u32 hash; __u32 minor_hash; struct rb_node rb_hash; struct fname *next; __u32 inode; __u8 name_len; __u8 file_type; char name[]; }; /* * This function implements a non-recursive way of freeing all of the * nodes in the red-black tree. */ static void free_rb_tree_fname(struct rb_root *root) { struct fname *fname, *next; rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash) while (fname) { struct fname *old = fname; fname = fname->next; kfree(old); } *root = RB_ROOT; } static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp, loff_t pos) { struct dir_private_info *p; p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) return NULL; p->curr_hash = pos2maj_hash(filp, pos); p->curr_minor_hash = pos2min_hash(filp, pos); return p; } void ext4_htree_free_dir_info(struct dir_private_info *p) { free_rb_tree_fname(&p->root); kfree(p); } /* * Given a directory entry, enter it into the fname rb tree. * * When filename encryption is enabled, the dirent will hold the * encrypted filename, while the htree will hold decrypted filename. * The decrypted filename is passed in via ent_name. parameter. */ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent, struct fscrypt_str *ent_name) { struct rb_node **p, *parent = NULL; struct fname *fname, *new_fn; struct dir_private_info *info; int len; info = dir_file->private_data; p = &info->root.rb_node; /* Create and allocate the fname structure */ len = sizeof(struct fname) + ent_name->len + 1; new_fn = kzalloc(len, GFP_KERNEL); if (!new_fn) return -ENOMEM; new_fn->hash = hash; new_fn->minor_hash = minor_hash; new_fn->inode = le32_to_cpu(dirent->inode); new_fn->name_len = ent_name->len; new_fn->file_type = dirent->file_type; memcpy(new_fn->name, ent_name->name, ent_name->len); while (*p) { parent = *p; fname = rb_entry(parent, struct fname, rb_hash); /* * If the hash and minor hash match up, then we put * them on a linked list. This rarely happens... */ if ((new_fn->hash == fname->hash) && (new_fn->minor_hash == fname->minor_hash)) { new_fn->next = fname->next; fname->next = new_fn; return 0; } if (new_fn->hash < fname->hash) p = &(*p)->rb_left; else if (new_fn->hash > fname->hash) p = &(*p)->rb_right; else if (new_fn->minor_hash < fname->minor_hash) p = &(*p)->rb_left; else /* if (new_fn->minor_hash > fname->minor_hash) */ p = &(*p)->rb_right; } rb_link_node(&new_fn->rb_hash, parent, p); rb_insert_color(&new_fn->rb_hash, &info->root); return 0; } /* * This is a helper function for ext4_dx_readdir. It calls filldir * for all entries on the fname linked list. (Normally there is only * one entry on the linked list, unless there are 62 bit hash collisions.) */ static int call_filldir(struct file *file, struct dir_context *ctx, struct fname *fname) { struct dir_private_info *info = file->private_data; struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; if (!fname) { ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: " "called with null fname?!?", __func__, __LINE__, inode->i_ino, current->comm); return 0; } ctx->pos = hash2pos(file, fname->hash, fname->minor_hash); while (fname) { if (!dir_emit(ctx, fname->name, fname->name_len, fname->inode, get_dtype(sb, fname->file_type))) { info->extra_fname = fname; return 1; } fname = fname->next; } return 0; } static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) { struct dir_private_info *info = file->private_data; struct inode *inode = file_inode(file); struct fname *fname; int ret = 0; if (!info) { info = ext4_htree_create_dir_info(file, ctx->pos); if (!info) return -ENOMEM; file->private_data = info; } if (ctx->pos == ext4_get_htree_eof(file)) return 0; /* EOF */ /* Some one has messed with f_pos; reset the world */ if (info->last_pos != ctx->pos) { free_rb_tree_fname(&info->root); info->curr_node = NULL; info->extra_fname = NULL; info->curr_hash = pos2maj_hash(file, ctx->pos); info->curr_minor_hash = pos2min_hash(file, ctx->pos); } /* * If there are any leftover names on the hash collision * chain, return them first. */ if (info->extra_fname) { if (call_filldir(file, ctx, info->extra_fname)) goto finished; info->extra_fname = NULL; goto next_node; } else if (!info->curr_node) info->curr_node = rb_first(&info->root); while (1) { /* * Fill the rbtree if we have no more entries, * or the inode has changed since we last read in the * cached entries. */ if ((!info->curr_node) || !inode_eq_iversion(inode, file->f_version)) { info->curr_node = NULL; free_rb_tree_fname(&info->root); file->f_version = inode_query_iversion(inode); ret = ext4_htree_fill_tree(file, info->curr_hash, info->curr_minor_hash, &info->next_hash); if (ret < 0) goto finished; if (ret == 0) { ctx->pos = ext4_get_htree_eof(file); break; } info->curr_node = rb_first(&info->root); } fname = rb_entry(info->curr_node, struct fname, rb_hash); info->curr_hash = fname->hash; info->curr_minor_hash = fname->minor_hash; if (call_filldir(file, ctx, fname)) break; next_node: info->curr_node = rb_next(info->curr_node); if (info->curr_node) { fname = rb_entry(info->curr_node, struct fname, rb_hash); info->curr_hash = fname->hash; info->curr_minor_hash = fname->minor_hash; } else { if (info->next_hash == ~0) { ctx->pos = ext4_get_htree_eof(file); break; } info->curr_hash = info->next_hash; info->curr_minor_hash = 0; } } finished: info->last_pos = ctx->pos; return ret < 0 ? ret : 0; } static int ext4_release_dir(struct inode *inode, struct file *filp) { if (filp->private_data) ext4_htree_free_dir_info(filp->private_data); return 0; } int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf, int buf_size) { struct ext4_dir_entry_2 *de; int rlen; unsigned int offset = 0; char *top; de = buf; top = buf + buf_size; while ((char *) de < top) { if (ext4_check_dir_entry(dir, NULL, de, bh, buf, buf_size, offset)) return -EFSCORRUPTED; rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); de = (struct ext4_dir_entry_2 *)((char *)de + rlen); offset += rlen; } if ((char *) de > top) return -EFSCORRUPTED; return 0; } const struct file_operations ext4_dir_operations = { .llseek = ext4_dir_llseek, .read = generic_read_dir, .iterate_shared = ext4_readdir, .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, #endif .fsync = ext4_sync_file, .release = ext4_release_dir, };
1204 1202 1203 1203 1204 1203 2015 848 17 17 17 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 // SPDX-License-Identifier: GPL-2.0-only /* * net/core/dst.c Protocol independent destination cache. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * */ #include <linux/bitops.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/workqueue.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/string.h> #include <linux/types.h> #include <net/net_namespace.h> #include <linux/sched.h> #include <linux/prefetch.h> #include <net/lwtunnel.h> #include <net/xfrm.h> #include <net/dst.h> #include <net/dst_metadata.h> int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) { kfree_skb(skb); return 0; } EXPORT_SYMBOL(dst_discard_out); const struct dst_metrics dst_default_metrics = { /* This initializer is needed to force linker to place this variable * into const section. Otherwise it might end into bss section. * We really want to avoid false sharing on this variable, and catch * any writes on it. */ .refcnt = REFCOUNT_INIT(1), }; EXPORT_SYMBOL(dst_default_metrics); void dst_init(struct dst_entry *dst, struct dst_ops *ops, struct net_device *dev, int initial_ref, int initial_obsolete, unsigned short flags) { dst->dev = dev; netdev_hold(dev, &dst->dev_tracker, GFP_ATOMIC); dst->ops = ops; dst_init_metrics(dst, dst_default_metrics.metrics, true); dst->expires = 0UL; #ifdef CONFIG_XFRM dst->xfrm = NULL; #endif dst->input = dst_discard; dst->output = dst_discard_out; dst->error = 0; dst->obsolete = initial_obsolete; dst->header_len = 0; dst->trailer_len = 0; #ifdef CONFIG_IP_ROUTE_CLASSID dst->tclassid = 0; #endif dst->lwtstate = NULL; rcuref_init(&dst->__rcuref, initial_ref); INIT_LIST_HEAD(&dst->rt_uncached); dst->__use = 0; dst->lastuse = jiffies; dst->flags = flags; if (!(flags & DST_NOCOUNT)) dst_entries_add(ops, 1); } EXPORT_SYMBOL(dst_init); void *dst_alloc(struct dst_ops *ops, struct net_device *dev, int initial_ref, int initial_obsolete, unsigned short flags) { struct dst_entry *dst; if (ops->gc && !(flags & DST_NOCOUNT) && dst_entries_get_fast(ops) > ops->gc_thresh) ops->gc(ops); dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC); if (!dst) return NULL; dst_init(dst, ops, dev, initial_ref, initial_obsolete, flags); return dst; } EXPORT_SYMBOL(dst_alloc); struct dst_entry *dst_destroy(struct dst_entry * dst) { struct dst_entry *child = NULL; smp_rmb(); #ifdef CONFIG_XFRM if (dst->xfrm) { struct xfrm_dst *xdst = (struct xfrm_dst *) dst; child = xdst->child; } #endif if (!(dst->flags & DST_NOCOUNT)) dst_entries_add(dst->ops, -1); if (dst->ops->destroy) dst->ops->destroy(dst); netdev_put(dst->dev, &dst->dev_tracker); lwtstate_put(dst->lwtstate); if (dst->flags & DST_METADATA) metadata_dst_free((struct metadata_dst *)dst); else kmem_cache_free(dst->ops->kmem_cachep, dst); dst = child; if (dst) dst_release_immediate(dst); return NULL; } EXPORT_SYMBOL(dst_destroy); static void dst_destroy_rcu(struct rcu_head *head) { struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); dst = dst_destroy(dst); } /* Operations to mark dst as DEAD and clean up the net device referenced * by dst: * 1. put the dst under blackhole interface and discard all tx/rx packets * on this route. * 2. release the net_device * This function should be called when removing routes from the fib tree * in preparation for a NETDEV_DOWN/NETDEV_UNREGISTER event and also to * make the next dst_ops->check() fail. */ void dst_dev_put(struct dst_entry *dst) { struct net_device *dev = dst->dev; dst->obsolete = DST_OBSOLETE_DEAD; if (dst->ops->ifdown) dst->ops->ifdown(dst, dev); dst->input = dst_discard; dst->output = dst_discard_out; dst->dev = blackhole_netdev; netdev_ref_replace(dev, blackhole_netdev, &dst->dev_tracker, GFP_ATOMIC); } EXPORT_SYMBOL(dst_dev_put); void dst_release(struct dst_entry *dst) { if (dst && rcuref_put(&dst->__rcuref)) call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu); } EXPORT_SYMBOL(dst_release); void dst_release_immediate(struct dst_entry *dst) { if (dst && rcuref_put(&dst->__rcuref)) dst_destroy(dst); } EXPORT_SYMBOL(dst_release_immediate); u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old) { struct dst_metrics *p = kmalloc(sizeof(*p), GFP_ATOMIC); if (p) { struct dst_metrics *old_p = (struct dst_metrics *)__DST_METRICS_PTR(old); unsigned long prev, new; refcount_set(&p->refcnt, 1); memcpy(p->metrics, old_p->metrics, sizeof(p->metrics)); new = (unsigned long) p; prev = cmpxchg(&dst->_metrics, old, new); if (prev != old) { kfree(p); p = (struct dst_metrics *)__DST_METRICS_PTR(prev); if (prev & DST_METRICS_READ_ONLY) p = NULL; } else if (prev & DST_METRICS_REFCOUNTED) { if (refcount_dec_and_test(&old_p->refcnt)) kfree(old_p); } } BUILD_BUG_ON(offsetof(struct dst_metrics, metrics) != 0); return (u32 *)p; } EXPORT_SYMBOL(dst_cow_metrics_generic); /* Caller asserts that dst_metrics_read_only(dst) is false. */ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old) { unsigned long prev, new; new = ((unsigned long) &dst_default_metrics) | DST_METRICS_READ_ONLY; prev = cmpxchg(&dst->_metrics, old, new); if (prev == old) kfree(__DST_METRICS_PTR(old)); } EXPORT_SYMBOL(__dst_destroy_metrics_generic); struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie) { return NULL; } u32 *dst_blackhole_cow_metrics(struct dst_entry *dst, unsigned long old) { return NULL; } struct neighbour *dst_blackhole_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { return NULL; } void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh) { } EXPORT_SYMBOL_GPL(dst_blackhole_update_pmtu); void dst_blackhole_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) { } EXPORT_SYMBOL_GPL(dst_blackhole_redirect); unsigned int dst_blackhole_mtu(const struct dst_entry *dst) { unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); return mtu ? : dst->dev->mtu; } EXPORT_SYMBOL_GPL(dst_blackhole_mtu); static struct dst_ops dst_blackhole_ops = { .family = AF_UNSPEC, .neigh_lookup = dst_blackhole_neigh_lookup, .check = dst_blackhole_check, .cow_metrics = dst_blackhole_cow_metrics, .update_pmtu = dst_blackhole_update_pmtu, .redirect = dst_blackhole_redirect, .mtu = dst_blackhole_mtu, }; static void __metadata_dst_init(struct metadata_dst *md_dst, enum metadata_type type, u8 optslen) { struct dst_entry *dst; dst = &md_dst->dst; dst_init(dst, &dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, DST_METADATA | DST_NOCOUNT); memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst)); md_dst->type = type; } struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type, gfp_t flags) { struct metadata_dst *md_dst; md_dst = kmalloc(sizeof(*md_dst) + optslen, flags); if (!md_dst) return NULL; __metadata_dst_init(md_dst, type, optslen); return md_dst; } EXPORT_SYMBOL_GPL(metadata_dst_alloc); void metadata_dst_free(struct metadata_dst *md_dst) { #ifdef CONFIG_DST_CACHE if (md_dst->type == METADATA_IP_TUNNEL) dst_cache_destroy(&md_dst->u.tun_info.dst_cache); #endif if (md_dst->type == METADATA_XFRM) dst_release(md_dst->u.xfrm_info.dst_orig); kfree(md_dst); } EXPORT_SYMBOL_GPL(metadata_dst_free); struct metadata_dst __percpu * metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags) { int cpu; struct metadata_dst __percpu *md_dst; md_dst = __alloc_percpu_gfp(sizeof(struct metadata_dst) + optslen, __alignof__(struct metadata_dst), flags); if (!md_dst) return NULL; for_each_possible_cpu(cpu) __metadata_dst_init(per_cpu_ptr(md_dst, cpu), type, optslen); return md_dst; } EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu); void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst) { int cpu; for_each_possible_cpu(cpu) { struct metadata_dst *one_md_dst = per_cpu_ptr(md_dst, cpu); #ifdef CONFIG_DST_CACHE if (one_md_dst->type == METADATA_IP_TUNNEL) dst_cache_destroy(&one_md_dst->u.tun_info.dst_cache); #endif if (one_md_dst->type == METADATA_XFRM) dst_release(one_md_dst->u.xfrm_info.dst_orig); } free_percpu(md_dst); } EXPORT_SYMBOL_GPL(metadata_dst_free_percpu);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_DMA_MAPPING_H #define _LINUX_DMA_MAPPING_H #include <linux/cache.h> #include <linux/sizes.h> #include <linux/string.h> #include <linux/device.h> #include <linux/err.h> #include <linux/dma-direction.h> #include <linux/scatterlist.h> #include <linux/bug.h> #include <linux/mem_encrypt.h> /** * List of possible attributes associated with a DMA mapping. The semantics * of each attribute should be defined in Documentation/core-api/dma-attributes.rst. */ /* * DMA_ATTR_WEAK_ORDERING: Specifies that reads and writes to the mapping * may be weakly ordered, that is that reads and writes may pass each other. */ #define DMA_ATTR_WEAK_ORDERING (1UL << 1) /* * DMA_ATTR_WRITE_COMBINE: Specifies that writes to the mapping may be * buffered to improve performance. */ #define DMA_ATTR_WRITE_COMBINE (1UL << 2) /* * DMA_ATTR_NO_KERNEL_MAPPING: Lets the platform to avoid creating a kernel * virtual mapping for the allocated buffer. */ #define DMA_ATTR_NO_KERNEL_MAPPING (1UL << 4) /* * DMA_ATTR_SKIP_CPU_SYNC: Allows platform code to skip synchronization of * the CPU cache for the given buffer assuming that it has been already * transferred to 'device' domain. */ #define DMA_ATTR_SKIP_CPU_SYNC (1UL << 5) /* * DMA_ATTR_FORCE_CONTIGUOUS: Forces contiguous allocation of the buffer * in physical memory. */ #define DMA_ATTR_FORCE_CONTIGUOUS (1UL << 6) /* * DMA_ATTR_ALLOC_SINGLE_PAGES: This is a hint to the DMA-mapping subsystem * that it's probably not worth the time to try to allocate memory to in a way * that gives better TLB efficiency. */ #define DMA_ATTR_ALLOC_SINGLE_PAGES (1UL << 7) /* * DMA_ATTR_NO_WARN: This tells the DMA-mapping subsystem to suppress * allocation failure reports (similarly to __GFP_NOWARN). */ #define DMA_ATTR_NO_WARN (1UL << 8) /* * DMA_ATTR_PRIVILEGED: used to indicate that the buffer is fully * accessible at an elevated privilege level (and ideally inaccessible or * at least read-only at lesser-privileged levels). */ #define DMA_ATTR_PRIVILEGED (1UL << 9) /* * A dma_addr_t can hold any valid DMA or bus address for the platform. It can * be given to a device to use as a DMA source or target. It is specific to a * given device and there may be a translation between the CPU physical address * space and the bus address space. * * DMA_MAPPING_ERROR is the magic error code if a mapping failed. It should not * be used directly in drivers, but checked for using dma_mapping_error() * instead. */ #define DMA_MAPPING_ERROR (~(dma_addr_t)0) #define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL<<(n))-1)) #ifdef CONFIG_DMA_API_DEBUG void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr); void debug_dma_map_single(struct device *dev, const void *addr, unsigned long len); #else static inline void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { } static inline void debug_dma_map_single(struct device *dev, const void *addr, unsigned long len) { } #endif /* CONFIG_DMA_API_DEBUG */ #ifdef CONFIG_HAS_DMA static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { debug_dma_mapping_error(dev, dma_addr); if (unlikely(dma_addr == DMA_MAPPING_ERROR)) return -ENOMEM; return 0; } dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, size_t offset, size_t size, enum dma_data_direction dir, unsigned long attrs); void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs); unsigned int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs); void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs); int dma_map_sgtable(struct device *dev, struct sg_table *sgt, enum dma_data_direction dir, unsigned long attrs); dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size, enum dma_data_direction dir, unsigned long attrs); void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs); void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir); void dma_sync_single_for_device(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir); void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction dir); void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction dir); void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs); void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_handle, unsigned long attrs); void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); void dmam_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle); int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs); int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs); bool dma_can_mmap(struct device *dev); bool dma_pci_p2pdma_supported(struct device *dev); int dma_set_mask(struct device *dev, u64 mask); int dma_set_coherent_mask(struct device *dev, u64 mask); u64 dma_get_required_mask(struct device *dev); size_t dma_max_mapping_size(struct device *dev); size_t dma_opt_mapping_size(struct device *dev); bool dma_need_sync(struct device *dev, dma_addr_t dma_addr); unsigned long dma_get_merge_boundary(struct device *dev); struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size, enum dma_data_direction dir, gfp_t gfp, unsigned long attrs); void dma_free_noncontiguous(struct device *dev, size_t size, struct sg_table *sgt, enum dma_data_direction dir); void *dma_vmap_noncontiguous(struct device *dev, size_t size, struct sg_table *sgt); void dma_vunmap_noncontiguous(struct device *dev, void *vaddr); int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma, size_t size, struct sg_table *sgt); #else /* CONFIG_HAS_DMA */ static inline dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, size_t offset, size_t size, enum dma_data_direction dir, unsigned long attrs) { return DMA_MAPPING_ERROR; } static inline void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { } static inline unsigned int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs) { return 0; } static inline void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs) { } static inline int dma_map_sgtable(struct device *dev, struct sg_table *sgt, enum dma_data_direction dir, unsigned long attrs) { return -EOPNOTSUPP; } static inline dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { return DMA_MAPPING_ERROR; } static inline void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { } static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) { } static inline void dma_sync_single_for_device(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) { } static inline void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction dir) { } static inline void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction dir) { } static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { return -ENOMEM; } static inline void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs) { return NULL; } static void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_handle, unsigned long attrs) { } static inline void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { return NULL; } static inline void dmam_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle) { } static inline int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) { return -ENXIO; } static inline int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) { return -ENXIO; } static inline bool dma_can_mmap(struct device *dev) { return false; } static inline bool dma_pci_p2pdma_supported(struct device *dev) { return false; } static inline int dma_set_mask(struct device *dev, u64 mask) { return -EIO; } static inline int dma_set_coherent_mask(struct device *dev, u64 mask) { return -EIO; } static inline u64 dma_get_required_mask(struct device *dev) { return 0; } static inline size_t dma_max_mapping_size(struct device *dev) { return 0; } static inline size_t dma_opt_mapping_size(struct device *dev) { return 0; } static inline bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) { return false; } static inline unsigned long dma_get_merge_boundary(struct device *dev) { return 0; } static inline struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size, enum dma_data_direction dir, gfp_t gfp, unsigned long attrs) { return NULL; } static inline void dma_free_noncontiguous(struct device *dev, size_t size, struct sg_table *sgt, enum dma_data_direction dir) { } static inline void *dma_vmap_noncontiguous(struct device *dev, size_t size, struct sg_table *sgt) { return NULL; } static inline void dma_vunmap_noncontiguous(struct device *dev, void *vaddr) { } static inline int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma, size_t size, struct sg_table *sgt) { return -EINVAL; } #endif /* CONFIG_HAS_DMA */ struct page *dma_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp); void dma_free_pages(struct device *dev, size_t size, struct page *page, dma_addr_t dma_handle, enum dma_data_direction dir); int dma_mmap_pages(struct device *dev, struct vm_area_struct *vma, size_t size, struct page *page); static inline void *dma_alloc_noncoherent(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) { struct page *page = dma_alloc_pages(dev, size, dma_handle, dir, gfp); return page ? page_address(page) : NULL; } static inline void dma_free_noncoherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, enum dma_data_direction dir) { dma_free_pages(dev, size, virt_to_page(vaddr), dma_handle, dir); } static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr, size_t size, enum dma_data_direction dir, unsigned long attrs) { /* DMA must never operate on areas that might be remapped. */ if (dev_WARN_ONCE(dev, is_vmalloc_addr(ptr), "rejecting DMA map of vmalloc memory\n")) return DMA_MAPPING_ERROR; debug_dma_map_single(dev, ptr, size); return dma_map_page_attrs(dev, virt_to_page(ptr), offset_in_page(ptr), size, dir, attrs); } static inline void dma_unmap_single_attrs(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { return dma_unmap_page_attrs(dev, addr, size, dir, attrs); } static inline void dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t addr, unsigned long offset, size_t size, enum dma_data_direction dir) { return dma_sync_single_for_cpu(dev, addr + offset, size, dir); } static inline void dma_sync_single_range_for_device(struct device *dev, dma_addr_t addr, unsigned long offset, size_t size, enum dma_data_direction dir) { return dma_sync_single_for_device(dev, addr + offset, size, dir); } /** * dma_unmap_sgtable - Unmap the given buffer for DMA * @dev: The device for which to perform the DMA operation * @sgt: The sg_table object describing the buffer * @dir: DMA direction * @attrs: Optional DMA attributes for the unmap operation * * Unmaps a buffer described by a scatterlist stored in the given sg_table * object for the @dir DMA operation by the @dev device. After this function * the ownership of the buffer is transferred back to the CPU domain. */ static inline void dma_unmap_sgtable(struct device *dev, struct sg_table *sgt, enum dma_data_direction dir, unsigned long attrs) { dma_unmap_sg_attrs(dev, sgt->sgl, sgt->orig_nents, dir, attrs); } /** * dma_sync_sgtable_for_cpu - Synchronize the given buffer for CPU access * @dev: The device for which to perform the DMA operation * @sgt: The sg_table object describing the buffer * @dir: DMA direction * * Performs the needed cache synchronization and moves the ownership of the * buffer back to the CPU domain, so it is safe to perform any access to it * by the CPU. Before doing any further DMA operations, one has to transfer * the ownership of the buffer back to the DMA domain by calling the * dma_sync_sgtable_for_device(). */ static inline void dma_sync_sgtable_for_cpu(struct device *dev, struct sg_table *sgt, enum dma_data_direction dir) { dma_sync_sg_for_cpu(dev, sgt->sgl, sgt->orig_nents, dir); } /** * dma_sync_sgtable_for_device - Synchronize the given buffer for DMA * @dev: The device for which to perform the DMA operation * @sgt: The sg_table object describing the buffer * @dir: DMA direction * * Performs the needed cache synchronization and moves the ownership of the * buffer back to the DMA domain, so it is safe to perform the DMA operation. * Once finished, one has to call dma_sync_sgtable_for_cpu() or * dma_unmap_sgtable(). */ static inline void dma_sync_sgtable_for_device(struct device *dev, struct sg_table *sgt, enum dma_data_direction dir) { dma_sync_sg_for_device(dev, sgt->sgl, sgt->orig_nents, dir); } #define dma_map_single(d, a, s, r) dma_map_single_attrs(d, a, s, r, 0) #define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, 0) #define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, 0) #define dma_unmap_sg(d, s, n, r) dma_unmap_sg_attrs(d, s, n, r, 0) #define dma_map_page(d, p, o, s, r) dma_map_page_attrs(d, p, o, s, r, 0) #define dma_unmap_page(d, a, s, r) dma_unmap_page_attrs(d, a, s, r, 0) #define dma_get_sgtable(d, t, v, h, s) dma_get_sgtable_attrs(d, t, v, h, s, 0) #define dma_mmap_coherent(d, v, c, h, s) dma_mmap_attrs(d, v, c, h, s, 0) bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size); static inline void *dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp) { return dma_alloc_attrs(dev, size, dma_handle, gfp, (gfp & __GFP_NOWARN) ? DMA_ATTR_NO_WARN : 0); } static inline void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_handle) { return dma_free_attrs(dev, size, cpu_addr, dma_handle, 0); } static inline u64 dma_get_mask(struct device *dev) { if (dev->dma_mask && *dev->dma_mask) return *dev->dma_mask; return DMA_BIT_MASK(32); } /* * Set both the DMA mask and the coherent DMA mask to the same thing. * Note that we don't check the return value from dma_set_coherent_mask() * as the DMA API guarantees that the coherent DMA mask can be set to * the same or smaller than the streaming DMA mask. */ static inline int dma_set_mask_and_coherent(struct device *dev, u64 mask) { int rc = dma_set_mask(dev, mask); if (rc == 0) dma_set_coherent_mask(dev, mask); return rc; } /* * Similar to the above, except it deals with the case where the device * does not have dev->dma_mask appropriately setup. */ static inline int dma_coerce_mask_and_coherent(struct device *dev, u64 mask) { dev->dma_mask = &dev->coherent_dma_mask; return dma_set_mask_and_coherent(dev, mask); } /** * dma_addressing_limited - return if the device is addressing limited * @dev: device to check * * Return %true if the devices DMA mask is too small to address all memory in * the system, else %false. Lack of addressing bits is the prime reason for * bounce buffering, but might not be the only one. */ static inline bool dma_addressing_limited(struct device *dev) { return min_not_zero(dma_get_mask(dev), dev->bus_dma_limit) < dma_get_required_mask(dev); } static inline unsigned int dma_get_max_seg_size(struct device *dev) { if (dev->dma_parms && dev->dma_parms->max_segment_size) return dev->dma_parms->max_segment_size; return SZ_64K; } static inline int dma_set_max_seg_size(struct device *dev, unsigned int size) { if (dev->dma_parms) { dev->dma_parms->max_segment_size = size; return 0; } return -EIO; } static inline unsigned long dma_get_seg_boundary(struct device *dev) { if (dev->dma_parms && dev->dma_parms->segment_boundary_mask) return dev->dma_parms->segment_boundary_mask; return ULONG_MAX; } /** * dma_get_seg_boundary_nr_pages - return the segment boundary in "page" units * @dev: device to guery the boundary for * @page_shift: ilog() of the IOMMU page size * * Return the segment boundary in IOMMU page units (which may be different from * the CPU page size) for the passed in device. * * If @dev is NULL a boundary of U32_MAX is assumed, this case is just for * non-DMA API callers. */ static inline unsigned long dma_get_seg_boundary_nr_pages(struct device *dev, unsigned int page_shift) { if (!dev) return (U32_MAX >> page_shift) + 1; return (dma_get_seg_boundary(dev) >> page_shift) + 1; } static inline int dma_set_seg_boundary(struct device *dev, unsigned long mask) { if (dev->dma_parms) { dev->dma_parms->segment_boundary_mask = mask; return 0; } return -EIO; } static inline unsigned int dma_get_min_align_mask(struct device *dev) { if (dev->dma_parms) return dev->dma_parms->min_align_mask; return 0; } static inline int dma_set_min_align_mask(struct device *dev, unsigned int min_align_mask) { if (WARN_ON_ONCE(!dev->dma_parms)) return -EIO; dev->dma_parms->min_align_mask = min_align_mask; return 0; } #ifndef dma_get_cache_alignment static inline int dma_get_cache_alignment(void) { #ifdef ARCH_HAS_DMA_MINALIGN return ARCH_DMA_MINALIGN; #endif return 1; } #endif static inline void *dmam_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp) { return dmam_alloc_attrs(dev, size, dma_handle, gfp, (gfp & __GFP_NOWARN) ? DMA_ATTR_NO_WARN : 0); } static inline void *dma_alloc_wc(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t gfp) { unsigned long attrs = DMA_ATTR_WRITE_COMBINE; if (gfp & __GFP_NOWARN) attrs |= DMA_ATTR_NO_WARN; return dma_alloc_attrs(dev, size, dma_addr, gfp, attrs); } static inline void dma_free_wc(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr) { return dma_free_attrs(dev, size, cpu_addr, dma_addr, DMA_ATTR_WRITE_COMBINE); } static inline int dma_mmap_wc(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size) { return dma_mmap_attrs(dev, vma, cpu_addr, dma_addr, size, DMA_ATTR_WRITE_COMBINE); } #ifdef CONFIG_NEED_DMA_MAP_STATE #define DEFINE_DMA_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME #define DEFINE_DMA_UNMAP_LEN(LEN_NAME) __u32 LEN_NAME #define dma_unmap_addr(PTR, ADDR_NAME) ((PTR)->ADDR_NAME) #define dma_unmap_addr_set(PTR, ADDR_NAME, VAL) (((PTR)->ADDR_NAME) = (VAL)) #define dma_unmap_len(PTR, LEN_NAME) ((PTR)->LEN_NAME) #define dma_unmap_len_set(PTR, LEN_NAME, VAL) (((PTR)->LEN_NAME) = (VAL)) #else #define DEFINE_DMA_UNMAP_ADDR(ADDR_NAME) #define DEFINE_DMA_UNMAP_LEN(LEN_NAME) #define dma_unmap_addr(PTR, ADDR_NAME) (0) #define dma_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0) #define dma_unmap_len(PTR, LEN_NAME) (0) #define dma_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0) #endif #endif /* _LINUX_DMA_MAPPING_H */
9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Global definitions for the Ethernet IEEE 802.3 interface. * * Version: @(#)if_ether.h 1.0.1a 02/08/94 * * Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Donald Becker, <becker@super.org> * Alan Cox, <alan@lxorguk.ukuu.org.uk> * Steve Whitehouse, <gw7rrm@eeshack3.swan.ac.uk> */ #ifndef _LINUX_IF_ETHER_H #define _LINUX_IF_ETHER_H #include <linux/skbuff.h> #include <uapi/linux/if_ether.h> static inline struct ethhdr *eth_hdr(const struct sk_buff *skb) { return (struct ethhdr *)skb_mac_header(skb); } /* Prefer this version in TX path, instead of * skb_reset_mac_header() + eth_hdr() */ static inline struct ethhdr *skb_eth_hdr(const struct sk_buff *skb) { return (struct ethhdr *)skb->data; } static inline struct ethhdr *inner_eth_hdr(const struct sk_buff *skb) { return (struct ethhdr *)skb_inner_mac_header(skb); } int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr); extern ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len); #endif /* _LINUX_IF_ETHER_H */
4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM tlb #if !defined(_TRACE_TLB_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_TLB_H #include <linux/mm_types.h> #include <linux/tracepoint.h> #define TLB_FLUSH_REASON \ EM( TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" ) \ EM( TLB_REMOTE_SHOOTDOWN, "remote shootdown" ) \ EM( TLB_LOCAL_SHOOTDOWN, "local shootdown" ) \ EM( TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" ) \ EMe( TLB_REMOTE_SEND_IPI, "remote ipi send" ) /* * First define the enums in TLB_FLUSH_REASON to be exported to userspace * via TRACE_DEFINE_ENUM(). */ #undef EM #undef EMe #define EM(a,b) TRACE_DEFINE_ENUM(a); #define EMe(a,b) TRACE_DEFINE_ENUM(a); TLB_FLUSH_REASON /* * Now redefine the EM() and EMe() macros to map the enums to the strings * that will be printed in the output. */ #undef EM #undef EMe #define EM(a,b) { a, b }, #define EMe(a,b) { a, b } TRACE_EVENT(tlb_flush, TP_PROTO(int reason, unsigned long pages), TP_ARGS(reason, pages), TP_STRUCT__entry( __field( int, reason) __field(unsigned long, pages) ), TP_fast_assign( __entry->reason = reason; __entry->pages = pages; ), TP_printk("pages:%ld reason:%s (%d)", __entry->pages, __print_symbolic(__entry->reason, TLB_FLUSH_REASON), __entry->reason) ); #endif /* _TRACE_TLB_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_PKT_CLS_H #define __NET_PKT_CLS_H #include <linux/pkt_cls.h> #include <linux/workqueue.h> #include <net/sch_generic.h> #include <net/act_api.h> #include <net/net_namespace.h> /* TC action not accessible from user space */ #define TC_ACT_CONSUMED (TC_ACT_VALUE_MAX + 1) /* Basic packet classifier frontend definitions. */ struct tcf_walker { int stop; int skip; int count; bool nonempty; unsigned long cookie; int (*fn)(struct tcf_proto *, void *node, struct tcf_walker *); }; int register_tcf_proto_ops(struct tcf_proto_ops *ops); void unregister_tcf_proto_ops(struct tcf_proto_ops *ops); struct tcf_block_ext_info { enum flow_block_binder_type binder_type; tcf_chain_head_change_t *chain_head_change; void *chain_head_change_priv; u32 block_index; }; struct tcf_qevent { struct tcf_block *block; struct tcf_block_ext_info info; struct tcf_proto __rcu *filter_chain; }; struct tcf_block_cb; bool tcf_queue_work(struct rcu_work *rwork, work_func_t func); #ifdef CONFIG_NET_CLS struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, u32 chain_index); void tcf_chain_put_by_act(struct tcf_chain *chain); struct tcf_chain *tcf_get_next_chain(struct tcf_block *block, struct tcf_chain *chain); struct tcf_proto *tcf_get_next_proto(struct tcf_chain *chain, struct tcf_proto *tp); void tcf_block_netif_keep_dst(struct tcf_block *block); int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, struct netlink_ext_ack *extack); int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, struct tcf_block_ext_info *ei, struct netlink_ext_ack *extack); void tcf_block_put(struct tcf_block *block); void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei); int tcf_exts_init_ex(struct tcf_exts *exts, struct net *net, int action, int police, struct tcf_proto *tp, u32 handle, bool used_action_miss); static inline bool tcf_block_shared(struct tcf_block *block) { return block->index; } static inline bool tcf_block_non_null_shared(struct tcf_block *block) { return block && block->index; } static inline struct Qdisc *tcf_block_q(struct tcf_block *block) { WARN_ON(tcf_block_shared(block)); return block->q; } int tcf_classify(struct sk_buff *skb, const struct tcf_block *block, const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode); static inline bool tc_cls_stats_dump(struct tcf_proto *tp, struct tcf_walker *arg, void *filter) { if (arg->count >= arg->skip && arg->fn(tp, filter, arg) < 0) { arg->stop = 1; return false; } arg->count++; return true; } #else static inline bool tcf_block_shared(struct tcf_block *block) { return false; } static inline bool tcf_block_non_null_shared(struct tcf_block *block) { return false; } static inline int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, struct netlink_ext_ack *extack) { return 0; } static inline int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, struct tcf_block_ext_info *ei, struct netlink_ext_ack *extack) { return 0; } static inline void tcf_block_put(struct tcf_block *block) { } static inline void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei) { } static inline struct Qdisc *tcf_block_q(struct tcf_block *block) { return NULL; } static inline int tcf_classify(struct sk_buff *skb, const struct tcf_block *block, const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode) { return TC_ACT_UNSPEC; } #endif static inline unsigned long __cls_set_class(unsigned long *clp, unsigned long cl) { return xchg(clp, cl); } static inline void __tcf_bind_filter(struct Qdisc *q, struct tcf_result *r, unsigned long base) { unsigned long cl; cl = q->ops->cl_ops->bind_tcf(q, base, r->classid); cl = __cls_set_class(&r->class, cl); if (cl) q->ops->cl_ops->unbind_tcf(q, cl); } static inline void tcf_bind_filter(struct tcf_proto *tp, struct tcf_result *r, unsigned long base) { struct Qdisc *q = tp->chain->block->q; /* Check q as it is not set for shared blocks. In that case, * setting class is not supported. */ if (!q) return; sch_tree_lock(q); __tcf_bind_filter(q, r, base); sch_tree_unlock(q); } static inline void __tcf_unbind_filter(struct Qdisc *q, struct tcf_result *r) { unsigned long cl; if ((cl = __cls_set_class(&r->class, 0)) != 0) q->ops->cl_ops->unbind_tcf(q, cl); } static inline void tcf_unbind_filter(struct tcf_proto *tp, struct tcf_result *r) { struct Qdisc *q = tp->chain->block->q; if (!q) return; __tcf_unbind_filter(q, r); } static inline void tc_cls_bind_class(u32 classid, unsigned long cl, void *q, struct tcf_result *res, unsigned long base) { if (res->classid == classid) { if (cl) __tcf_bind_filter(q, res, base); else __tcf_unbind_filter(q, res); } } struct tcf_exts { #ifdef CONFIG_NET_CLS_ACT __u32 type; /* for backward compat(TCA_OLD_COMPAT) */ int nr_actions; struct tc_action **actions; struct net *net; netns_tracker ns_tracker; struct tcf_exts_miss_cookie_node *miss_cookie_node; #endif /* Map to export classifier specific extension TLV types to the * generic extensions API. Unsupported extensions must be set to 0. */ int action; int police; }; static inline int tcf_exts_init(struct tcf_exts *exts, struct net *net, int action, int police) { #ifdef CONFIG_NET_CLS return tcf_exts_init_ex(exts, net, action, police, NULL, 0, false); #else return -EOPNOTSUPP; #endif } /* Return false if the netns is being destroyed in cleanup_net(). Callers * need to do cleanup synchronously in this case, otherwise may race with * tc_action_net_exit(). Return true for other cases. */ static inline bool tcf_exts_get_net(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT exts->net = maybe_get_net(exts->net); if (exts->net) netns_tracker_alloc(exts->net, &exts->ns_tracker, GFP_KERNEL); return exts->net != NULL; #else return true; #endif } static inline void tcf_exts_put_net(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT if (exts->net) put_net_track(exts->net, &exts->ns_tracker); #endif } #ifdef CONFIG_NET_CLS_ACT #define tcf_exts_for_each_action(i, a, exts) \ for (i = 0; i < TCA_ACT_MAX_PRIO && ((a) = (exts)->actions[i]); i++) #else #define tcf_exts_for_each_action(i, a, exts) \ for (; 0; (void)(i), (void)(a), (void)(exts)) #endif #define tcf_act_for_each_action(i, a, actions) \ for (i = 0; i < TCA_ACT_MAX_PRIO && ((a) = actions[i]); i++) static inline bool tc_act_in_hw(struct tc_action *act) { return !!act->in_hw_count; } static inline void tcf_exts_hw_stats_update(const struct tcf_exts *exts, struct flow_stats *stats, bool use_act_stats) { #ifdef CONFIG_NET_CLS_ACT int i; for (i = 0; i < exts->nr_actions; i++) { struct tc_action *a = exts->actions[i]; if (use_act_stats || tc_act_in_hw(a)) { if (!tcf_action_update_hw_stats(a)) continue; } preempt_disable(); tcf_action_stats_update(a, stats->bytes, stats->pkts, stats->drops, stats->lastused, true); preempt_enable(); a->used_hw_stats = stats->used_hw_stats; a->used_hw_stats_valid = stats->used_hw_stats_valid; } #endif } /** * tcf_exts_has_actions - check if at least one action is present * @exts: tc filter extensions handle * * Returns true if at least one action is present. */ static inline bool tcf_exts_has_actions(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT return exts->nr_actions; #else return false; #endif } /** * tcf_exts_exec - execute tc filter extensions * @skb: socket buffer * @exts: tc filter extensions handle * @res: desired result * * Executes all configured extensions. Returns TC_ACT_OK on a normal execution, * a negative number if the filter must be considered unmatched or * a positive action code (TC_ACT_*) which must be returned to the * underlying layer. */ static inline int tcf_exts_exec(struct sk_buff *skb, struct tcf_exts *exts, struct tcf_result *res) { #ifdef CONFIG_NET_CLS_ACT return tcf_action_exec(skb, exts->actions, exts->nr_actions, res); #endif return TC_ACT_OK; } static inline int tcf_exts_exec_ex(struct sk_buff *skb, struct tcf_exts *exts, int act_index, struct tcf_result *res) { #ifdef CONFIG_NET_CLS_ACT return tcf_action_exec(skb, exts->actions + act_index, exts->nr_actions - act_index, res); #else return TC_ACT_OK; #endif } int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct nlattr *rate_tlv, struct tcf_exts *exts, u32 flags, struct netlink_ext_ack *extack); int tcf_exts_validate_ex(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct nlattr *rate_tlv, struct tcf_exts *exts, u32 flags, u32 fl_flags, struct netlink_ext_ack *extack); void tcf_exts_destroy(struct tcf_exts *exts); void tcf_exts_change(struct tcf_exts *dst, struct tcf_exts *src); int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts); int tcf_exts_terse_dump(struct sk_buff *skb, struct tcf_exts *exts); int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts); /** * struct tcf_pkt_info - packet information * * @ptr: start of the pkt data * @nexthdr: offset of the next header */ struct tcf_pkt_info { unsigned char * ptr; int nexthdr; }; #ifdef CONFIG_NET_EMATCH struct tcf_ematch_ops; /** * struct tcf_ematch - extended match (ematch) * * @matchid: identifier to allow userspace to reidentify a match * @flags: flags specifying attributes and the relation to other matches * @ops: the operations lookup table of the corresponding ematch module * @datalen: length of the ematch specific configuration data * @data: ematch specific data * @net: the network namespace */ struct tcf_ematch { struct tcf_ematch_ops * ops; unsigned long data; unsigned int datalen; u16 matchid; u16 flags; struct net *net; }; static inline int tcf_em_is_container(struct tcf_ematch *em) { return !em->ops; } static inline int tcf_em_is_simple(struct tcf_ematch *em) { return em->flags & TCF_EM_SIMPLE; } static inline int tcf_em_is_inverted(struct tcf_ematch *em) { return em->flags & TCF_EM_INVERT; } static inline int tcf_em_last_match(struct tcf_ematch *em) { return (em->flags & TCF_EM_REL_MASK) == TCF_EM_REL_END; } static inline int tcf_em_early_end(struct tcf_ematch *em, int result) { if (tcf_em_last_match(em)) return 1; if (result == 0 && em->flags & TCF_EM_REL_AND) return 1; if (result != 0 && em->flags & TCF_EM_REL_OR) return 1; return 0; } /** * struct tcf_ematch_tree - ematch tree handle * * @hdr: ematch tree header supplied by userspace * @matches: array of ematches */ struct tcf_ematch_tree { struct tcf_ematch_tree_hdr hdr; struct tcf_ematch * matches; }; /** * struct tcf_ematch_ops - ematch module operations * * @kind: identifier (kind) of this ematch module * @datalen: length of expected configuration data (optional) * @change: called during validation (optional) * @match: called during ematch tree evaluation, must return 1/0 * @destroy: called during destroyage (optional) * @dump: called during dumping process (optional) * @owner: owner, must be set to THIS_MODULE * @link: link to previous/next ematch module (internal use) */ struct tcf_ematch_ops { int kind; int datalen; int (*change)(struct net *net, void *, int, struct tcf_ematch *); int (*match)(struct sk_buff *, struct tcf_ematch *, struct tcf_pkt_info *); void (*destroy)(struct tcf_ematch *); int (*dump)(struct sk_buff *, struct tcf_ematch *); struct module *owner; struct list_head link; }; int tcf_em_register(struct tcf_ematch_ops *); void tcf_em_unregister(struct tcf_ematch_ops *); int tcf_em_tree_validate(struct tcf_proto *, struct nlattr *, struct tcf_ematch_tree *); void tcf_em_tree_destroy(struct tcf_ematch_tree *); int tcf_em_tree_dump(struct sk_buff *, struct tcf_ematch_tree *, int); int __tcf_em_tree_match(struct sk_buff *, struct tcf_ematch_tree *, struct tcf_pkt_info *); /** * tcf_em_tree_match - evaulate an ematch tree * * @skb: socket buffer of the packet in question * @tree: ematch tree to be used for evaluation * @info: packet information examined by classifier * * This function matches @skb against the ematch tree in @tree by going * through all ematches respecting their logic relations returning * as soon as the result is obvious. * * Returns 1 if the ematch tree as-one matches, no ematches are configured * or ematch is not enabled in the kernel, otherwise 0 is returned. */ static inline int tcf_em_tree_match(struct sk_buff *skb, struct tcf_ematch_tree *tree, struct tcf_pkt_info *info) { if (tree->hdr.nmatches) return __tcf_em_tree_match(skb, tree, info); else return 1; } #define MODULE_ALIAS_TCF_EMATCH(kind) MODULE_ALIAS("ematch-kind-" __stringify(kind)) #else /* CONFIG_NET_EMATCH */ struct tcf_ematch_tree { }; #define tcf_em_tree_validate(tp, tb, t) ((void)(t), 0) #define tcf_em_tree_destroy(t) do { (void)(t); } while(0) #define tcf_em_tree_dump(skb, t, tlv) (0) #define tcf_em_tree_match(skb, t, info) ((void)(info), 1) #endif /* CONFIG_NET_EMATCH */ static inline unsigned char * tcf_get_base_ptr(struct sk_buff *skb, int layer) { switch (layer) { case TCF_LAYER_LINK: return skb_mac_header(skb); case TCF_LAYER_NETWORK: return skb_network_header(skb); case TCF_LAYER_TRANSPORT: return skb_transport_header(skb); } return NULL; } static inline int tcf_valid_offset(const struct sk_buff *skb, const unsigned char *ptr, const int len) { return likely((ptr + len) <= skb_tail_pointer(skb) && ptr >= skb->head && (ptr <= (ptr + len))); } static inline int tcf_change_indev(struct net *net, struct nlattr *indev_tlv, struct netlink_ext_ack *extack) { char indev[IFNAMSIZ]; struct net_device *dev; if (nla_strscpy(indev, indev_tlv, IFNAMSIZ) < 0) { NL_SET_ERR_MSG_ATTR(extack, indev_tlv, "Interface name too long"); return -EINVAL; } dev = __dev_get_by_name(net, indev); if (!dev) { NL_SET_ERR_MSG_ATTR(extack, indev_tlv, "Network device not found"); return -ENODEV; } return dev->ifindex; } static inline bool tcf_match_indev(struct sk_buff *skb, int ifindex) { if (!ifindex) return true; if (!skb->skb_iif) return false; return ifindex == skb->skb_iif; } int tc_setup_offload_action(struct flow_action *flow_action, const struct tcf_exts *exts, struct netlink_ext_ack *extack); void tc_cleanup_offload_action(struct flow_action *flow_action); int tc_setup_action(struct flow_action *flow_action, struct tc_action *actions[], u32 miss_cookie_base, struct netlink_ext_ack *extack); int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type, void *type_data, bool err_stop, bool rtnl_held); int tc_setup_cb_add(struct tcf_block *block, struct tcf_proto *tp, enum tc_setup_type type, void *type_data, bool err_stop, u32 *flags, unsigned int *in_hw_count, bool rtnl_held); int tc_setup_cb_replace(struct tcf_block *block, struct tcf_proto *tp, enum tc_setup_type type, void *type_data, bool err_stop, u32 *old_flags, unsigned int *old_in_hw_count, u32 *new_flags, unsigned int *new_in_hw_count, bool rtnl_held); int tc_setup_cb_destroy(struct tcf_block *block, struct tcf_proto *tp, enum tc_setup_type type, void *type_data, bool err_stop, u32 *flags, unsigned int *in_hw_count, bool rtnl_held); int tc_setup_cb_reoffload(struct tcf_block *block, struct tcf_proto *tp, bool add, flow_setup_cb_t *cb, enum tc_setup_type type, void *type_data, void *cb_priv, u32 *flags, unsigned int *in_hw_count); unsigned int tcf_exts_num_actions(struct tcf_exts *exts); #ifdef CONFIG_NET_CLS_ACT int tcf_qevent_init(struct tcf_qevent *qe, struct Qdisc *sch, enum flow_block_binder_type binder_type, struct nlattr *block_index_attr, struct netlink_ext_ack *extack); void tcf_qevent_destroy(struct tcf_qevent *qe, struct Qdisc *sch); int tcf_qevent_validate_change(struct tcf_qevent *qe, struct nlattr *block_index_attr, struct netlink_ext_ack *extack); struct sk_buff *tcf_qevent_handle(struct tcf_qevent *qe, struct Qdisc *sch, struct sk_buff *skb, struct sk_buff **to_free, int *ret); int tcf_qevent_dump(struct sk_buff *skb, int attr_name, struct tcf_qevent *qe); #else static inline int tcf_qevent_init(struct tcf_qevent *qe, struct Qdisc *sch, enum flow_block_binder_type binder_type, struct nlattr *block_index_attr, struct netlink_ext_ack *extack) { return 0; } static inline void tcf_qevent_destroy(struct tcf_qevent *qe, struct Qdisc *sch) { } static inline int tcf_qevent_validate_change(struct tcf_qevent *qe, struct nlattr *block_index_attr, struct netlink_ext_ack *extack) { return 0; } static inline struct sk_buff * tcf_qevent_handle(struct tcf_qevent *qe, struct Qdisc *sch, struct sk_buff *skb, struct sk_buff **to_free, int *ret) { return skb; } static inline int tcf_qevent_dump(struct sk_buff *skb, int attr_name, struct tcf_qevent *qe) { return 0; } #endif struct tc_cls_u32_knode { struct tcf_exts *exts; struct tcf_result *res; struct tc_u32_sel *sel; u32 handle; u32 val; u32 mask; u32 link_handle; u8 fshift; }; struct tc_cls_u32_hnode { u32 handle; u32 prio; unsigned int divisor; }; enum tc_clsu32_command { TC_CLSU32_NEW_KNODE, TC_CLSU32_REPLACE_KNODE, TC_CLSU32_DELETE_KNODE, TC_CLSU32_NEW_HNODE, TC_CLSU32_REPLACE_HNODE, TC_CLSU32_DELETE_HNODE, }; struct tc_cls_u32_offload { struct flow_cls_common_offload common; /* knode values */ enum tc_clsu32_command command; union { struct tc_cls_u32_knode knode; struct tc_cls_u32_hnode hnode; }; }; static inline bool tc_can_offload(const struct net_device *dev) { return dev->features & NETIF_F_HW_TC; } static inline bool tc_can_offload_extack(const struct net_device *dev, struct netlink_ext_ack *extack) { bool can = tc_can_offload(dev); if (!can) NL_SET_ERR_MSG(extack, "TC offload is disabled on net device"); return can; } static inline bool tc_cls_can_offload_and_chain0(const struct net_device *dev, struct flow_cls_common_offload *common) { if (!tc_can_offload_extack(dev, common->extack)) return false; if (common->chain_index) { NL_SET_ERR_MSG(common->extack, "Driver supports only offload of chain 0"); return false; } return true; } static inline bool tc_skip_hw(u32 flags) { return (flags & TCA_CLS_FLAGS_SKIP_HW) ? true : false; } static inline bool tc_skip_sw(u32 flags) { return (flags & TCA_CLS_FLAGS_SKIP_SW) ? true : false; } /* SKIP_HW and SKIP_SW are mutually exclusive flags. */ static inline bool tc_flags_valid(u32 flags) { if (flags & ~(TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW | TCA_CLS_FLAGS_VERBOSE)) return false; flags &= TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW; if (!(flags ^ (TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW))) return false; return true; } static inline bool tc_in_hw(u32 flags) { return (flags & TCA_CLS_FLAGS_IN_HW) ? true : false; } static inline void tc_cls_common_offload_init(struct flow_cls_common_offload *cls_common, const struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { cls_common->chain_index = tp->chain->index; cls_common->protocol = tp->protocol; cls_common->prio = tp->prio >> 16; if (tc_skip_sw(flags) || flags & TCA_CLS_FLAGS_VERBOSE) cls_common->extack = extack; } #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) static inline struct tc_skb_ext *tc_skb_ext_alloc(struct sk_buff *skb) { struct tc_skb_ext *tc_skb_ext = skb_ext_add(skb, TC_SKB_EXT); if (tc_skb_ext) memset(tc_skb_ext, 0, sizeof(*tc_skb_ext)); return tc_skb_ext; } #endif enum tc_matchall_command { TC_CLSMATCHALL_REPLACE, TC_CLSMATCHALL_DESTROY, TC_CLSMATCHALL_STATS, }; struct tc_cls_matchall_offload { struct flow_cls_common_offload common; enum tc_matchall_command command; struct flow_rule *rule; struct flow_stats stats; bool use_act_stats; unsigned long cookie; }; enum tc_clsbpf_command { TC_CLSBPF_OFFLOAD, TC_CLSBPF_STATS, }; struct tc_cls_bpf_offload { struct flow_cls_common_offload common; enum tc_clsbpf_command command; struct tcf_exts *exts; struct bpf_prog *prog; struct bpf_prog *oldprog; const char *name; bool exts_integrated; }; /* This structure holds cookie structure that is passed from user * to the kernel for actions and classifiers */ struct tc_cookie { u8 *data; u32 len; struct rcu_head rcu; }; struct tc_qopt_offload_stats { struct gnet_stats_basic_sync *bstats; struct gnet_stats_queue *qstats; }; enum tc_mq_command { TC_MQ_CREATE, TC_MQ_DESTROY, TC_MQ_STATS, TC_MQ_GRAFT, }; struct tc_mq_opt_offload_graft_params { unsigned long queue; u32 child_handle; }; struct tc_mq_qopt_offload { enum tc_mq_command command; u32 handle; union { struct tc_qopt_offload_stats stats; struct tc_mq_opt_offload_graft_params graft_params; }; }; enum tc_htb_command { /* Root */ TC_HTB_CREATE, /* Initialize HTB offload. */ TC_HTB_DESTROY, /* Destroy HTB offload. */ /* Classes */ /* Allocate qid and create leaf. */ TC_HTB_LEAF_ALLOC_QUEUE, /* Convert leaf to inner, preserve and return qid, create new leaf. */ TC_HTB_LEAF_TO_INNER, /* Delete leaf, while siblings remain. */ TC_HTB_LEAF_DEL, /* Delete leaf, convert parent to leaf, preserving qid. */ TC_HTB_LEAF_DEL_LAST, /* TC_HTB_LEAF_DEL_LAST, but delete driver data on hardware errors. */ TC_HTB_LEAF_DEL_LAST_FORCE, /* Modify parameters of a node. */ TC_HTB_NODE_MODIFY, /* Class qdisc */ TC_HTB_LEAF_QUERY_QUEUE, /* Query qid by classid. */ }; struct tc_htb_qopt_offload { struct netlink_ext_ack *extack; enum tc_htb_command command; u32 parent_classid; u16 classid; u16 qid; u32 quantum; u64 rate; u64 ceil; u8 prio; }; #define TC_HTB_CLASSID_ROOT U32_MAX enum tc_red_command { TC_RED_REPLACE, TC_RED_DESTROY, TC_RED_STATS, TC_RED_XSTATS, TC_RED_GRAFT, }; struct tc_red_qopt_offload_params { u32 min; u32 max; u32 probability; u32 limit; bool is_ecn; bool is_harddrop; bool is_nodrop; struct gnet_stats_queue *qstats; }; struct tc_red_qopt_offload { enum tc_red_command command; u32 handle; u32 parent; union { struct tc_red_qopt_offload_params set; struct tc_qopt_offload_stats stats; struct red_stats *xstats; u32 child_handle; }; }; enum tc_gred_command { TC_GRED_REPLACE, TC_GRED_DESTROY, TC_GRED_STATS, }; struct tc_gred_vq_qopt_offload_params { bool present; u32 limit; u32 prio; u32 min; u32 max; bool is_ecn; bool is_harddrop; u32 probability; /* Only need backlog, see struct tc_prio_qopt_offload_params */ u32 *backlog; }; struct tc_gred_qopt_offload_params { bool grio_on; bool wred_on; unsigned int dp_cnt; unsigned int dp_def; struct gnet_stats_queue *qstats; struct tc_gred_vq_qopt_offload_params tab[MAX_DPs]; }; struct tc_gred_qopt_offload_stats { struct gnet_stats_basic_sync bstats[MAX_DPs]; struct gnet_stats_queue qstats[MAX_DPs]; struct red_stats *xstats[MAX_DPs]; }; struct tc_gred_qopt_offload { enum tc_gred_command command; u32 handle; u32 parent; union { struct tc_gred_qopt_offload_params set; struct tc_gred_qopt_offload_stats stats; }; }; enum tc_prio_command { TC_PRIO_REPLACE, TC_PRIO_DESTROY, TC_PRIO_STATS, TC_PRIO_GRAFT, }; struct tc_prio_qopt_offload_params { int bands; u8 priomap[TC_PRIO_MAX + 1]; /* At the point of un-offloading the Qdisc, the reported backlog and * qlen need to be reduced by the portion that is in HW. */ struct gnet_stats_queue *qstats; }; struct tc_prio_qopt_offload_graft_params { u8 band; u32 child_handle; }; struct tc_prio_qopt_offload { enum tc_prio_command command; u32 handle; u32 parent; union { struct tc_prio_qopt_offload_params replace_params; struct tc_qopt_offload_stats stats; struct tc_prio_qopt_offload_graft_params graft_params; }; }; enum tc_root_command { TC_ROOT_GRAFT, }; struct tc_root_qopt_offload { enum tc_root_command command; u32 handle; bool ingress; }; enum tc_ets_command { TC_ETS_REPLACE, TC_ETS_DESTROY, TC_ETS_STATS, TC_ETS_GRAFT, }; struct tc_ets_qopt_offload_replace_params { unsigned int bands; u8 priomap[TC_PRIO_MAX + 1]; unsigned int quanta[TCQ_ETS_MAX_BANDS]; /* 0 for strict bands. */ unsigned int weights[TCQ_ETS_MAX_BANDS]; struct gnet_stats_queue *qstats; }; struct tc_ets_qopt_offload_graft_params { u8 band; u32 child_handle; }; struct tc_ets_qopt_offload { enum tc_ets_command command; u32 handle; u32 parent; union { struct tc_ets_qopt_offload_replace_params replace_params; struct tc_qopt_offload_stats stats; struct tc_ets_qopt_offload_graft_params graft_params; }; }; enum tc_tbf_command { TC_TBF_REPLACE, TC_TBF_DESTROY, TC_TBF_STATS, TC_TBF_GRAFT, }; struct tc_tbf_qopt_offload_replace_params { struct psched_ratecfg rate; u32 max_size; struct gnet_stats_queue *qstats; }; struct tc_tbf_qopt_offload { enum tc_tbf_command command; u32 handle; u32 parent; union { struct tc_tbf_qopt_offload_replace_params replace_params; struct tc_qopt_offload_stats stats; u32 child_handle; }; }; enum tc_fifo_command { TC_FIFO_REPLACE, TC_FIFO_DESTROY, TC_FIFO_STATS, }; struct tc_fifo_qopt_offload { enum tc_fifo_command command; u32 handle; u32 parent; union { struct tc_qopt_offload_stats stats; }; }; #ifdef CONFIG_NET_CLS_ACT DECLARE_STATIC_KEY_FALSE(tc_skb_ext_tc); void tc_skb_ext_tc_enable(void); void tc_skb_ext_tc_disable(void); #define tc_skb_ext_tc_enabled() static_branch_unlikely(&tc_skb_ext_tc) #else /* CONFIG_NET_CLS_ACT */ static inline void tc_skb_ext_tc_enable(void) { } static inline void tc_skb_ext_tc_disable(void) { } #define tc_skb_ext_tc_enabled() false #endif #endif
1394 1393 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * linux/drivers/char/serial_core.h * * Copyright (C) 2000 Deep Blue Solutions Ltd. */ #ifndef LINUX_SERIAL_CORE_H #define LINUX_SERIAL_CORE_H #include <linux/bitops.h> #include <linux/compiler.h> #include <linux/console.h> #include <linux/interrupt.h> #include <linux/circ_buf.h> #include <linux/spinlock.h> #include <linux/sched.h> #include <linux/tty.h> #include <linux/mutex.h> #include <linux/sysrq.h> #include <uapi/linux/serial_core.h> #ifdef CONFIG_SERIAL_CORE_CONSOLE #define uart_console(port) \ ((port)->cons && (port)->cons->index == (port)->line) #else #define uart_console(port) ({ (void)port; 0; }) #endif struct uart_port; struct serial_struct; struct serial_port_device; struct device; struct gpio_desc; /** * struct uart_ops -- interface between serial_core and the driver * * This structure describes all the operations that can be done on the * physical hardware. * * @tx_empty: ``unsigned int ()(struct uart_port *port)`` * * This function tests whether the transmitter fifo and shifter for the * @port is empty. If it is empty, this function should return * %TIOCSER_TEMT, otherwise return 0. If the port does not support this * operation, then it should return %TIOCSER_TEMT. * * Locking: none. * Interrupts: caller dependent. * This call must not sleep * * @set_mctrl: ``void ()(struct uart_port *port, unsigned int mctrl)`` * * This function sets the modem control lines for @port to the state * described by @mctrl. The relevant bits of @mctrl are: * * - %TIOCM_RTS RTS signal. * - %TIOCM_DTR DTR signal. * - %TIOCM_OUT1 OUT1 signal. * - %TIOCM_OUT2 OUT2 signal. * - %TIOCM_LOOP Set the port into loopback mode. * * If the appropriate bit is set, the signal should be driven * active. If the bit is clear, the signal should be driven * inactive. * * Locking: @port->lock taken. * Interrupts: locally disabled. * This call must not sleep * * @get_mctrl: ``unsigned int ()(struct uart_port *port)`` * * Returns the current state of modem control inputs of @port. The state * of the outputs should not be returned, since the core keeps track of * their state. The state information should include: * * - %TIOCM_CAR state of DCD signal * - %TIOCM_CTS state of CTS signal * - %TIOCM_DSR state of DSR signal * - %TIOCM_RI state of RI signal * * The bit is set if the signal is currently driven active. If * the port does not support CTS, DCD or DSR, the driver should * indicate that the signal is permanently active. If RI is * not available, the signal should not be indicated as active. * * Locking: @port->lock taken. * Interrupts: locally disabled. * This call must not sleep * * @stop_tx: ``void ()(struct uart_port *port)`` * * Stop transmitting characters. This might be due to the CTS line * becoming inactive or the tty layer indicating we want to stop * transmission due to an %XOFF character. * * The driver should stop transmitting characters as soon as possible. * * Locking: @port->lock taken. * Interrupts: locally disabled. * This call must not sleep * * @start_tx: ``void ()(struct uart_port *port)`` * * Start transmitting characters. * * Locking: @port->lock taken. * Interrupts: locally disabled. * This call must not sleep * * @throttle: ``void ()(struct uart_port *port)`` * * Notify the serial driver that input buffers for the line discipline are * close to full, and it should somehow signal that no more characters * should be sent to the serial port. * This will be called only if hardware assisted flow control is enabled. * * Locking: serialized with @unthrottle() and termios modification by the * tty layer. * * @unthrottle: ``void ()(struct uart_port *port)`` * * Notify the serial driver that characters can now be sent to the serial * port without fear of overrunning the input buffers of the line * disciplines. * * This will be called only if hardware assisted flow control is enabled. * * Locking: serialized with @throttle() and termios modification by the * tty layer. * * @send_xchar: ``void ()(struct uart_port *port, char ch)`` * * Transmit a high priority character, even if the port is stopped. This * is used to implement XON/XOFF flow control and tcflow(). If the serial * driver does not implement this function, the tty core will append the * character to the circular buffer and then call start_tx() / stop_tx() * to flush the data out. * * Do not transmit if @ch == '\0' (%__DISABLED_CHAR). * * Locking: none. * Interrupts: caller dependent. * * @start_rx: ``void ()(struct uart_port *port)`` * * Start receiving characters. * * Locking: @port->lock taken. * Interrupts: locally disabled. * This call must not sleep * * @stop_rx: ``void ()(struct uart_port *port)`` * * Stop receiving characters; the @port is in the process of being closed. * * Locking: @port->lock taken. * Interrupts: locally disabled. * This call must not sleep * * @enable_ms: ``void ()(struct uart_port *port)`` * * Enable the modem status interrupts. * * This method may be called multiple times. Modem status interrupts * should be disabled when the @shutdown() method is called. * * Locking: @port->lock taken. * Interrupts: locally disabled. * This call must not sleep * * @break_ctl: ``void ()(struct uart_port *port, int ctl)`` * * Control the transmission of a break signal. If @ctl is nonzero, the * break signal should be transmitted. The signal should be terminated * when another call is made with a zero @ctl. * * Locking: caller holds tty_port->mutex * * @startup: ``int ()(struct uart_port *port)`` * * Grab any interrupt resources and initialise any low level driver state. * Enable the port for reception. It should not activate RTS nor DTR; * this will be done via a separate call to @set_mctrl(). * * This method will only be called when the port is initially opened. * * Locking: port_sem taken. * Interrupts: globally disabled. * * @shutdown: ``void ()(struct uart_port *port)`` * * Disable the @port, disable any break condition that may be in effect, * and free any interrupt resources. It should not disable RTS nor DTR; * this will have already been done via a separate call to @set_mctrl(). * * Drivers must not access @port->state once this call has completed. * * This method will only be called when there are no more users of this * @port. * * Locking: port_sem taken. * Interrupts: caller dependent. * * @flush_buffer: ``void ()(struct uart_port *port)`` * * Flush any write buffers, reset any DMA state and stop any ongoing DMA * transfers. * * This will be called whenever the @port->state->xmit circular buffer is * cleared. * * Locking: @port->lock taken. * Interrupts: locally disabled. * This call must not sleep * * @set_termios: ``void ()(struct uart_port *port, struct ktermios *new, * struct ktermios *old)`` * * Change the @port parameters, including word length, parity, stop bits. * Update @port->read_status_mask and @port->ignore_status_mask to * indicate the types of events we are interested in receiving. Relevant * ktermios::c_cflag bits are: * * - %CSIZE - word size * - %CSTOPB - 2 stop bits * - %PARENB - parity enable * - %PARODD - odd parity (when %PARENB is in force) * - %ADDRB - address bit (changed through uart_port::rs485_config()). * - %CREAD - enable reception of characters (if not set, still receive * characters from the port, but throw them away). * - %CRTSCTS - if set, enable CTS status change reporting. * - %CLOCAL - if not set, enable modem status change reporting. * * Relevant ktermios::c_iflag bits are: * * - %INPCK - enable frame and parity error events to be passed to the TTY * layer. * - %BRKINT / %PARMRK - both of these enable break events to be passed to * the TTY layer. * - %IGNPAR - ignore parity and framing errors. * - %IGNBRK - ignore break errors. If %IGNPAR is also set, ignore overrun * errors as well. * * The interaction of the ktermios::c_iflag bits is as follows (parity * error given as an example): * * ============ ======= ======= ========================================= * Parity error INPCK IGNPAR * ============ ======= ======= ========================================= * n/a 0 n/a character received, marked as %TTY_NORMAL * None 1 n/a character received, marked as %TTY_NORMAL * Yes 1 0 character received, marked as %TTY_PARITY * Yes 1 1 character discarded * ============ ======= ======= ========================================= * * Other flags may be used (eg, xon/xoff characters) if your hardware * supports hardware "soft" flow control. * * Locking: caller holds tty_port->mutex * Interrupts: caller dependent. * This call must not sleep * * @set_ldisc: ``void ()(struct uart_port *port, struct ktermios *termios)`` * * Notifier for discipline change. See * Documentation/driver-api/tty/tty_ldisc.rst. * * Locking: caller holds tty_port->mutex * * @pm: ``void ()(struct uart_port *port, unsigned int state, * unsigned int oldstate)`` * * Perform any power management related activities on the specified @port. * @state indicates the new state (defined by enum uart_pm_state), * @oldstate indicates the previous state. * * This function should not be used to grab any resources. * * This will be called when the @port is initially opened and finally * closed, except when the @port is also the system console. This will * occur even if %CONFIG_PM is not set. * * Locking: none. * Interrupts: caller dependent. * * @type: ``const char *()(struct uart_port *port)`` * * Return a pointer to a string constant describing the specified @port, * or return %NULL, in which case the string 'unknown' is substituted. * * Locking: none. * Interrupts: caller dependent. * * @release_port: ``void ()(struct uart_port *port)`` * * Release any memory and IO region resources currently in use by the * @port. * * Locking: none. * Interrupts: caller dependent. * * @request_port: ``int ()(struct uart_port *port)`` * * Request any memory and IO region resources required by the port. If any * fail, no resources should be registered when this function returns, and * it should return -%EBUSY on failure. * * Locking: none. * Interrupts: caller dependent. * * @config_port: ``void ()(struct uart_port *port, int type)`` * * Perform any autoconfiguration steps required for the @port. @type * contains a bit mask of the required configuration. %UART_CONFIG_TYPE * indicates that the port requires detection and identification. * @port->type should be set to the type found, or %PORT_UNKNOWN if no * port was detected. * * %UART_CONFIG_IRQ indicates autoconfiguration of the interrupt signal, * which should be probed using standard kernel autoprobing techniques. * This is not necessary on platforms where ports have interrupts * internally hard wired (eg, system on a chip implementations). * * Locking: none. * Interrupts: caller dependent. * * @verify_port: ``int ()(struct uart_port *port, * struct serial_struct *serinfo)`` * * Verify the new serial port information contained within @serinfo is * suitable for this port type. * * Locking: none. * Interrupts: caller dependent. * * @ioctl: ``int ()(struct uart_port *port, unsigned int cmd, * unsigned long arg)`` * * Perform any port specific IOCTLs. IOCTL commands must be defined using * the standard numbering system found in <asm/ioctl.h>. * * Locking: none. * Interrupts: caller dependent. * * @poll_init: ``int ()(struct uart_port *port)`` * * Called by kgdb to perform the minimal hardware initialization needed to * support @poll_put_char() and @poll_get_char(). Unlike @startup(), this * should not request interrupts. * * Locking: %tty_mutex and tty_port->mutex taken. * Interrupts: n/a. * * @poll_put_char: ``void ()(struct uart_port *port, unsigned char ch)`` * * Called by kgdb to write a single character @ch directly to the serial * @port. It can and should block until there is space in the TX FIFO. * * Locking: none. * Interrupts: caller dependent. * This call must not sleep * * @poll_get_char: ``int ()(struct uart_port *port)`` * * Called by kgdb to read a single character directly from the serial * port. If data is available, it should be returned; otherwise the * function should return %NO_POLL_CHAR immediately. * * Locking: none. * Interrupts: caller dependent. * This call must not sleep */ struct uart_ops { unsigned int (*tx_empty)(struct uart_port *); void (*set_mctrl)(struct uart_port *, unsigned int mctrl); unsigned int (*get_mctrl)(struct uart_port *); void (*stop_tx)(struct uart_port *); void (*start_tx)(struct uart_port *); void (*throttle)(struct uart_port *); void (*unthrottle)(struct uart_port *); void (*send_xchar)(struct uart_port *, char ch); void (*stop_rx)(struct uart_port *); void (*start_rx)(struct uart_port *); void (*enable_ms)(struct uart_port *); void (*break_ctl)(struct uart_port *, int ctl); int (*startup)(struct uart_port *); void (*shutdown)(struct uart_port *); void (*flush_buffer)(struct uart_port *); void (*set_termios)(struct uart_port *, struct ktermios *new, const struct ktermios *old); void (*set_ldisc)(struct uart_port *, struct ktermios *); void (*pm)(struct uart_port *, unsigned int state, unsigned int oldstate); const char *(*type)(struct uart_port *); void (*release_port)(struct uart_port *); int (*request_port)(struct uart_port *); void (*config_port)(struct uart_port *, int); int (*verify_port)(struct uart_port *, struct serial_struct *); int (*ioctl)(struct uart_port *, unsigned int, unsigned long); #ifdef CONFIG_CONSOLE_POLL int (*poll_init)(struct uart_port *); void (*poll_put_char)(struct uart_port *, unsigned char); int (*poll_get_char)(struct uart_port *); #endif }; #define NO_POLL_CHAR 0x00ff0000 #define UART_CONFIG_TYPE (1 << 0) #define UART_CONFIG_IRQ (1 << 1) struct uart_icount { __u32 cts; __u32 dsr; __u32 rng; __u32 dcd; __u32 rx; __u32 tx; __u32 frame; __u32 overrun; __u32 parity; __u32 brk; __u32 buf_overrun; }; typedef u64 __bitwise upf_t; typedef unsigned int __bitwise upstat_t; struct uart_port { spinlock_t lock; /* port lock */ unsigned long iobase; /* in/out[bwl] */ unsigned char __iomem *membase; /* read/write[bwl] */ unsigned int (*serial_in)(struct uart_port *, int); void (*serial_out)(struct uart_port *, int, int); void (*set_termios)(struct uart_port *, struct ktermios *new, const struct ktermios *old); void (*set_ldisc)(struct uart_port *, struct ktermios *); unsigned int (*get_mctrl)(struct uart_port *); void (*set_mctrl)(struct uart_port *, unsigned int); unsigned int (*get_divisor)(struct uart_port *, unsigned int baud, unsigned int *frac); void (*set_divisor)(struct uart_port *, unsigned int baud, unsigned int quot, unsigned int quot_frac); int (*startup)(struct uart_port *port); void (*shutdown)(struct uart_port *port); void (*throttle)(struct uart_port *port); void (*unthrottle)(struct uart_port *port); int (*handle_irq)(struct uart_port *); void (*pm)(struct uart_port *, unsigned int state, unsigned int old); void (*handle_break)(struct uart_port *); int (*rs485_config)(struct uart_port *, struct ktermios *termios, struct serial_rs485 *rs485); int (*iso7816_config)(struct uart_port *, struct serial_iso7816 *iso7816); unsigned int ctrl_id; /* optional serial core controller id */ unsigned int port_id; /* optional serial core port id */ unsigned int irq; /* irq number */ unsigned long irqflags; /* irq flags */ unsigned int uartclk; /* base uart clock */ unsigned int fifosize; /* tx fifo size */ unsigned char x_char; /* xon/xoff char */ unsigned char regshift; /* reg offset shift */ unsigned char iotype; /* io access style */ unsigned char quirks; /* internal quirks */ #define UPIO_PORT (SERIAL_IO_PORT) /* 8b I/O port access */ #define UPIO_HUB6 (SERIAL_IO_HUB6) /* Hub6 ISA card */ #define UPIO_MEM (SERIAL_IO_MEM) /* driver-specific */ #define UPIO_MEM32 (SERIAL_IO_MEM32) /* 32b little endian */ #define UPIO_AU (SERIAL_IO_AU) /* Au1x00 and RT288x type IO */ #define UPIO_TSI (SERIAL_IO_TSI) /* Tsi108/109 type IO */ #define UPIO_MEM32BE (SERIAL_IO_MEM32BE) /* 32b big endian */ #define UPIO_MEM16 (SERIAL_IO_MEM16) /* 16b little endian */ /* quirks must be updated while holding port mutex */ #define UPQ_NO_TXEN_TEST BIT(0) unsigned int read_status_mask; /* driver specific */ unsigned int ignore_status_mask; /* driver specific */ struct uart_state *state; /* pointer to parent state */ struct uart_icount icount; /* statistics */ struct console *cons; /* struct console, if any */ /* flags must be updated while holding port mutex */ upf_t flags; /* * These flags must be equivalent to the flags defined in * include/uapi/linux/tty_flags.h which are the userspace definitions * assigned from the serial_struct flags in uart_set_info() * [for bit definitions in the UPF_CHANGE_MASK] * * Bits [0..ASYNCB_LAST_USER] are userspace defined/visible/changeable * The remaining bits are serial-core specific and not modifiable by * userspace. */ #define UPF_FOURPORT ((__force upf_t) ASYNC_FOURPORT /* 1 */ ) #define UPF_SAK ((__force upf_t) ASYNC_SAK /* 2 */ ) #define UPF_SPD_HI ((__force upf_t) ASYNC_SPD_HI /* 4 */ ) #define UPF_SPD_VHI ((__force upf_t) ASYNC_SPD_VHI /* 5 */ ) #define UPF_SPD_CUST ((__force upf_t) ASYNC_SPD_CUST /* 0x0030 */ ) #define UPF_SPD_WARP ((__force upf_t) ASYNC_SPD_WARP /* 0x1010 */ ) #define UPF_SPD_MASK ((__force upf_t) ASYNC_SPD_MASK /* 0x1030 */ ) #define UPF_SKIP_TEST ((__force upf_t) ASYNC_SKIP_TEST /* 6 */ ) #define UPF_AUTO_IRQ ((__force upf_t) ASYNC_AUTO_IRQ /* 7 */ ) #define UPF_HARDPPS_CD ((__force upf_t) ASYNC_HARDPPS_CD /* 11 */ ) #define UPF_SPD_SHI ((__force upf_t) ASYNC_SPD_SHI /* 12 */ ) #define UPF_LOW_LATENCY ((__force upf_t) ASYNC_LOW_LATENCY /* 13 */ ) #define UPF_BUGGY_UART ((__force upf_t) ASYNC_BUGGY_UART /* 14 */ ) #define UPF_MAGIC_MULTIPLIER ((__force upf_t) ASYNC_MAGIC_MULTIPLIER /* 16 */ ) #define UPF_NO_THRE_TEST ((__force upf_t) BIT_ULL(19)) /* Port has hardware-assisted h/w flow control */ #define UPF_AUTO_CTS ((__force upf_t) BIT_ULL(20)) #define UPF_AUTO_RTS ((__force upf_t) BIT_ULL(21)) #define UPF_HARD_FLOW ((__force upf_t) (UPF_AUTO_CTS | UPF_AUTO_RTS)) /* Port has hardware-assisted s/w flow control */ #define UPF_SOFT_FLOW ((__force upf_t) BIT_ULL(22)) #define UPF_CONS_FLOW ((__force upf_t) BIT_ULL(23)) #define UPF_SHARE_IRQ ((__force upf_t) BIT_ULL(24)) #define UPF_EXAR_EFR ((__force upf_t) BIT_ULL(25)) #define UPF_BUG_THRE ((__force upf_t) BIT_ULL(26)) /* The exact UART type is known and should not be probed. */ #define UPF_FIXED_TYPE ((__force upf_t) BIT_ULL(27)) #define UPF_BOOT_AUTOCONF ((__force upf_t) BIT_ULL(28)) #define UPF_FIXED_PORT ((__force upf_t) BIT_ULL(29)) #define UPF_DEAD ((__force upf_t) BIT_ULL(30)) #define UPF_IOREMAP ((__force upf_t) BIT_ULL(31)) #define UPF_FULL_PROBE ((__force upf_t) BIT_ULL(32)) #define __UPF_CHANGE_MASK 0x17fff #define UPF_CHANGE_MASK ((__force upf_t) __UPF_CHANGE_MASK) #define UPF_USR_MASK ((__force upf_t) (UPF_SPD_MASK|UPF_LOW_LATENCY)) #if __UPF_CHANGE_MASK > ASYNC_FLAGS #error Change mask not equivalent to userspace-visible bit defines #endif /* * Must hold termios_rwsem, port mutex and port lock to change; * can hold any one lock to read. */ upstat_t status; #define UPSTAT_CTS_ENABLE ((__force upstat_t) (1 << 0)) #define UPSTAT_DCD_ENABLE ((__force upstat_t) (1 << 1)) #define UPSTAT_AUTORTS ((__force upstat_t) (1 << 2)) #define UPSTAT_AUTOCTS ((__force upstat_t) (1 << 3)) #define UPSTAT_AUTOXOFF ((__force upstat_t) (1 << 4)) #define UPSTAT_SYNC_FIFO ((__force upstat_t) (1 << 5)) bool hw_stopped; /* sw-assisted CTS flow state */ unsigned int mctrl; /* current modem ctrl settings */ unsigned int frame_time; /* frame timing in ns */ unsigned int type; /* port type */ const struct uart_ops *ops; unsigned int custom_divisor; unsigned int line; /* port index */ unsigned int minor; resource_size_t mapbase; /* for ioremap */ resource_size_t mapsize; struct device *dev; /* serial port physical parent device */ struct serial_port_device *port_dev; /* serial core port device */ unsigned long sysrq; /* sysrq timeout */ u8 sysrq_ch; /* char for sysrq */ unsigned char has_sysrq; unsigned char sysrq_seq; /* index in sysrq_toggle_seq */ unsigned char hub6; /* this should be in the 8250 driver */ unsigned char suspended; unsigned char console_reinit; const char *name; /* port name */ struct attribute_group *attr_group; /* port specific attributes */ const struct attribute_group **tty_groups; /* all attributes (serial core use only) */ struct serial_rs485 rs485; struct serial_rs485 rs485_supported; /* Supported mask for serial_rs485 */ struct gpio_desc *rs485_term_gpio; /* enable RS485 bus termination */ struct gpio_desc *rs485_rx_during_tx_gpio; /* Output GPIO that sets the state of RS485 RX during TX */ struct serial_iso7816 iso7816; void *private_data; /* generic platform data pointer */ }; static inline int serial_port_in(struct uart_port *up, int offset) { return up->serial_in(up, offset); } static inline void serial_port_out(struct uart_port *up, int offset, int value) { up->serial_out(up, offset, value); } /** * enum uart_pm_state - power states for UARTs * @UART_PM_STATE_ON: UART is powered, up and operational * @UART_PM_STATE_OFF: UART is powered off * @UART_PM_STATE_UNDEFINED: sentinel */ enum uart_pm_state { UART_PM_STATE_ON = 0, UART_PM_STATE_OFF = 3, /* number taken from ACPI */ UART_PM_STATE_UNDEFINED, }; /* * This is the state information which is persistent across opens. */ struct uart_state { struct tty_port port; enum uart_pm_state pm_state; struct circ_buf xmit; atomic_t refcount; wait_queue_head_t remove_wait; struct uart_port *uart_port; }; #define UART_XMIT_SIZE PAGE_SIZE /* number of characters left in xmit buffer before we ask for more */ #define WAKEUP_CHARS 256 /** * uart_xmit_advance - Advance xmit buffer and account Tx'ed chars * @up: uart_port structure describing the port * @chars: number of characters sent * * This function advances the tail of circular xmit buffer by the number of * @chars transmitted and handles accounting of transmitted bytes (into * @up's icount.tx). */ static inline void uart_xmit_advance(struct uart_port *up, unsigned int chars) { struct circ_buf *xmit = &up->state->xmit; xmit->tail = (xmit->tail + chars) & (UART_XMIT_SIZE - 1); up->icount.tx += chars; } struct module; struct tty_driver; struct uart_driver { struct module *owner; const char *driver_name; const char *dev_name; int major; int minor; int nr; struct console *cons; /* * these are private; the low level driver should not * touch these; they should be initialised to NULL */ struct uart_state *state; struct tty_driver *tty_driver; }; void uart_write_wakeup(struct uart_port *port); #define __uart_port_tx(uport, ch, tx_ready, put_char, tx_done, for_test, \ for_post) \ ({ \ struct uart_port *__port = (uport); \ struct circ_buf *xmit = &__port->state->xmit; \ unsigned int pending; \ \ for (; (for_test) && (tx_ready); (for_post), __port->icount.tx++) { \ if (__port->x_char) { \ (ch) = __port->x_char; \ (put_char); \ __port->x_char = 0; \ continue; \ } \ \ if (uart_circ_empty(xmit) || uart_tx_stopped(__port)) \ break; \ \ (ch) = xmit->buf[xmit->tail]; \ (put_char); \ xmit->tail = (xmit->tail + 1) % UART_XMIT_SIZE; \ } \ \ (tx_done); \ \ pending = uart_circ_chars_pending(xmit); \ if (pending < WAKEUP_CHARS) { \ uart_write_wakeup(__port); \ \ if (pending == 0) \ __port->ops->stop_tx(__port); \ } \ \ pending; \ }) /** * uart_port_tx_limited -- transmit helper for uart_port with count limiting * @port: uart port * @ch: variable to store a character to be written to the HW * @count: a limit of characters to send * @tx_ready: can HW accept more data function * @put_char: function to write a character * @tx_done: function to call after the loop is done * * This helper transmits characters from the xmit buffer to the hardware using * @put_char(). It does so until @count characters are sent and while @tx_ready * evaluates to true. * * Returns: the number of characters in the xmit buffer when done. * * The expression in macro parameters shall be designed as follows: * * **tx_ready:** should evaluate to true if the HW can accept more data to * be sent. This parameter can be %true, which means the HW is always ready. * * **put_char:** shall write @ch to the device of @port. * * **tx_done:** when the write loop is done, this can perform arbitrary * action before potential invocation of ops->stop_tx() happens. If the * driver does not need to do anything, use e.g. ({}). * * For all of them, @port->lock is held, interrupts are locally disabled and * the expressions must not sleep. */ #define uart_port_tx_limited(port, ch, count, tx_ready, put_char, tx_done) ({ \ unsigned int __count = (count); \ __uart_port_tx(port, ch, tx_ready, put_char, tx_done, __count, \ __count--); \ }) /** * uart_port_tx -- transmit helper for uart_port * @port: uart port * @ch: variable to store a character to be written to the HW * @tx_ready: can HW accept more data function * @put_char: function to write a character * * See uart_port_tx_limited() for more details. */ #define uart_port_tx(port, ch, tx_ready, put_char) \ __uart_port_tx(port, ch, tx_ready, put_char, ({}), true, ({})) /* * Baud rate helpers. */ void uart_update_timeout(struct uart_port *port, unsigned int cflag, unsigned int baud); unsigned int uart_get_baud_rate(struct uart_port *port, struct ktermios *termios, const struct ktermios *old, unsigned int min, unsigned int max); unsigned int uart_get_divisor(struct uart_port *port, unsigned int baud); /* * Calculates FIFO drain time. */ static inline unsigned long uart_fifo_timeout(struct uart_port *port) { u64 fifo_timeout = (u64)READ_ONCE(port->frame_time) * port->fifosize; /* Add .02 seconds of slop */ fifo_timeout += 20 * NSEC_PER_MSEC; return max(nsecs_to_jiffies(fifo_timeout), 1UL); } /* Base timer interval for polling */ static inline int uart_poll_timeout(struct uart_port *port) { int timeout = uart_fifo_timeout(port); return timeout > 6 ? (timeout / 2 - 2) : 1; } /* * Console helpers. */ struct earlycon_device { struct console *con; struct uart_port port; char options[32]; /* e.g., 115200n8 */ unsigned int baud; }; struct earlycon_id { char name[15]; char name_term; /* In case compiler didn't '\0' term name */ char compatible[128]; int (*setup)(struct earlycon_device *, const char *options); }; extern const struct earlycon_id __earlycon_table[]; extern const struct earlycon_id __earlycon_table_end[]; #if defined(CONFIG_SERIAL_EARLYCON) && !defined(MODULE) #define EARLYCON_USED_OR_UNUSED __used #else #define EARLYCON_USED_OR_UNUSED __maybe_unused #endif #define OF_EARLYCON_DECLARE(_name, compat, fn) \ static const struct earlycon_id __UNIQUE_ID(__earlycon_##_name) \ EARLYCON_USED_OR_UNUSED __section("__earlycon_table") \ __aligned(__alignof__(struct earlycon_id)) \ = { .name = __stringify(_name), \ .compatible = compat, \ .setup = fn } #define EARLYCON_DECLARE(_name, fn) OF_EARLYCON_DECLARE(_name, "", fn) int of_setup_earlycon(const struct earlycon_id *match, unsigned long node, const char *options); #ifdef CONFIG_SERIAL_EARLYCON extern bool earlycon_acpi_spcr_enable __initdata; int setup_earlycon(char *buf); #else static const bool earlycon_acpi_spcr_enable EARLYCON_USED_OR_UNUSED; static inline int setup_earlycon(char *buf) { return 0; } #endif /* Variant of uart_console_registered() when the console_list_lock is held. */ static inline bool uart_console_registered_locked(struct uart_port *port) { return uart_console(port) && console_is_registered_locked(port->cons); } static inline bool uart_console_registered(struct uart_port *port) { return uart_console(port) && console_is_registered(port->cons); } struct uart_port *uart_get_console(struct uart_port *ports, int nr, struct console *c); int uart_parse_earlycon(char *p, unsigned char *iotype, resource_size_t *addr, char **options); void uart_parse_options(const char *options, int *baud, int *parity, int *bits, int *flow); int uart_set_options(struct uart_port *port, struct console *co, int baud, int parity, int bits, int flow); struct tty_driver *uart_console_device(struct console *co, int *index); void uart_console_write(struct uart_port *port, const char *s, unsigned int count, void (*putchar)(struct uart_port *, unsigned char)); /* * Port/driver registration/removal */ int uart_register_driver(struct uart_driver *uart); void uart_unregister_driver(struct uart_driver *uart); int uart_add_one_port(struct uart_driver *reg, struct uart_port *port); void uart_remove_one_port(struct uart_driver *reg, struct uart_port *port); bool uart_match_port(const struct uart_port *port1, const struct uart_port *port2); /* * Power Management */ int uart_suspend_port(struct uart_driver *reg, struct uart_port *port); int uart_resume_port(struct uart_driver *reg, struct uart_port *port); #define uart_circ_empty(circ) ((circ)->head == (circ)->tail) #define uart_circ_clear(circ) ((circ)->head = (circ)->tail = 0) #define uart_circ_chars_pending(circ) \ (CIRC_CNT((circ)->head, (circ)->tail, UART_XMIT_SIZE)) #define uart_circ_chars_free(circ) \ (CIRC_SPACE((circ)->head, (circ)->tail, UART_XMIT_SIZE)) static inline int uart_tx_stopped(struct uart_port *port) { struct tty_struct *tty = port->state->port.tty; if ((tty && tty->flow.stopped) || port->hw_stopped) return 1; return 0; } static inline bool uart_cts_enabled(struct uart_port *uport) { return !!(uport->status & UPSTAT_CTS_ENABLE); } static inline bool uart_softcts_mode(struct uart_port *uport) { upstat_t mask = UPSTAT_CTS_ENABLE | UPSTAT_AUTOCTS; return ((uport->status & mask) == UPSTAT_CTS_ENABLE); } /* * The following are helper functions for the low level drivers. */ void uart_handle_dcd_change(struct uart_port *uport, bool active); void uart_handle_cts_change(struct uart_port *uport, bool active); void uart_insert_char(struct uart_port *port, unsigned int status, unsigned int overrun, u8 ch, u8 flag); void uart_xchar_out(struct uart_port *uport, int offset); #ifdef CONFIG_MAGIC_SYSRQ_SERIAL #define SYSRQ_TIMEOUT (HZ * 5) bool uart_try_toggle_sysrq(struct uart_port *port, u8 ch); static inline int uart_handle_sysrq_char(struct uart_port *port, u8 ch) { if (!port->sysrq) return 0; if (ch && time_before(jiffies, port->sysrq)) { if (sysrq_mask()) { handle_sysrq(ch); port->sysrq = 0; return 1; } if (uart_try_toggle_sysrq(port, ch)) return 1; } port->sysrq = 0; return 0; } static inline int uart_prepare_sysrq_char(struct uart_port *port, u8 ch) { if (!port->sysrq) return 0; if (ch && time_before(jiffies, port->sysrq)) { if (sysrq_mask()) { port->sysrq_ch = ch; port->sysrq = 0; return 1; } if (uart_try_toggle_sysrq(port, ch)) return 1; } port->sysrq = 0; return 0; } static inline void uart_unlock_and_check_sysrq(struct uart_port *port) { u8 sysrq_ch; if (!port->has_sysrq) { spin_unlock(&port->lock); return; } sysrq_ch = port->sysrq_ch; port->sysrq_ch = 0; spin_unlock(&port->lock); if (sysrq_ch) handle_sysrq(sysrq_ch); } static inline void uart_unlock_and_check_sysrq_irqrestore(struct uart_port *port, unsigned long flags) { u8 sysrq_ch; if (!port->has_sysrq) { spin_unlock_irqrestore(&port->lock, flags); return; } sysrq_ch = port->sysrq_ch; port->sysrq_ch = 0; spin_unlock_irqrestore(&port->lock, flags); if (sysrq_ch) handle_sysrq(sysrq_ch); } #else /* CONFIG_MAGIC_SYSRQ_SERIAL */ static inline int uart_handle_sysrq_char(struct uart_port *port, u8 ch) { return 0; } static inline int uart_prepare_sysrq_char(struct uart_port *port, u8 ch) { return 0; } static inline void uart_unlock_and_check_sysrq(struct uart_port *port) { spin_unlock(&port->lock); } static inline void uart_unlock_and_check_sysrq_irqrestore(struct uart_port *port, unsigned long flags) { spin_unlock_irqrestore(&port->lock, flags); } #endif /* CONFIG_MAGIC_SYSRQ_SERIAL */ /* * We do the SysRQ and SAK checking like this... */ static inline int uart_handle_break(struct uart_port *port) { struct uart_state *state = port->state; if (port->handle_break) port->handle_break(port); #ifdef CONFIG_MAGIC_SYSRQ_SERIAL if (port->has_sysrq && uart_console(port)) { if (!port->sysrq) { port->sysrq = jiffies + SYSRQ_TIMEOUT; return 1; } port->sysrq = 0; } #endif if (port->flags & UPF_SAK) do_SAK(state->port.tty); return 0; } /* * UART_ENABLE_MS - determine if port should enable modem status irqs */ #define UART_ENABLE_MS(port,cflag) ((port)->flags & UPF_HARDPPS_CD || \ (cflag) & CRTSCTS || \ !((cflag) & CLOCAL)) int uart_get_rs485_mode(struct uart_port *port); #endif /* LINUX_SERIAL_CORE_H */
379 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 // SPDX-License-Identifier: GPL-2.0-only #include <linux/types.h> #include <linux/netfilter.h> #include <net/tcp.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_conntrack_seqadj.h> int nf_ct_seqadj_init(struct nf_conn *ct, enum ip_conntrack_info ctinfo, s32 off) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); struct nf_conn_seqadj *seqadj; struct nf_ct_seqadj *this_way; if (off == 0) return 0; set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); seqadj = nfct_seqadj(ct); this_way = &seqadj->seq[dir]; this_way->offset_before = off; this_way->offset_after = off; return 0; } EXPORT_SYMBOL_GPL(nf_ct_seqadj_init); int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo, __be32 seq, s32 off) { struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); struct nf_ct_seqadj *this_way; if (off == 0) return 0; if (unlikely(!seqadj)) { WARN_ONCE(1, "Missing nfct_seqadj_ext_add() setup call\n"); return 0; } set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); spin_lock_bh(&ct->lock); this_way = &seqadj->seq[dir]; if (this_way->offset_before == this_way->offset_after || before(this_way->correction_pos, ntohl(seq))) { this_way->correction_pos = ntohl(seq); this_way->offset_before = this_way->offset_after; this_way->offset_after += off; } spin_unlock_bh(&ct->lock); return 0; } EXPORT_SYMBOL_GPL(nf_ct_seqadj_set); void nf_ct_tcp_seqadj_set(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, s32 off) { const struct tcphdr *th; if (nf_ct_protonum(ct) != IPPROTO_TCP) return; th = (struct tcphdr *)(skb_network_header(skb) + ip_hdrlen(skb)); nf_ct_seqadj_set(ct, ctinfo, th->seq, off); } EXPORT_SYMBOL_GPL(nf_ct_tcp_seqadj_set); /* Adjust one found SACK option including checksum correction */ static void nf_ct_sack_block_adjust(struct sk_buff *skb, struct tcphdr *tcph, unsigned int sackoff, unsigned int sackend, struct nf_ct_seqadj *seq) { while (sackoff < sackend) { struct tcp_sack_block_wire *sack; __be32 new_start_seq, new_end_seq; sack = (void *)skb->data + sackoff; if (after(ntohl(sack->start_seq) - seq->offset_before, seq->correction_pos)) new_start_seq = htonl(ntohl(sack->start_seq) - seq->offset_after); else new_start_seq = htonl(ntohl(sack->start_seq) - seq->offset_before); if (after(ntohl(sack->end_seq) - seq->offset_before, seq->correction_pos)) new_end_seq = htonl(ntohl(sack->end_seq) - seq->offset_after); else new_end_seq = htonl(ntohl(sack->end_seq) - seq->offset_before); pr_debug("sack_adjust: start_seq: %u->%u, end_seq: %u->%u\n", ntohl(sack->start_seq), ntohl(new_start_seq), ntohl(sack->end_seq), ntohl(new_end_seq)); inet_proto_csum_replace4(&tcph->check, skb, sack->start_seq, new_start_seq, false); inet_proto_csum_replace4(&tcph->check, skb, sack->end_seq, new_end_seq, false); sack->start_seq = new_start_seq; sack->end_seq = new_end_seq; sackoff += sizeof(*sack); } } /* TCP SACK sequence number adjustment */ static unsigned int nf_ct_sack_adjust(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { struct tcphdr *tcph = (void *)skb->data + protoff; struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); unsigned int dir, optoff, optend; optoff = protoff + sizeof(struct tcphdr); optend = protoff + tcph->doff * 4; if (skb_ensure_writable(skb, optend)) return 0; tcph = (void *)skb->data + protoff; dir = CTINFO2DIR(ctinfo); while (optoff < optend) { /* Usually: option, length. */ unsigned char *op = skb->data + optoff; switch (op[0]) { case TCPOPT_EOL: return 1; case TCPOPT_NOP: optoff++; continue; default: /* no partial options */ if (optoff + 1 == optend || optoff + op[1] > optend || op[1] < 2) return 0; if (op[0] == TCPOPT_SACK && op[1] >= 2+TCPOLEN_SACK_PERBLOCK && ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0) nf_ct_sack_block_adjust(skb, tcph, optoff + 2, optoff+op[1], &seqadj->seq[!dir]); optoff += op[1]; } } return 1; } /* TCP sequence number adjustment. Returns 1 on success, 0 on failure */ int nf_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int protoff) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); struct tcphdr *tcph; __be32 newseq, newack; s32 seqoff, ackoff; struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); struct nf_ct_seqadj *this_way, *other_way; int res = 1; this_way = &seqadj->seq[dir]; other_way = &seqadj->seq[!dir]; if (skb_ensure_writable(skb, protoff + sizeof(*tcph))) return 0; tcph = (void *)skb->data + protoff; spin_lock_bh(&ct->lock); if (after(ntohl(tcph->seq), this_way->correction_pos)) seqoff = this_way->offset_after; else seqoff = this_way->offset_before; newseq = htonl(ntohl(tcph->seq) + seqoff); inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, false); pr_debug("Adjusting sequence number from %u->%u\n", ntohl(tcph->seq), ntohl(newseq)); tcph->seq = newseq; if (!tcph->ack) goto out; if (after(ntohl(tcph->ack_seq) - other_way->offset_before, other_way->correction_pos)) ackoff = other_way->offset_after; else ackoff = other_way->offset_before; newack = htonl(ntohl(tcph->ack_seq) - ackoff); inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, false); pr_debug("Adjusting ack number from %u->%u, ack from %u->%u\n", ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq), ntohl(newack)); tcph->ack_seq = newack; res = nf_ct_sack_adjust(skb, protoff, ct, ctinfo); out: spin_unlock_bh(&ct->lock); return res; } EXPORT_SYMBOL_GPL(nf_ct_seq_adjust); s32 nf_ct_seq_offset(const struct nf_conn *ct, enum ip_conntrack_dir dir, u32 seq) { struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); struct nf_ct_seqadj *this_way; if (!seqadj) return 0; this_way = &seqadj->seq[dir]; return after(seq, this_way->correction_pos) ? this_way->offset_after : this_way->offset_before; } EXPORT_SYMBOL_GPL(nf_ct_seq_offset);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 #ifndef _UAPI_LINUX_VIRTIO_RING_H #define _UAPI_LINUX_VIRTIO_RING_H /* An interface for efficient virtio implementation, currently for use by KVM, * but hopefully others soon. Do NOT change this since it will * break existing servers and clients. * * This header is BSD licensed so anyone can use the definitions to implement * compatible drivers/servers. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of IBM nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Copyright Rusty Russell IBM Corporation 2007. */ #ifndef __KERNEL__ #include <stdint.h> #endif #include <linux/types.h> #include <linux/virtio_types.h> /* This marks a buffer as continuing via the next field. */ #define VRING_DESC_F_NEXT 1 /* This marks a buffer as write-only (otherwise read-only). */ #define VRING_DESC_F_WRITE 2 /* This means the buffer contains a list of buffer descriptors. */ #define VRING_DESC_F_INDIRECT 4 /* * Mark a descriptor as available or used in packed ring. * Notice: they are defined as shifts instead of shifted values. */ #define VRING_PACKED_DESC_F_AVAIL 7 #define VRING_PACKED_DESC_F_USED 15 /* The Host uses this in used->flags to advise the Guest: don't kick me when * you add a buffer. It's unreliable, so it's simply an optimization. Guest * will still kick if it's out of buffers. */ #define VRING_USED_F_NO_NOTIFY 1 /* The Guest uses this in avail->flags to advise the Host: don't interrupt me * when you consume a buffer. It's unreliable, so it's simply an * optimization. */ #define VRING_AVAIL_F_NO_INTERRUPT 1 /* Enable events in packed ring. */ #define VRING_PACKED_EVENT_FLAG_ENABLE 0x0 /* Disable events in packed ring. */ #define VRING_PACKED_EVENT_FLAG_DISABLE 0x1 /* * Enable events for a specific descriptor in packed ring. * (as specified by Descriptor Ring Change Event Offset/Wrap Counter). * Only valid if VIRTIO_RING_F_EVENT_IDX has been negotiated. */ #define VRING_PACKED_EVENT_FLAG_DESC 0x2 /* * Wrap counter bit shift in event suppression structure * of packed ring. */ #define VRING_PACKED_EVENT_F_WRAP_CTR 15 /* We support indirect buffer descriptors */ #define VIRTIO_RING_F_INDIRECT_DESC 28 /* The Guest publishes the used index for which it expects an interrupt * at the end of the avail ring. Host should ignore the avail->flags field. */ /* The Host publishes the avail index for which it expects a kick * at the end of the used ring. Guest should ignore the used->flags field. */ #define VIRTIO_RING_F_EVENT_IDX 29 /* Alignment requirements for vring elements. * When using pre-virtio 1.0 layout, these fall out naturally. */ #define VRING_AVAIL_ALIGN_SIZE 2 #define VRING_USED_ALIGN_SIZE 4 #define VRING_DESC_ALIGN_SIZE 16 /** * struct vring_desc - Virtio ring descriptors, * 16 bytes long. These can chain together via @next. * * @addr: buffer address (guest-physical) * @len: buffer length * @flags: descriptor flags * @next: index of the next descriptor in the chain, * if the VRING_DESC_F_NEXT flag is set. We chain unused * descriptors via this, too. */ struct vring_desc { __virtio64 addr; __virtio32 len; __virtio16 flags; __virtio16 next; }; struct vring_avail { __virtio16 flags; __virtio16 idx; __virtio16 ring[]; }; /* u32 is used here for ids for padding reasons. */ struct vring_used_elem { /* Index of start of used descriptor chain. */ __virtio32 id; /* Total length of the descriptor chain which was used (written to) */ __virtio32 len; }; typedef struct vring_used_elem __attribute__((aligned(VRING_USED_ALIGN_SIZE))) vring_used_elem_t; struct vring_used { __virtio16 flags; __virtio16 idx; vring_used_elem_t ring[]; }; /* * The ring element addresses are passed between components with different * alignments assumptions. Thus, we might need to decrease the compiler-selected * alignment, and so must use a typedef to make sure the aligned attribute * actually takes hold: * * https://gcc.gnu.org/onlinedocs//gcc/Common-Type-Attributes.html#Common-Type-Attributes * * When used on a struct, or struct member, the aligned attribute can only * increase the alignment; in order to decrease it, the packed attribute must * be specified as well. When used as part of a typedef, the aligned attribute * can both increase and decrease alignment, and specifying the packed * attribute generates a warning. */ typedef struct vring_desc __attribute__((aligned(VRING_DESC_ALIGN_SIZE))) vring_desc_t; typedef struct vring_avail __attribute__((aligned(VRING_AVAIL_ALIGN_SIZE))) vring_avail_t; typedef struct vring_used __attribute__((aligned(VRING_USED_ALIGN_SIZE))) vring_used_t; struct vring { unsigned int num; vring_desc_t *desc; vring_avail_t *avail; vring_used_t *used; }; #ifndef VIRTIO_RING_NO_LEGACY /* The standard layout for the ring is a continuous chunk of memory which looks * like this. We assume num is a power of 2. * * struct vring * { * // The actual descriptors (16 bytes each) * struct vring_desc desc[num]; * * // A ring of available descriptor heads with free-running index. * __virtio16 avail_flags; * __virtio16 avail_idx; * __virtio16 available[num]; * __virtio16 used_event_idx; * * // Padding to the next align boundary. * char pad[]; * * // A ring of used descriptor heads with free-running index. * __virtio16 used_flags; * __virtio16 used_idx; * struct vring_used_elem used[num]; * __virtio16 avail_event_idx; * }; */ /* We publish the used event index at the end of the available ring, and vice * versa. They are at the end for backwards compatibility. */ #define vring_used_event(vr) ((vr)->avail->ring[(vr)->num]) #define vring_avail_event(vr) (*(__virtio16 *)&(vr)->used->ring[(vr)->num]) static inline void vring_init(struct vring *vr, unsigned int num, void *p, unsigned long align) { vr->num = num; vr->desc = p; vr->avail = (struct vring_avail *)((char *)p + num * sizeof(struct vring_desc)); vr->used = (void *)(((uintptr_t)&vr->avail->ring[num] + sizeof(__virtio16) + align-1) & ~(align - 1)); } static inline unsigned vring_size(unsigned int num, unsigned long align) { return ((sizeof(struct vring_desc) * num + sizeof(__virtio16) * (3 + num) + align - 1) & ~(align - 1)) + sizeof(__virtio16) * 3 + sizeof(struct vring_used_elem) * num; } #endif /* VIRTIO_RING_NO_LEGACY */ /* The following is used with USED_EVENT_IDX and AVAIL_EVENT_IDX */ /* Assuming a given event_idx value from the other side, if * we have just incremented index from old to new_idx, * should we trigger an event? */ static inline int vring_need_event(__u16 event_idx, __u16 new_idx, __u16 old) { /* Note: Xen has similar logic for notification hold-off * in include/xen/interface/io/ring.h with req_event and req_prod * corresponding to event_idx + 1 and new_idx respectively. * Note also that req_event and req_prod in Xen start at 1, * event indexes in virtio start at 0. */ return (__u16)(new_idx - event_idx - 1) < (__u16)(new_idx - old); } struct vring_packed_desc_event { /* Descriptor Ring Change Event Offset/Wrap Counter. */ __le16 off_wrap; /* Descriptor Ring Change Event Flags. */ __le16 flags; }; struct vring_packed_desc { /* Buffer Address. */ __le64 addr; /* Buffer Length. */ __le32 len; /* Buffer ID. */ __le16 id; /* The flags depending on descriptor type. */ __le16 flags; }; #endif /* _UAPI_LINUX_VIRTIO_RING_H */
2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 /* * ppp_mppe.c - interface MPPE to the PPP code. * This version is for use with Linux kernel 2.6.14+ * * By Frank Cusack <fcusack@fcusack.com>. * Copyright (c) 2002,2003,2004 Google, Inc. * All rights reserved. * * License: * Permission to use, copy, modify, and distribute this software and its * documentation is hereby granted, provided that the above copyright * notice appears in all copies. This software is provided without any * warranty, express or implied. * * ALTERNATIVELY, provided that this notice is retained in full, this product * may be distributed under the terms of the GNU General Public License (GPL), * in which case the provisions of the GPL apply INSTEAD OF those given above. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, see <http://www.gnu.org/licenses/>. * * * Changelog: * 08/12/05 - Matt Domsch <Matt_Domsch@dell.com> * Only need extra skb padding on transmit, not receive. * 06/18/04 - Matt Domsch <Matt_Domsch@dell.com>, Oleg Makarenko <mole@quadra.ru> * Use Linux kernel 2.6 arc4 and sha1 routines rather than * providing our own. * 2/15/04 - TS: added #include <version.h> and testing for Kernel * version before using * MOD_DEC_USAGE_COUNT/MOD_INC_USAGE_COUNT which are * deprecated in 2.6 */ #include <crypto/arc4.h> #include <crypto/hash.h> #include <linux/err.h> #include <linux/fips.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/ppp_defs.h> #include <linux/ppp-comp.h> #include <linux/scatterlist.h> #include <asm/unaligned.h> #include "ppp_mppe.h" MODULE_AUTHOR("Frank Cusack <fcusack@fcusack.com>"); MODULE_DESCRIPTION("Point-to-Point Protocol Microsoft Point-to-Point Encryption support"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_ALIAS("ppp-compress-" __stringify(CI_MPPE)); MODULE_VERSION("1.0.2"); #define SHA1_PAD_SIZE 40 /* * kernel crypto API needs its arguments to be in kmalloc'd memory, not in the module * static data area. That means sha_pad needs to be kmalloc'd. */ struct sha_pad { unsigned char sha_pad1[SHA1_PAD_SIZE]; unsigned char sha_pad2[SHA1_PAD_SIZE]; }; static struct sha_pad *sha_pad; static inline void sha_pad_init(struct sha_pad *shapad) { memset(shapad->sha_pad1, 0x00, sizeof(shapad->sha_pad1)); memset(shapad->sha_pad2, 0xF2, sizeof(shapad->sha_pad2)); } /* * State for an MPPE (de)compressor. */ struct ppp_mppe_state { struct arc4_ctx arc4; struct shash_desc *sha1; unsigned char *sha1_digest; unsigned char master_key[MPPE_MAX_KEY_LEN]; unsigned char session_key[MPPE_MAX_KEY_LEN]; unsigned keylen; /* key length in bytes */ /* NB: 128-bit == 16, 40-bit == 8! */ /* If we want to support 56-bit, */ /* the unit has to change to bits */ unsigned char bits; /* MPPE control bits */ unsigned ccount; /* 12-bit coherency count (seqno) */ unsigned stateful; /* stateful mode flag */ int discard; /* stateful mode packet loss flag */ int sanity_errors; /* take down LCP if too many */ int unit; int debug; struct compstat stats; }; /* struct ppp_mppe_state.bits definitions */ #define MPPE_BIT_A 0x80 /* Encryption table were (re)inititalized */ #define MPPE_BIT_B 0x40 /* MPPC only (not implemented) */ #define MPPE_BIT_C 0x20 /* MPPC only (not implemented) */ #define MPPE_BIT_D 0x10 /* This is an encrypted frame */ #define MPPE_BIT_FLUSHED MPPE_BIT_A #define MPPE_BIT_ENCRYPTED MPPE_BIT_D #define MPPE_BITS(p) ((p)[4] & 0xf0) #define MPPE_CCOUNT(p) ((((p)[4] & 0x0f) << 8) + (p)[5]) #define MPPE_CCOUNT_SPACE 0x1000 /* The size of the ccount space */ #define MPPE_OVHD 2 /* MPPE overhead/packet */ #define SANITY_MAX 1600 /* Max bogon factor we will tolerate */ /* * Key Derivation, from RFC 3078, RFC 3079. * Equivalent to Get_Key() for MS-CHAP as described in RFC 3079. */ static void get_new_key_from_sha(struct ppp_mppe_state * state) { crypto_shash_init(state->sha1); crypto_shash_update(state->sha1, state->master_key, state->keylen); crypto_shash_update(state->sha1, sha_pad->sha_pad1, sizeof(sha_pad->sha_pad1)); crypto_shash_update(state->sha1, state->session_key, state->keylen); crypto_shash_update(state->sha1, sha_pad->sha_pad2, sizeof(sha_pad->sha_pad2)); crypto_shash_final(state->sha1, state->sha1_digest); } /* * Perform the MPPE rekey algorithm, from RFC 3078, sec. 7.3. * Well, not what's written there, but rather what they meant. */ static void mppe_rekey(struct ppp_mppe_state * state, int initial_key) { get_new_key_from_sha(state); if (!initial_key) { arc4_setkey(&state->arc4, state->sha1_digest, state->keylen); arc4_crypt(&state->arc4, state->session_key, state->sha1_digest, state->keylen); } else { memcpy(state->session_key, state->sha1_digest, state->keylen); } if (state->keylen == 8) { /* See RFC 3078 */ state->session_key[0] = 0xd1; state->session_key[1] = 0x26; state->session_key[2] = 0x9e; } arc4_setkey(&state->arc4, state->session_key, state->keylen); } /* * Allocate space for a (de)compressor. */ static void *mppe_alloc(unsigned char *options, int optlen) { struct ppp_mppe_state *state; struct crypto_shash *shash; unsigned int digestsize; if (optlen != CILEN_MPPE + sizeof(state->master_key) || options[0] != CI_MPPE || options[1] != CILEN_MPPE || fips_enabled) goto out; state = kzalloc(sizeof(*state), GFP_KERNEL); if (state == NULL) goto out; shash = crypto_alloc_shash("sha1", 0, 0); if (IS_ERR(shash)) goto out_free; state->sha1 = kmalloc(sizeof(*state->sha1) + crypto_shash_descsize(shash), GFP_KERNEL); if (!state->sha1) { crypto_free_shash(shash); goto out_free; } state->sha1->tfm = shash; digestsize = crypto_shash_digestsize(shash); if (digestsize < MPPE_MAX_KEY_LEN) goto out_free; state->sha1_digest = kmalloc(digestsize, GFP_KERNEL); if (!state->sha1_digest) goto out_free; /* Save keys. */ memcpy(state->master_key, &options[CILEN_MPPE], sizeof(state->master_key)); memcpy(state->session_key, state->master_key, sizeof(state->master_key)); /* * We defer initial key generation until mppe_init(), as mppe_alloc() * is called frequently during negotiation. */ return (void *)state; out_free: kfree(state->sha1_digest); if (state->sha1) { crypto_free_shash(state->sha1->tfm); kfree_sensitive(state->sha1); } kfree(state); out: return NULL; } /* * Deallocate space for a (de)compressor. */ static void mppe_free(void *arg) { struct ppp_mppe_state *state = (struct ppp_mppe_state *) arg; if (state) { kfree(state->sha1_digest); crypto_free_shash(state->sha1->tfm); kfree_sensitive(state->sha1); kfree_sensitive(state); } } /* * Initialize (de)compressor state. */ static int mppe_init(void *arg, unsigned char *options, int optlen, int unit, int debug, const char *debugstr) { struct ppp_mppe_state *state = (struct ppp_mppe_state *) arg; unsigned char mppe_opts; if (optlen != CILEN_MPPE || options[0] != CI_MPPE || options[1] != CILEN_MPPE) return 0; MPPE_CI_TO_OPTS(&options[2], mppe_opts); if (mppe_opts & MPPE_OPT_128) state->keylen = 16; else if (mppe_opts & MPPE_OPT_40) state->keylen = 8; else { printk(KERN_WARNING "%s[%d]: unknown key length\n", debugstr, unit); return 0; } if (mppe_opts & MPPE_OPT_STATEFUL) state->stateful = 1; /* Generate the initial session key. */ mppe_rekey(state, 1); if (debug) { printk(KERN_DEBUG "%s[%d]: initialized with %d-bit %s mode\n", debugstr, unit, (state->keylen == 16) ? 128 : 40, (state->stateful) ? "stateful" : "stateless"); printk(KERN_DEBUG "%s[%d]: keys: master: %*phN initial session: %*phN\n", debugstr, unit, (int)sizeof(state->master_key), state->master_key, (int)sizeof(state->session_key), state->session_key); } /* * Initialize the coherency count. The initial value is not specified * in RFC 3078, but we can make a reasonable assumption that it will * start at 0. Setting it to the max here makes the comp/decomp code * do the right thing (determined through experiment). */ state->ccount = MPPE_CCOUNT_SPACE - 1; /* * Note that even though we have initialized the key table, we don't * set the FLUSHED bit. This is contrary to RFC 3078, sec. 3.1. */ state->bits = MPPE_BIT_ENCRYPTED; state->unit = unit; state->debug = debug; return 1; } static int mppe_comp_init(void *arg, unsigned char *options, int optlen, int unit, int hdrlen, int debug) { /* ARGSUSED */ return mppe_init(arg, options, optlen, unit, debug, "mppe_comp_init"); } /* * We received a CCP Reset-Request (actually, we are sending a Reset-Ack), * tell the compressor to rekey. Note that we MUST NOT rekey for * every CCP Reset-Request; we only rekey on the next xmit packet. * We might get multiple CCP Reset-Requests if our CCP Reset-Ack is lost. * So, rekeying for every CCP Reset-Request is broken as the peer will not * know how many times we've rekeyed. (If we rekey and THEN get another * CCP Reset-Request, we must rekey again.) */ static void mppe_comp_reset(void *arg) { struct ppp_mppe_state *state = (struct ppp_mppe_state *) arg; state->bits |= MPPE_BIT_FLUSHED; } /* * Compress (encrypt) a packet. * It's strange to call this a compressor, since the output is always * MPPE_OVHD + 2 bytes larger than the input. */ static int mppe_compress(void *arg, unsigned char *ibuf, unsigned char *obuf, int isize, int osize) { struct ppp_mppe_state *state = (struct ppp_mppe_state *) arg; int proto; /* * Check that the protocol is in the range we handle. */ proto = PPP_PROTOCOL(ibuf); if (proto < 0x0021 || proto > 0x00fa) return 0; /* Make sure we have enough room to generate an encrypted packet. */ if (osize < isize + MPPE_OVHD + 2) { /* Drop the packet if we should encrypt it, but can't. */ printk(KERN_DEBUG "mppe_compress[%d]: osize too small! " "(have: %d need: %d)\n", state->unit, osize, osize + MPPE_OVHD + 2); return -1; } osize = isize + MPPE_OVHD + 2; /* * Copy over the PPP header and set control bits. */ obuf[0] = PPP_ADDRESS(ibuf); obuf[1] = PPP_CONTROL(ibuf); put_unaligned_be16(PPP_COMP, obuf + 2); obuf += PPP_HDRLEN; state->ccount = (state->ccount + 1) % MPPE_CCOUNT_SPACE; if (state->debug >= 7) printk(KERN_DEBUG "mppe_compress[%d]: ccount %d\n", state->unit, state->ccount); put_unaligned_be16(state->ccount, obuf); if (!state->stateful || /* stateless mode */ ((state->ccount & 0xff) == 0xff) || /* "flag" packet */ (state->bits & MPPE_BIT_FLUSHED)) { /* CCP Reset-Request */ /* We must rekey */ if (state->debug && state->stateful) printk(KERN_DEBUG "mppe_compress[%d]: rekeying\n", state->unit); mppe_rekey(state, 0); state->bits |= MPPE_BIT_FLUSHED; } obuf[0] |= state->bits; state->bits &= ~MPPE_BIT_FLUSHED; /* reset for next xmit */ obuf += MPPE_OVHD; ibuf += 2; /* skip to proto field */ isize -= 2; arc4_crypt(&state->arc4, obuf, ibuf, isize); state->stats.unc_bytes += isize; state->stats.unc_packets++; state->stats.comp_bytes += osize; state->stats.comp_packets++; return osize; } /* * Since every frame grows by MPPE_OVHD + 2 bytes, this is always going * to look bad ... and the longer the link is up the worse it will get. */ static void mppe_comp_stats(void *arg, struct compstat *stats) { struct ppp_mppe_state *state = (struct ppp_mppe_state *) arg; *stats = state->stats; } static int mppe_decomp_init(void *arg, unsigned char *options, int optlen, int unit, int hdrlen, int mru, int debug) { /* ARGSUSED */ return mppe_init(arg, options, optlen, unit, debug, "mppe_decomp_init"); } /* * We received a CCP Reset-Ack. Just ignore it. */ static void mppe_decomp_reset(void *arg) { /* ARGSUSED */ return; } /* * Decompress (decrypt) an MPPE packet. */ static int mppe_decompress(void *arg, unsigned char *ibuf, int isize, unsigned char *obuf, int osize) { struct ppp_mppe_state *state = (struct ppp_mppe_state *) arg; unsigned ccount; int flushed = MPPE_BITS(ibuf) & MPPE_BIT_FLUSHED; if (isize <= PPP_HDRLEN + MPPE_OVHD) { if (state->debug) printk(KERN_DEBUG "mppe_decompress[%d]: short pkt (%d)\n", state->unit, isize); return DECOMP_ERROR; } /* * Make sure we have enough room to decrypt the packet. * Note that for our test we only subtract 1 byte whereas in * mppe_compress() we added 2 bytes (+MPPE_OVHD); * this is to account for possible PFC. */ if (osize < isize - MPPE_OVHD - 1) { printk(KERN_DEBUG "mppe_decompress[%d]: osize too small! " "(have: %d need: %d)\n", state->unit, osize, isize - MPPE_OVHD - 1); return DECOMP_ERROR; } osize = isize - MPPE_OVHD - 2; /* assume no PFC */ ccount = MPPE_CCOUNT(ibuf); if (state->debug >= 7) printk(KERN_DEBUG "mppe_decompress[%d]: ccount %d\n", state->unit, ccount); /* sanity checks -- terminate with extreme prejudice */ if (!(MPPE_BITS(ibuf) & MPPE_BIT_ENCRYPTED)) { printk(KERN_DEBUG "mppe_decompress[%d]: ENCRYPTED bit not set!\n", state->unit); state->sanity_errors += 100; goto sanity_error; } if (!state->stateful && !flushed) { printk(KERN_DEBUG "mppe_decompress[%d]: FLUSHED bit not set in " "stateless mode!\n", state->unit); state->sanity_errors += 100; goto sanity_error; } if (state->stateful && ((ccount & 0xff) == 0xff) && !flushed) { printk(KERN_DEBUG "mppe_decompress[%d]: FLUSHED bit not set on " "flag packet!\n", state->unit); state->sanity_errors += 100; goto sanity_error; } /* * Check the coherency count. */ if (!state->stateful) { /* Discard late packet */ if ((ccount - state->ccount) % MPPE_CCOUNT_SPACE > MPPE_CCOUNT_SPACE / 2) { state->sanity_errors++; goto sanity_error; } /* RFC 3078, sec 8.1. Rekey for every packet. */ while (state->ccount != ccount) { mppe_rekey(state, 0); state->ccount = (state->ccount + 1) % MPPE_CCOUNT_SPACE; } } else { /* RFC 3078, sec 8.2. */ if (!state->discard) { /* normal state */ state->ccount = (state->ccount + 1) % MPPE_CCOUNT_SPACE; if (ccount != state->ccount) { /* * (ccount > state->ccount) * Packet loss detected, enter the discard state. * Signal the peer to rekey (by sending a CCP Reset-Request). */ state->discard = 1; return DECOMP_ERROR; } } else { /* discard state */ if (!flushed) { /* ccp.c will be silent (no additional CCP Reset-Requests). */ return DECOMP_ERROR; } else { /* Rekey for every missed "flag" packet. */ while ((ccount & ~0xff) != (state->ccount & ~0xff)) { mppe_rekey(state, 0); state->ccount = (state->ccount + 256) % MPPE_CCOUNT_SPACE; } /* reset */ state->discard = 0; state->ccount = ccount; /* * Another problem with RFC 3078 here. It implies that the * peer need not send a Reset-Ack packet. But RFC 1962 * requires it. Hopefully, M$ does send a Reset-Ack; even * though it isn't required for MPPE synchronization, it is * required to reset CCP state. */ } } if (flushed) mppe_rekey(state, 0); } /* * Fill in the first part of the PPP header. The protocol field * comes from the decrypted data. */ obuf[0] = PPP_ADDRESS(ibuf); /* +1 */ obuf[1] = PPP_CONTROL(ibuf); /* +1 */ obuf += 2; ibuf += PPP_HDRLEN + MPPE_OVHD; isize -= PPP_HDRLEN + MPPE_OVHD; /* -6 */ /* net osize: isize-4 */ /* * Decrypt the first byte in order to check if it is * a compressed or uncompressed protocol field. */ arc4_crypt(&state->arc4, obuf, ibuf, 1); /* * Do PFC decompression. * This would be nicer if we were given the actual sk_buff * instead of a char *. */ if ((obuf[0] & 0x01) != 0) { obuf[1] = obuf[0]; obuf[0] = 0; obuf++; osize++; } /* And finally, decrypt the rest of the packet. */ arc4_crypt(&state->arc4, obuf + 1, ibuf + 1, isize - 1); state->stats.unc_bytes += osize; state->stats.unc_packets++; state->stats.comp_bytes += isize; state->stats.comp_packets++; /* good packet credit */ state->sanity_errors >>= 1; return osize; sanity_error: if (state->sanity_errors < SANITY_MAX) return DECOMP_ERROR; else /* Take LCP down if the peer is sending too many bogons. * We don't want to do this for a single or just a few * instances since it could just be due to packet corruption. */ return DECOMP_FATALERROR; } /* * Incompressible data has arrived (this should never happen!). * We should probably drop the link if the protocol is in the range * of what should be encrypted. At the least, we should drop this * packet. (How to do this?) */ static void mppe_incomp(void *arg, unsigned char *ibuf, int icnt) { struct ppp_mppe_state *state = (struct ppp_mppe_state *) arg; if (state->debug && (PPP_PROTOCOL(ibuf) >= 0x0021 && PPP_PROTOCOL(ibuf) <= 0x00fa)) printk(KERN_DEBUG "mppe_incomp[%d]: incompressible (unencrypted) data! " "(proto %04x)\n", state->unit, PPP_PROTOCOL(ibuf)); state->stats.inc_bytes += icnt; state->stats.inc_packets++; state->stats.unc_bytes += icnt; state->stats.unc_packets++; } /************************************************************* * Module interface table *************************************************************/ /* * Procedures exported to if_ppp.c. */ static struct compressor ppp_mppe = { .compress_proto = CI_MPPE, .comp_alloc = mppe_alloc, .comp_free = mppe_free, .comp_init = mppe_comp_init, .comp_reset = mppe_comp_reset, .compress = mppe_compress, .comp_stat = mppe_comp_stats, .decomp_alloc = mppe_alloc, .decomp_free = mppe_free, .decomp_init = mppe_decomp_init, .decomp_reset = mppe_decomp_reset, .decompress = mppe_decompress, .incomp = mppe_incomp, .decomp_stat = mppe_comp_stats, .owner = THIS_MODULE, .comp_extra = MPPE_PAD, }; /* * ppp_mppe_init() * * Prior to allowing load, try to load the arc4 and sha1 crypto * libraries. The actual use will be allocated later, but * this way the module will fail to insmod if they aren't available. */ static int __init ppp_mppe_init(void) { int answer; if (fips_enabled || !crypto_has_ahash("sha1", 0, CRYPTO_ALG_ASYNC)) return -ENODEV; sha_pad = kmalloc(sizeof(struct sha_pad), GFP_KERNEL); if (!sha_pad) return -ENOMEM; sha_pad_init(sha_pad); answer = ppp_register_compressor(&ppp_mppe); if (answer == 0) printk(KERN_INFO "PPP MPPE Compression module registered\n"); else kfree(sha_pad); return answer; } static void __exit ppp_mppe_cleanup(void) { ppp_unregister_compressor(&ppp_mppe); kfree(sha_pad); } module_init(ppp_mppe_init); module_exit(ppp_mppe_cleanup);
244 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 // SPDX-License-Identifier: GPL-2.0-only /* Expectation handling for nf_conntrack. */ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> * (c) 2005-2012 Patrick McHardy <kaber@trash.net> */ #include <linux/types.h> #include <linux/netfilter.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/stddef.h> #include <linux/slab.h> #include <linux/err.h> #include <linux/percpu.h> #include <linux/kernel.h> #include <linux/siphash.h> #include <linux/moduleparam.h> #include <linux/export.h> #include <net/net_namespace.h> #include <net/netns/hash.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_zones.h> unsigned int nf_ct_expect_hsize __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_expect_hsize); struct hlist_head *nf_ct_expect_hash __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_expect_hash); unsigned int nf_ct_expect_max __read_mostly; static struct kmem_cache *nf_ct_expect_cachep __read_mostly; static siphash_aligned_key_t nf_ct_expect_hashrnd; /* nf_conntrack_expect helper functions */ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, u32 portid, int report) { struct nf_conn_help *master_help = nfct_help(exp->master); struct net *net = nf_ct_exp_net(exp); struct nf_conntrack_net *cnet; WARN_ON(!master_help); WARN_ON(timer_pending(&exp->timeout)); hlist_del_rcu(&exp->hnode); cnet = nf_ct_pernet(net); cnet->expect_count--; hlist_del_rcu(&exp->lnode); master_help->expecting[exp->class]--; nf_ct_expect_event_report(IPEXP_DESTROY, exp, portid, report); nf_ct_expect_put(exp); NF_CT_STAT_INC(net, expect_delete); } EXPORT_SYMBOL_GPL(nf_ct_unlink_expect_report); static void nf_ct_expectation_timed_out(struct timer_list *t) { struct nf_conntrack_expect *exp = from_timer(exp, t, timeout); spin_lock_bh(&nf_conntrack_expect_lock); nf_ct_unlink_expect(exp); spin_unlock_bh(&nf_conntrack_expect_lock); nf_ct_expect_put(exp); } static unsigned int nf_ct_expect_dst_hash(const struct net *n, const struct nf_conntrack_tuple *tuple) { struct { union nf_inet_addr dst_addr; u32 net_mix; u16 dport; u8 l3num; u8 protonum; } __aligned(SIPHASH_ALIGNMENT) combined; u32 hash; get_random_once(&nf_ct_expect_hashrnd, sizeof(nf_ct_expect_hashrnd)); memset(&combined, 0, sizeof(combined)); combined.dst_addr = tuple->dst.u3; combined.net_mix = net_hash_mix(n); combined.dport = (__force __u16)tuple->dst.u.all; combined.l3num = tuple->src.l3num; combined.protonum = tuple->dst.protonum; hash = siphash(&combined, sizeof(combined), &nf_ct_expect_hashrnd); return reciprocal_scale(hash, nf_ct_expect_hsize); } static bool nf_ct_exp_equal(const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_expect *i, const struct nf_conntrack_zone *zone, const struct net *net) { return nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && net_eq(net, nf_ct_net(i->master)) && nf_ct_zone_equal_any(i->master, zone); } bool nf_ct_remove_expect(struct nf_conntrack_expect *exp) { if (del_timer(&exp->timeout)) { nf_ct_unlink_expect(exp); nf_ct_expect_put(exp); return true; } return false; } EXPORT_SYMBOL_GPL(nf_ct_remove_expect); struct nf_conntrack_expect * __nf_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct nf_conntrack_expect *i; unsigned int h; if (!cnet->expect_count) return NULL; h = nf_ct_expect_dst_hash(net, tuple); hlist_for_each_entry_rcu(i, &nf_ct_expect_hash[h], hnode) { if (nf_ct_exp_equal(tuple, i, zone, net)) return i; } return NULL; } EXPORT_SYMBOL_GPL(__nf_ct_expect_find); /* Just find a expectation corresponding to a tuple. */ struct nf_conntrack_expect * nf_ct_expect_find_get(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_expect *i; rcu_read_lock(); i = __nf_ct_expect_find(net, zone, tuple); if (i && !refcount_inc_not_zero(&i->use)) i = NULL; rcu_read_unlock(); return i; } EXPORT_SYMBOL_GPL(nf_ct_expect_find_get); /* If an expectation for this connection is found, it gets delete from * global list then returned. */ struct nf_conntrack_expect * nf_ct_find_expectation(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple, bool unlink) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct nf_conntrack_expect *i, *exp = NULL; unsigned int h; if (!cnet->expect_count) return NULL; h = nf_ct_expect_dst_hash(net, tuple); hlist_for_each_entry(i, &nf_ct_expect_hash[h], hnode) { if (!(i->flags & NF_CT_EXPECT_INACTIVE) && nf_ct_exp_equal(tuple, i, zone, net)) { exp = i; break; } } if (!exp) return NULL; /* If master is not in hash table yet (ie. packet hasn't left this machine yet), how can other end know about expected? Hence these are not the droids you are looking for (if master ct never got confirmed, we'd hold a reference to it and weird things would happen to future packets). */ if (!nf_ct_is_confirmed(exp->master)) return NULL; /* Avoid race with other CPUs, that for exp->master ct, is * about to invoke ->destroy(), or nf_ct_delete() via timeout * or early_drop(). * * The refcount_inc_not_zero() check tells: If that fails, we * know that the ct is being destroyed. If it succeeds, we * can be sure the ct cannot disappear underneath. */ if (unlikely(nf_ct_is_dying(exp->master) || !refcount_inc_not_zero(&exp->master->ct_general.use))) return NULL; if (exp->flags & NF_CT_EXPECT_PERMANENT || !unlink) { refcount_inc(&exp->use); return exp; } else if (del_timer(&exp->timeout)) { nf_ct_unlink_expect(exp); return exp; } /* Undo exp->master refcnt increase, if del_timer() failed */ nf_ct_put(exp->master); return NULL; } /* delete all expectations for this conntrack */ void nf_ct_remove_expectations(struct nf_conn *ct) { struct nf_conn_help *help = nfct_help(ct); struct nf_conntrack_expect *exp; struct hlist_node *next; /* Optimization: most connection never expect any others. */ if (!help) return; spin_lock_bh(&nf_conntrack_expect_lock); hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) { nf_ct_remove_expect(exp); } spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_remove_expectations); /* Would two expected things clash? */ static inline int expect_clash(const struct nf_conntrack_expect *a, const struct nf_conntrack_expect *b) { /* Part covered by intersection of masks must be unequal, otherwise they clash */ struct nf_conntrack_tuple_mask intersect_mask; int count; intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all; for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){ intersect_mask.src.u3.all[count] = a->mask.src.u3.all[count] & b->mask.src.u3.all[count]; } return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask) && net_eq(nf_ct_net(a->master), nf_ct_net(b->master)) && nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master)); } static inline int expect_matches(const struct nf_conntrack_expect *a, const struct nf_conntrack_expect *b) { return nf_ct_tuple_equal(&a->tuple, &b->tuple) && nf_ct_tuple_mask_equal(&a->mask, &b->mask) && net_eq(nf_ct_net(a->master), nf_ct_net(b->master)) && nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master)); } static bool master_matches(const struct nf_conntrack_expect *a, const struct nf_conntrack_expect *b, unsigned int flags) { if (flags & NF_CT_EXP_F_SKIP_MASTER) return true; return a->master == b->master; } /* Generally a bad idea to call this: could have matched already. */ void nf_ct_unexpect_related(struct nf_conntrack_expect *exp) { spin_lock_bh(&nf_conntrack_expect_lock); nf_ct_remove_expect(exp); spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_unexpect_related); /* We don't increase the master conntrack refcount for non-fulfilled * conntracks. During the conntrack destruction, the expectations are * always killed before the conntrack itself */ struct nf_conntrack_expect *nf_ct_expect_alloc(struct nf_conn *me) { struct nf_conntrack_expect *new; new = kmem_cache_alloc(nf_ct_expect_cachep, GFP_ATOMIC); if (!new) return NULL; new->master = me; refcount_set(&new->use, 1); return new; } EXPORT_SYMBOL_GPL(nf_ct_expect_alloc); void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class, u_int8_t family, const union nf_inet_addr *saddr, const union nf_inet_addr *daddr, u_int8_t proto, const __be16 *src, const __be16 *dst) { int len; if (family == AF_INET) len = 4; else len = 16; exp->flags = 0; exp->class = class; exp->expectfn = NULL; exp->helper = NULL; exp->tuple.src.l3num = family; exp->tuple.dst.protonum = proto; if (saddr) { memcpy(&exp->tuple.src.u3, saddr, len); if (sizeof(exp->tuple.src.u3) > len) /* address needs to be cleared for nf_ct_tuple_equal */ memset((void *)&exp->tuple.src.u3 + len, 0x00, sizeof(exp->tuple.src.u3) - len); memset(&exp->mask.src.u3, 0xFF, len); if (sizeof(exp->mask.src.u3) > len) memset((void *)&exp->mask.src.u3 + len, 0x00, sizeof(exp->mask.src.u3) - len); } else { memset(&exp->tuple.src.u3, 0x00, sizeof(exp->tuple.src.u3)); memset(&exp->mask.src.u3, 0x00, sizeof(exp->mask.src.u3)); } if (src) { exp->tuple.src.u.all = *src; exp->mask.src.u.all = htons(0xFFFF); } else { exp->tuple.src.u.all = 0; exp->mask.src.u.all = 0; } memcpy(&exp->tuple.dst.u3, daddr, len); if (sizeof(exp->tuple.dst.u3) > len) /* address needs to be cleared for nf_ct_tuple_equal */ memset((void *)&exp->tuple.dst.u3 + len, 0x00, sizeof(exp->tuple.dst.u3) - len); exp->tuple.dst.u.all = *dst; #if IS_ENABLED(CONFIG_NF_NAT) memset(&exp->saved_addr, 0, sizeof(exp->saved_addr)); memset(&exp->saved_proto, 0, sizeof(exp->saved_proto)); #endif } EXPORT_SYMBOL_GPL(nf_ct_expect_init); static void nf_ct_expect_free_rcu(struct rcu_head *head) { struct nf_conntrack_expect *exp; exp = container_of(head, struct nf_conntrack_expect, rcu); kmem_cache_free(nf_ct_expect_cachep, exp); } void nf_ct_expect_put(struct nf_conntrack_expect *exp) { if (refcount_dec_and_test(&exp->use)) call_rcu(&exp->rcu, nf_ct_expect_free_rcu); } EXPORT_SYMBOL_GPL(nf_ct_expect_put); static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) { struct nf_conntrack_net *cnet; struct nf_conn_help *master_help = nfct_help(exp->master); struct nf_conntrack_helper *helper; struct net *net = nf_ct_exp_net(exp); unsigned int h = nf_ct_expect_dst_hash(net, &exp->tuple); /* two references : one for hash insert, one for the timer */ refcount_add(2, &exp->use); timer_setup(&exp->timeout, nf_ct_expectation_timed_out, 0); helper = rcu_dereference_protected(master_help->helper, lockdep_is_held(&nf_conntrack_expect_lock)); if (helper) { exp->timeout.expires = jiffies + helper->expect_policy[exp->class].timeout * HZ; } add_timer(&exp->timeout); hlist_add_head_rcu(&exp->lnode, &master_help->expectations); master_help->expecting[exp->class]++; hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]); cnet = nf_ct_pernet(net); cnet->expect_count++; NF_CT_STAT_INC(net, expect_create); } /* Race with expectations being used means we could have none to find; OK. */ static void evict_oldest_expect(struct nf_conn *master, struct nf_conntrack_expect *new) { struct nf_conn_help *master_help = nfct_help(master); struct nf_conntrack_expect *exp, *last = NULL; hlist_for_each_entry(exp, &master_help->expectations, lnode) { if (exp->class == new->class) last = exp; } if (last) nf_ct_remove_expect(last); } static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect, unsigned int flags) { const struct nf_conntrack_expect_policy *p; struct nf_conntrack_expect *i; struct nf_conntrack_net *cnet; struct nf_conn *master = expect->master; struct nf_conn_help *master_help = nfct_help(master); struct nf_conntrack_helper *helper; struct net *net = nf_ct_exp_net(expect); struct hlist_node *next; unsigned int h; int ret = 0; if (!master_help) { ret = -ESHUTDOWN; goto out; } h = nf_ct_expect_dst_hash(net, &expect->tuple); hlist_for_each_entry_safe(i, next, &nf_ct_expect_hash[h], hnode) { if (master_matches(i, expect, flags) && expect_matches(i, expect)) { if (i->class != expect->class || i->master != expect->master) return -EALREADY; if (nf_ct_remove_expect(i)) break; } else if (expect_clash(i, expect)) { ret = -EBUSY; goto out; } } /* Will be over limit? */ helper = rcu_dereference_protected(master_help->helper, lockdep_is_held(&nf_conntrack_expect_lock)); if (helper) { p = &helper->expect_policy[expect->class]; if (p->max_expected && master_help->expecting[expect->class] >= p->max_expected) { evict_oldest_expect(master, expect); if (master_help->expecting[expect->class] >= p->max_expected) { ret = -EMFILE; goto out; } } } cnet = nf_ct_pernet(net); if (cnet->expect_count >= nf_ct_expect_max) { net_warn_ratelimited("nf_conntrack: expectation table full\n"); ret = -EMFILE; } out: return ret; } int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, u32 portid, int report, unsigned int flags) { int ret; spin_lock_bh(&nf_conntrack_expect_lock); ret = __nf_ct_expect_check(expect, flags); if (ret < 0) goto out; nf_ct_expect_insert(expect); spin_unlock_bh(&nf_conntrack_expect_lock); nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report); return 0; out: spin_unlock_bh(&nf_conntrack_expect_lock); return ret; } EXPORT_SYMBOL_GPL(nf_ct_expect_related_report); void nf_ct_expect_iterate_destroy(bool (*iter)(struct nf_conntrack_expect *e, void *data), void *data) { struct nf_conntrack_expect *exp; const struct hlist_node *next; unsigned int i; spin_lock_bh(&nf_conntrack_expect_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, next, &nf_ct_expect_hash[i], hnode) { if (iter(exp, data) && del_timer(&exp->timeout)) { nf_ct_unlink_expect(exp); nf_ct_expect_put(exp); } } } spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_expect_iterate_destroy); void nf_ct_expect_iterate_net(struct net *net, bool (*iter)(struct nf_conntrack_expect *e, void *data), void *data, u32 portid, int report) { struct nf_conntrack_expect *exp; const struct hlist_node *next; unsigned int i; spin_lock_bh(&nf_conntrack_expect_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, next, &nf_ct_expect_hash[i], hnode) { if (!net_eq(nf_ct_exp_net(exp), net)) continue; if (iter(exp, data) && del_timer(&exp->timeout)) { nf_ct_unlink_expect_report(exp, portid, report); nf_ct_expect_put(exp); } } } spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_expect_iterate_net); #ifdef CONFIG_NF_CONNTRACK_PROCFS struct ct_expect_iter_state { struct seq_net_private p; unsigned int bucket; }; static struct hlist_node *ct_expect_get_first(struct seq_file *seq) { struct ct_expect_iter_state *st = seq->private; struct hlist_node *n; for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { n = rcu_dereference(hlist_first_rcu(&nf_ct_expect_hash[st->bucket])); if (n) return n; } return NULL; } static struct hlist_node *ct_expect_get_next(struct seq_file *seq, struct hlist_node *head) { struct ct_expect_iter_state *st = seq->private; head = rcu_dereference(hlist_next_rcu(head)); while (head == NULL) { if (++st->bucket >= nf_ct_expect_hsize) return NULL; head = rcu_dereference(hlist_first_rcu(&nf_ct_expect_hash[st->bucket])); } return head; } static struct hlist_node *ct_expect_get_idx(struct seq_file *seq, loff_t pos) { struct hlist_node *head = ct_expect_get_first(seq); if (head) while (pos && (head = ct_expect_get_next(seq, head))) pos--; return pos ? NULL : head; } static void *exp_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { rcu_read_lock(); return ct_expect_get_idx(seq, *pos); } static void *exp_seq_next(struct seq_file *seq, void *v, loff_t *pos) { (*pos)++; return ct_expect_get_next(seq, v); } static void exp_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static int exp_seq_show(struct seq_file *s, void *v) { struct nf_conntrack_expect *expect; struct nf_conntrack_helper *helper; struct hlist_node *n = v; char *delim = ""; expect = hlist_entry(n, struct nf_conntrack_expect, hnode); if (expect->timeout.function) seq_printf(s, "%ld ", timer_pending(&expect->timeout) ? (long)(expect->timeout.expires - jiffies)/HZ : 0); else seq_puts(s, "- "); seq_printf(s, "l3proto = %u proto=%u ", expect->tuple.src.l3num, expect->tuple.dst.protonum); print_tuple(s, &expect->tuple, nf_ct_l4proto_find(expect->tuple.dst.protonum)); if (expect->flags & NF_CT_EXPECT_PERMANENT) { seq_puts(s, "PERMANENT"); delim = ","; } if (expect->flags & NF_CT_EXPECT_INACTIVE) { seq_printf(s, "%sINACTIVE", delim); delim = ","; } if (expect->flags & NF_CT_EXPECT_USERSPACE) seq_printf(s, "%sUSERSPACE", delim); helper = rcu_dereference(nfct_help(expect->master)->helper); if (helper) { seq_printf(s, "%s%s", expect->flags ? " " : "", helper->name); if (helper->expect_policy[expect->class].name[0]) seq_printf(s, "/%s", helper->expect_policy[expect->class].name); } seq_putc(s, '\n'); return 0; } static const struct seq_operations exp_seq_ops = { .start = exp_seq_start, .next = exp_seq_next, .stop = exp_seq_stop, .show = exp_seq_show }; #endif /* CONFIG_NF_CONNTRACK_PROCFS */ static int exp_proc_init(struct net *net) { #ifdef CONFIG_NF_CONNTRACK_PROCFS struct proc_dir_entry *proc; kuid_t root_uid; kgid_t root_gid; proc = proc_create_net("nf_conntrack_expect", 0440, net->proc_net, &exp_seq_ops, sizeof(struct ct_expect_iter_state)); if (!proc) return -ENOMEM; root_uid = make_kuid(net->user_ns, 0); root_gid = make_kgid(net->user_ns, 0); if (uid_valid(root_uid) && gid_valid(root_gid)) proc_set_user(proc, root_uid, root_gid); #endif /* CONFIG_NF_CONNTRACK_PROCFS */ return 0; } static void exp_proc_remove(struct net *net) { #ifdef CONFIG_NF_CONNTRACK_PROCFS remove_proc_entry("nf_conntrack_expect", net->proc_net); #endif /* CONFIG_NF_CONNTRACK_PROCFS */ } module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0400); int nf_conntrack_expect_pernet_init(struct net *net) { return exp_proc_init(net); } void nf_conntrack_expect_pernet_fini(struct net *net) { exp_proc_remove(net); } int nf_conntrack_expect_init(void) { if (!nf_ct_expect_hsize) { nf_ct_expect_hsize = nf_conntrack_htable_size / 256; if (!nf_ct_expect_hsize) nf_ct_expect_hsize = 1; } nf_ct_expect_max = nf_ct_expect_hsize * 4; nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect", sizeof(struct nf_conntrack_expect), 0, 0, NULL); if (!nf_ct_expect_cachep) return -ENOMEM; nf_ct_expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0); if (!nf_ct_expect_hash) { kmem_cache_destroy(nf_ct_expect_cachep); return -ENOMEM; } return 0; } void nf_conntrack_expect_fini(void) { rcu_barrier(); /* Wait for call_rcu() before destroy */ kmem_cache_destroy(nf_ct_expect_cachep); kvfree(nf_ct_expect_hash); }
1 10 10 10 10 10 10 10 10 10 10 12 12 12 12 12 12 12 12 12 28 28 28 6 6 1 5 3 4 4 6 11 11 11 11 1 1 1 1 1 11 11 11 11 11 4 1 11 11 11 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 11 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 // SPDX-License-Identifier: GPL-2.0-only /* * mac80211 configuration hooks for cfg80211 * * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2015 Intel Mobile Communications GmbH * Copyright (C) 2015-2017 Intel Deutschland GmbH * Copyright (C) 2018-2022 Intel Corporation */ #include <linux/ieee80211.h> #include <linux/nl80211.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <net/net_namespace.h> #include <linux/rcupdate.h> #include <linux/fips.h> #include <linux/if_ether.h> #include <net/cfg80211.h> #include "ieee80211_i.h" #include "driver-ops.h" #include "rate.h" #include "mesh.h" #include "wme.h" static struct ieee80211_link_data * ieee80211_link_or_deflink(struct ieee80211_sub_if_data *sdata, int link_id, bool require_valid) { struct ieee80211_link_data *link; if (link_id < 0) { /* * For keys, if sdata is not an MLD, we might not use * the return value at all (if it's not a pairwise key), * so in that case (require_valid==false) don't error. */ if (require_valid && ieee80211_vif_is_mld(&sdata->vif)) return ERR_PTR(-EINVAL); return &sdata->deflink; } link = sdata_dereference(sdata->link[link_id], sdata); if (!link) return ERR_PTR(-ENOLINK); return link; } static void ieee80211_set_mu_mimo_follow(struct ieee80211_sub_if_data *sdata, struct vif_params *params) { bool mu_mimo_groups = false; bool mu_mimo_follow = false; if (params->vht_mumimo_groups) { u64 membership; BUILD_BUG_ON(sizeof(membership) != WLAN_MEMBERSHIP_LEN); memcpy(sdata->vif.bss_conf.mu_group.membership, params->vht_mumimo_groups, WLAN_MEMBERSHIP_LEN); memcpy(sdata->vif.bss_conf.mu_group.position, params->vht_mumimo_groups + WLAN_MEMBERSHIP_LEN, WLAN_USER_POSITION_LEN); ieee80211_link_info_change_notify(sdata, &sdata->deflink, BSS_CHANGED_MU_GROUPS); /* don't care about endianness - just check for 0 */ memcpy(&membership, params->vht_mumimo_groups, WLAN_MEMBERSHIP_LEN); mu_mimo_groups = membership != 0; } if (params->vht_mumimo_follow_addr) { mu_mimo_follow = is_valid_ether_addr(params->vht_mumimo_follow_addr); ether_addr_copy(sdata->u.mntr.mu_follow_addr, params->vht_mumimo_follow_addr); } sdata->vif.bss_conf.mu_mimo_owner = mu_mimo_groups || mu_mimo_follow; } static int ieee80211_set_mon_options(struct ieee80211_sub_if_data *sdata, struct vif_params *params) { struct ieee80211_local *local = sdata->local; struct ieee80211_sub_if_data *monitor_sdata; /* check flags first */ if (params->flags && ieee80211_sdata_running(sdata)) { u32 mask = MONITOR_FLAG_COOK_FRAMES | MONITOR_FLAG_ACTIVE; /* * Prohibit MONITOR_FLAG_COOK_FRAMES and * MONITOR_FLAG_ACTIVE to be changed while the * interface is up. * Else we would need to add a lot of cruft * to update everything: * cooked_mntrs, monitor and all fif_* counters * reconfigure hardware */ if ((params->flags & mask) != (sdata->u.mntr.flags & mask)) return -EBUSY; } /* also validate MU-MIMO change */ monitor_sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata); if (!monitor_sdata && (params->vht_mumimo_groups || params->vht_mumimo_follow_addr)) return -EOPNOTSUPP; /* apply all changes now - no failures allowed */ if (monitor_sdata) ieee80211_set_mu_mimo_follow(monitor_sdata, params); if (params->flags) { if (ieee80211_sdata_running(sdata)) { ieee80211_adjust_monitor_flags(sdata, -1); sdata->u.mntr.flags = params->flags; ieee80211_adjust_monitor_flags(sdata, 1); ieee80211_configure_filter(local); } else { /* * Because the interface is down, ieee80211_do_stop * and ieee80211_do_open take care of "everything" * mentioned in the comment above. */ sdata->u.mntr.flags = params->flags; } } return 0; } static int ieee80211_set_ap_mbssid_options(struct ieee80211_sub_if_data *sdata, struct cfg80211_mbssid_config params, struct ieee80211_bss_conf *link_conf) { struct ieee80211_sub_if_data *tx_sdata; sdata->vif.mbssid_tx_vif = NULL; link_conf->bssid_index = 0; link_conf->nontransmitted = false; link_conf->ema_ap = false; link_conf->bssid_indicator = 0; if (sdata->vif.type != NL80211_IFTYPE_AP || !params.tx_wdev) return -EINVAL; tx_sdata = IEEE80211_WDEV_TO_SUB_IF(params.tx_wdev); if (!tx_sdata) return -EINVAL; if (tx_sdata == sdata) { sdata->vif.mbssid_tx_vif = &sdata->vif; } else { sdata->vif.mbssid_tx_vif = &tx_sdata->vif; link_conf->nontransmitted = true; link_conf->bssid_index = params.index; } if (params.ema) link_conf->ema_ap = true; return 0; } static struct wireless_dev *ieee80211_add_iface(struct wiphy *wiphy, const char *name, unsigned char name_assign_type, enum nl80211_iftype type, struct vif_params *params) { struct ieee80211_local *local = wiphy_priv(wiphy); struct wireless_dev *wdev; struct ieee80211_sub_if_data *sdata; int err; err = ieee80211_if_add(local, name, name_assign_type, &wdev, type, params); if (err) return ERR_PTR(err); sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); if (type == NL80211_IFTYPE_MONITOR) { err = ieee80211_set_mon_options(sdata, params); if (err) { ieee80211_if_remove(sdata); return NULL; } } return wdev; } static int ieee80211_del_iface(struct wiphy *wiphy, struct wireless_dev *wdev) { ieee80211_if_remove(IEEE80211_WDEV_TO_SUB_IF(wdev)); return 0; } static int ieee80211_change_iface(struct wiphy *wiphy, struct net_device *dev, enum nl80211_iftype type, struct vif_params *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct sta_info *sta; int ret; ret = ieee80211_if_change_type(sdata, type); if (ret) return ret; if (type == NL80211_IFTYPE_AP_VLAN && params->use_4addr == 0) { RCU_INIT_POINTER(sdata->u.vlan.sta, NULL); ieee80211_check_fast_rx_iface(sdata); } else if (type == NL80211_IFTYPE_STATION && params->use_4addr >= 0) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; if (params->use_4addr == ifmgd->use_4addr) return 0; /* FIXME: no support for 4-addr MLO yet */ if (ieee80211_vif_is_mld(&sdata->vif)) return -EOPNOTSUPP; sdata->u.mgd.use_4addr = params->use_4addr; if (!ifmgd->associated) return 0; mutex_lock(&local->sta_mtx); sta = sta_info_get(sdata, sdata->deflink.u.mgd.bssid); if (sta) drv_sta_set_4addr(local, sdata, &sta->sta, params->use_4addr); mutex_unlock(&local->sta_mtx); if (params->use_4addr) ieee80211_send_4addr_nullfunc(local, sdata); } if (sdata->vif.type == NL80211_IFTYPE_MONITOR) { ret = ieee80211_set_mon_options(sdata, params); if (ret) return ret; } return 0; } static int ieee80211_start_p2p_device(struct wiphy *wiphy, struct wireless_dev *wdev) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); int ret; mutex_lock(&sdata->local->chanctx_mtx); ret = ieee80211_check_combinations(sdata, NULL, 0, 0); mutex_unlock(&sdata->local->chanctx_mtx); if (ret < 0) return ret; return ieee80211_do_open(wdev, true); } static void ieee80211_stop_p2p_device(struct wiphy *wiphy, struct wireless_dev *wdev) { ieee80211_sdata_stop(IEEE80211_WDEV_TO_SUB_IF(wdev)); } static int ieee80211_start_nan(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_nan_conf *conf) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); int ret; mutex_lock(&sdata->local->chanctx_mtx); ret = ieee80211_check_combinations(sdata, NULL, 0, 0); mutex_unlock(&sdata->local->chanctx_mtx); if (ret < 0) return ret; ret = ieee80211_do_open(wdev, true); if (ret) return ret; ret = drv_start_nan(sdata->local, sdata, conf); if (ret) ieee80211_sdata_stop(sdata); sdata->u.nan.conf = *conf; return ret; } static void ieee80211_stop_nan(struct wiphy *wiphy, struct wireless_dev *wdev) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); drv_stop_nan(sdata->local, sdata); ieee80211_sdata_stop(sdata); } static int ieee80211_nan_change_conf(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_nan_conf *conf, u32 changes) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); struct cfg80211_nan_conf new_conf; int ret = 0; if (sdata->vif.type != NL80211_IFTYPE_NAN) return -EOPNOTSUPP; if (!ieee80211_sdata_running(sdata)) return -ENETDOWN; new_conf = sdata->u.nan.conf; if (changes & CFG80211_NAN_CONF_CHANGED_PREF) new_conf.master_pref = conf->master_pref; if (changes & CFG80211_NAN_CONF_CHANGED_BANDS) new_conf.bands = conf->bands; ret = drv_nan_change_conf(sdata->local, sdata, &new_conf, changes); if (!ret) sdata->u.nan.conf = new_conf; return ret; } static int ieee80211_add_nan_func(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_nan_func *nan_func) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); int ret; if (sdata->vif.type != NL80211_IFTYPE_NAN) return -EOPNOTSUPP; if (!ieee80211_sdata_running(sdata)) return -ENETDOWN; spin_lock_bh(&sdata->u.nan.func_lock); ret = idr_alloc(&sdata->u.nan.function_inst_ids, nan_func, 1, sdata->local->hw.max_nan_de_entries + 1, GFP_ATOMIC); spin_unlock_bh(&sdata->u.nan.func_lock); if (ret < 0) return ret; nan_func->instance_id = ret; WARN_ON(nan_func->instance_id == 0); ret = drv_add_nan_func(sdata->local, sdata, nan_func); if (ret) { spin_lock_bh(&sdata->u.nan.func_lock); idr_remove(&sdata->u.nan.function_inst_ids, nan_func->instance_id); spin_unlock_bh(&sdata->u.nan.func_lock); } return ret; } static struct cfg80211_nan_func * ieee80211_find_nan_func_by_cookie(struct ieee80211_sub_if_data *sdata, u64 cookie) { struct cfg80211_nan_func *func; int id; lockdep_assert_held(&sdata->u.nan.func_lock); idr_for_each_entry(&sdata->u.nan.function_inst_ids, func, id) { if (func->cookie == cookie) return func; } return NULL; } static void ieee80211_del_nan_func(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); struct cfg80211_nan_func *func; u8 instance_id = 0; if (sdata->vif.type != NL80211_IFTYPE_NAN || !ieee80211_sdata_running(sdata)) return; spin_lock_bh(&sdata->u.nan.func_lock); func = ieee80211_find_nan_func_by_cookie(sdata, cookie); if (func) instance_id = func->instance_id; spin_unlock_bh(&sdata->u.nan.func_lock); if (instance_id) drv_del_nan_func(sdata->local, sdata, instance_id); } static int ieee80211_set_noack_map(struct wiphy *wiphy, struct net_device *dev, u16 noack_map) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); sdata->noack_map = noack_map; ieee80211_check_fast_xmit_iface(sdata); return 0; } static int ieee80211_set_tx(struct ieee80211_sub_if_data *sdata, const u8 *mac_addr, u8 key_idx) { struct ieee80211_local *local = sdata->local; struct ieee80211_key *key; struct sta_info *sta; int ret = -EINVAL; if (!wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_EXT_KEY_ID)) return -EINVAL; sta = sta_info_get_bss(sdata, mac_addr); if (!sta) return -EINVAL; if (sta->ptk_idx == key_idx) return 0; mutex_lock(&local->key_mtx); key = key_mtx_dereference(local, sta->ptk[key_idx]); if (key && key->conf.flags & IEEE80211_KEY_FLAG_NO_AUTO_TX) ret = ieee80211_set_tx_key(key); mutex_unlock(&local->key_mtx); return ret; } static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, int link_id, u8 key_idx, bool pairwise, const u8 *mac_addr, struct key_params *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_link_data *link = ieee80211_link_or_deflink(sdata, link_id, false); struct ieee80211_local *local = sdata->local; struct sta_info *sta = NULL; struct ieee80211_key *key; int err; if (!ieee80211_sdata_running(sdata)) return -ENETDOWN; if (IS_ERR(link)) return PTR_ERR(link); if (pairwise && params->mode == NL80211_KEY_SET_TX) return ieee80211_set_tx(sdata, mac_addr, key_idx); /* reject WEP and TKIP keys if WEP failed to initialize */ switch (params->cipher) { case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_TKIP: case WLAN_CIPHER_SUITE_WEP104: if (link_id >= 0) return -EINVAL; if (WARN_ON_ONCE(fips_enabled)) return -EINVAL; break; default: break; } key = ieee80211_key_alloc(params->cipher, key_idx, params->key_len, params->key, params->seq_len, params->seq); if (IS_ERR(key)) return PTR_ERR(key); key->conf.link_id = link_id; if (pairwise) key->conf.flags |= IEEE80211_KEY_FLAG_PAIRWISE; if (params->mode == NL80211_KEY_NO_TX) key->conf.flags |= IEEE80211_KEY_FLAG_NO_AUTO_TX; mutex_lock(&local->sta_mtx); if (mac_addr) { sta = sta_info_get_bss(sdata, mac_addr); /* * The ASSOC test makes sure the driver is ready to * receive the key. When wpa_supplicant has roamed * using FT, it attempts to set the key before * association has completed, this rejects that attempt * so it will set the key again after association. * * TODO: accept the key if we have a station entry and * add it to the device after the station. */ if (!sta || !test_sta_flag(sta, WLAN_STA_ASSOC)) { ieee80211_key_free_unused(key); err = -ENOENT; goto out_unlock; } } switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: if (sdata->u.mgd.mfp != IEEE80211_MFP_DISABLED) key->conf.flags |= IEEE80211_KEY_FLAG_RX_MGMT; break; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: /* Keys without a station are used for TX only */ if (sta && test_sta_flag(sta, WLAN_STA_MFP)) key->conf.flags |= IEEE80211_KEY_FLAG_RX_MGMT; break; case NL80211_IFTYPE_ADHOC: /* no MFP (yet) */ break; case NL80211_IFTYPE_MESH_POINT: #ifdef CONFIG_MAC80211_MESH if (sdata->u.mesh.security != IEEE80211_MESH_SEC_NONE) key->conf.flags |= IEEE80211_KEY_FLAG_RX_MGMT; break; #endif case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_P2P_DEVICE: case NL80211_IFTYPE_NAN: case NL80211_IFTYPE_UNSPECIFIED: case NUM_NL80211_IFTYPES: case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_OCB: /* shouldn't happen */ WARN_ON_ONCE(1); break; } err = ieee80211_key_link(key, link, sta); /* KRACK protection, shouldn't happen but just silently accept key */ if (err == -EALREADY) err = 0; out_unlock: mutex_unlock(&local->sta_mtx); return err; } static struct ieee80211_key * ieee80211_lookup_key(struct ieee80211_sub_if_data *sdata, int link_id, u8 key_idx, bool pairwise, const u8 *mac_addr) { struct ieee80211_local *local __maybe_unused = sdata->local; struct ieee80211_link_data *link = &sdata->deflink; struct ieee80211_key *key; if (link_id >= 0) { link = rcu_dereference_check(sdata->link[link_id], lockdep_is_held(&sdata->wdev.mtx)); if (!link) return NULL; } if (mac_addr) { struct sta_info *sta; struct link_sta_info *link_sta; sta = sta_info_get_bss(sdata, mac_addr); if (!sta) return NULL; if (link_id >= 0) { link_sta = rcu_dereference_check(sta->link[link_id], lockdep_is_held(&local->sta_mtx)); if (!link_sta) return NULL; } else { link_sta = &sta->deflink; } if (pairwise && key_idx < NUM_DEFAULT_KEYS) return rcu_dereference_check_key_mtx(local, sta->ptk[key_idx]); if (!pairwise && key_idx < NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS + NUM_DEFAULT_BEACON_KEYS) return rcu_dereference_check_key_mtx(local, link_sta->gtk[key_idx]); return NULL; } if (pairwise && key_idx < NUM_DEFAULT_KEYS) return rcu_dereference_check_key_mtx(local, sdata->keys[key_idx]); key = rcu_dereference_check_key_mtx(local, link->gtk[key_idx]); if (key) return key; /* or maybe it was a WEP key */ if (key_idx < NUM_DEFAULT_KEYS) return rcu_dereference_check_key_mtx(local, sdata->keys[key_idx]); return NULL; } static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev, int link_id, u8 key_idx, bool pairwise, const u8 *mac_addr) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct ieee80211_key *key; int ret; mutex_lock(&local->sta_mtx); mutex_lock(&local->key_mtx); key = ieee80211_lookup_key(sdata, link_id, key_idx, pairwise, mac_addr); if (!key) { ret = -ENOENT; goto out_unlock; } ieee80211_key_free(key, sdata->vif.type == NL80211_IFTYPE_STATION); ret = 0; out_unlock: mutex_unlock(&local->key_mtx); mutex_unlock(&local->sta_mtx); return ret; } static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, int link_id, u8 key_idx, bool pairwise, const u8 *mac_addr, void *cookie, void (*callback)(void *cookie, struct key_params *params)) { struct ieee80211_sub_if_data *sdata; u8 seq[6] = {0}; struct key_params params; struct ieee80211_key *key; u64 pn64; u32 iv32; u16 iv16; int err = -ENOENT; struct ieee80211_key_seq kseq = {}; sdata = IEEE80211_DEV_TO_SUB_IF(dev); rcu_read_lock(); key = ieee80211_lookup_key(sdata, link_id, key_idx, pairwise, mac_addr); if (!key) goto out; memset(&params, 0, sizeof(params)); params.cipher = key->conf.cipher; switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_TKIP: pn64 = atomic64_read(&key->conf.tx_pn); iv32 = TKIP_PN_TO_IV32(pn64); iv16 = TKIP_PN_TO_IV16(pn64); if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE && !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) { drv_get_key_seq(sdata->local, key, &kseq); iv32 = kseq.tkip.iv32; iv16 = kseq.tkip.iv16; } seq[0] = iv16 & 0xff; seq[1] = (iv16 >> 8) & 0xff; seq[2] = iv32 & 0xff; seq[3] = (iv32 >> 8) & 0xff; seq[4] = (iv32 >> 16) & 0xff; seq[5] = (iv32 >> 24) & 0xff; params.seq = seq; params.seq_len = 6; break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) != offsetof(typeof(kseq), aes_cmac)); fallthrough; case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) != offsetof(typeof(kseq), aes_gmac)); fallthrough; case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) != offsetof(typeof(kseq), gcmp)); if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE && !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) { drv_get_key_seq(sdata->local, key, &kseq); memcpy(seq, kseq.ccmp.pn, 6); } else { pn64 = atomic64_read(&key->conf.tx_pn); seq[0] = pn64; seq[1] = pn64 >> 8; seq[2] = pn64 >> 16; seq[3] = pn64 >> 24; seq[4] = pn64 >> 32; seq[5] = pn64 >> 40; } params.seq = seq; params.seq_len = 6; break; default: if (!(key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)) break; if (WARN_ON(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) break; drv_get_key_seq(sdata->local, key, &kseq); params.seq = kseq.hw.seq; params.seq_len = kseq.hw.seq_len; break; } params.key = key->conf.key; params.key_len = key->conf.keylen; callback(cookie, &params); err = 0; out: rcu_read_unlock(); return err; } static int ieee80211_config_default_key(struct wiphy *wiphy, struct net_device *dev, int link_id, u8 key_idx, bool uni, bool multi) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_link_data *link = ieee80211_link_or_deflink(sdata, link_id, false); if (IS_ERR(link)) return PTR_ERR(link); ieee80211_set_default_key(link, key_idx, uni, multi); return 0; } static int ieee80211_config_default_mgmt_key(struct wiphy *wiphy, struct net_device *dev, int link_id, u8 key_idx) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_link_data *link = ieee80211_link_or_deflink(sdata, link_id, true); if (IS_ERR(link)) return PTR_ERR(link); ieee80211_set_default_mgmt_key(link, key_idx); return 0; } static int ieee80211_config_default_beacon_key(struct wiphy *wiphy, struct net_device *dev, int link_id, u8 key_idx) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_link_data *link = ieee80211_link_or_deflink(sdata, link_id, true); if (IS_ERR(link)) return PTR_ERR(link); ieee80211_set_default_beacon_key(link, key_idx); return 0; } void sta_set_rate_info_tx(struct sta_info *sta, const struct ieee80211_tx_rate *rate, struct rate_info *rinfo) { rinfo->flags = 0; if (rate->flags & IEEE80211_TX_RC_MCS) { rinfo->flags |= RATE_INFO_FLAGS_MCS; rinfo->mcs = rate->idx; } else if (rate->flags & IEEE80211_TX_RC_VHT_MCS) { rinfo->flags |= RATE_INFO_FLAGS_VHT_MCS; rinfo->mcs = ieee80211_rate_get_vht_mcs(rate); rinfo->nss = ieee80211_rate_get_vht_nss(rate); } else { struct ieee80211_supported_band *sband; int shift = ieee80211_vif_get_shift(&sta->sdata->vif); u16 brate; sband = ieee80211_get_sband(sta->sdata); WARN_ON_ONCE(sband && !sband->bitrates); if (sband && sband->bitrates) { brate = sband->bitrates[rate->idx].bitrate; rinfo->legacy = DIV_ROUND_UP(brate, 1 << shift); } } if (rate->flags & IEEE80211_TX_RC_40_MHZ_WIDTH) rinfo->bw = RATE_INFO_BW_40; else if (rate->flags & IEEE80211_TX_RC_80_MHZ_WIDTH) rinfo->bw = RATE_INFO_BW_80; else if (rate->flags & IEEE80211_TX_RC_160_MHZ_WIDTH) rinfo->bw = RATE_INFO_BW_160; else rinfo->bw = RATE_INFO_BW_20; if (rate->flags & IEEE80211_TX_RC_SHORT_GI) rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI; } static int ieee80211_dump_station(struct wiphy *wiphy, struct net_device *dev, int idx, u8 *mac, struct station_info *sinfo) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct sta_info *sta; int ret = -ENOENT; mutex_lock(&local->sta_mtx); sta = sta_info_get_by_idx(sdata, idx); if (sta) { ret = 0; memcpy(mac, sta->sta.addr, ETH_ALEN); sta_set_sinfo(sta, sinfo, true); } mutex_unlock(&local->sta_mtx); return ret; } static int ieee80211_dump_survey(struct wiphy *wiphy, struct net_device *dev, int idx, struct survey_info *survey) { struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); return drv_get_survey(local, idx, survey); } static int ieee80211_get_station(struct wiphy *wiphy, struct net_device *dev, const u8 *mac, struct station_info *sinfo) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct sta_info *sta; int ret = -ENOENT; mutex_lock(&local->sta_mtx); sta = sta_info_get_bss(sdata, mac); if (sta) { ret = 0; sta_set_sinfo(sta, sinfo, true); } mutex_unlock(&local->sta_mtx); return ret; } static int ieee80211_set_monitor_channel(struct wiphy *wiphy, struct cfg80211_chan_def *chandef) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata; int ret = 0; if (cfg80211_chandef_identical(&local->monitor_chandef, chandef)) return 0; if (local->use_chanctx) { sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata); if (sdata) { sdata_lock(sdata); mutex_lock(&local->mtx); ieee80211_link_release_channel(&sdata->deflink); ret = ieee80211_link_use_channel(&sdata->deflink, chandef, IEEE80211_CHANCTX_EXCLUSIVE); mutex_unlock(&local->mtx); sdata_unlock(sdata); } } else { mutex_lock(&local->mtx); if (local->open_count == local->monitors) { local->_oper_chandef = *chandef; ieee80211_hw_config(local, 0); } mutex_unlock(&local->mtx); } if (ret == 0) local->monitor_chandef = *chandef; return ret; } static int ieee80211_set_probe_resp(struct ieee80211_sub_if_data *sdata, const u8 *resp, size_t resp_len, const struct ieee80211_csa_settings *csa, const struct ieee80211_color_change_settings *cca, struct ieee80211_link_data *link) { struct probe_resp *new, *old; if (!resp || !resp_len) return 1; old = sdata_dereference(link->u.ap.probe_resp, sdata); new = kzalloc(sizeof(struct probe_resp) + resp_len, GFP_KERNEL); if (!new) return -ENOMEM; new->len = resp_len; memcpy(new->data, resp, resp_len); if (csa) memcpy(new->cntdwn_counter_offsets, csa->counter_offsets_presp, csa->n_counter_offsets_presp * sizeof(new->cntdwn_counter_offsets[0])); else if (cca) new->cntdwn_counter_offsets[0] = cca->counter_offset_presp; rcu_assign_pointer(link->u.ap.probe_resp, new); if (old) kfree_rcu(old, rcu_head); return 0; } static int ieee80211_set_fils_discovery(struct ieee80211_sub_if_data *sdata, struct cfg80211_fils_discovery *params, struct ieee80211_link_data *link, struct ieee80211_bss_conf *link_conf) { struct fils_discovery_data *new, *old = NULL; struct ieee80211_fils_discovery *fd; if (!params->tmpl || !params->tmpl_len) return -EINVAL; fd = &link_conf->fils_discovery; fd->min_interval = params->min_interval; fd->max_interval = params->max_interval; old = sdata_dereference(link->u.ap.fils_discovery, sdata); new = kzalloc(sizeof(*new) + params->tmpl_len, GFP_KERNEL); if (!new) return -ENOMEM; new->len = params->tmpl_len; memcpy(new->data, params->tmpl, params->tmpl_len); rcu_assign_pointer(link->u.ap.fils_discovery, new); if (old) kfree_rcu(old, rcu_head); return 0; } static int ieee80211_set_unsol_bcast_probe_resp(struct ieee80211_sub_if_data *sdata, struct cfg80211_unsol_bcast_probe_resp *params, struct ieee80211_link_data *link, struct ieee80211_bss_conf *link_conf) { struct unsol_bcast_probe_resp_data *new, *old = NULL; if (!params->tmpl || !params->tmpl_len) return -EINVAL; old = sdata_dereference(link->u.ap.unsol_bcast_probe_resp, sdata); new = kzalloc(sizeof(*new) + params->tmpl_len, GFP_KERNEL); if (!new) return -ENOMEM; new->len = params->tmpl_len; memcpy(new->data, params->tmpl, params->tmpl_len); rcu_assign_pointer(link->u.ap.unsol_bcast_probe_resp, new); if (old) kfree_rcu(old, rcu_head); link_conf->unsol_bcast_probe_resp_interval = params->interval; return 0; } static int ieee80211_set_ftm_responder_params( struct ieee80211_sub_if_data *sdata, const u8 *lci, size_t lci_len, const u8 *civicloc, size_t civicloc_len, struct ieee80211_bss_conf *link_conf) { struct ieee80211_ftm_responder_params *new, *old; u8 *pos; int len; if (!lci_len && !civicloc_len) return 0; old = link_conf->ftmr_params; len = lci_len + civicloc_len; new = kzalloc(sizeof(*new) + len, GFP_KERNEL); if (!new) return -ENOMEM; pos = (u8 *)(new + 1); if (lci_len) { new->lci_len = lci_len; new->lci = pos; memcpy(pos, lci, lci_len); pos += lci_len; } if (civicloc_len) { new->civicloc_len = civicloc_len; new->civicloc = pos; memcpy(pos, civicloc, civicloc_len); pos += civicloc_len; } link_conf->ftmr_params = new; kfree(old); return 0; } static int ieee80211_copy_mbssid_beacon(u8 *pos, struct cfg80211_mbssid_elems *dst, struct cfg80211_mbssid_elems *src) { int i, offset = 0; for (i = 0; i < src->cnt; i++) { memcpy(pos + offset, src->elem[i].data, src->elem[i].len); dst->elem[i].len = src->elem[i].len; dst->elem[i].data = pos + offset; offset += dst->elem[i].len; } dst->cnt = src->cnt; return offset; } static int ieee80211_copy_rnr_beacon(u8 *pos, struct cfg80211_rnr_elems *dst, struct cfg80211_rnr_elems *src) { int i, offset = 0; for (i = 0; i < src->cnt; i++) { memcpy(pos + offset, src->elem[i].data, src->elem[i].len); dst->elem[i].len = src->elem[i].len; dst->elem[i].data = pos + offset; offset += dst->elem[i].len; } dst->cnt = src->cnt; return offset; } static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, struct cfg80211_beacon_data *params, const struct ieee80211_csa_settings *csa, const struct ieee80211_color_change_settings *cca, u64 *changed) { struct cfg80211_mbssid_elems *mbssid = NULL; struct cfg80211_rnr_elems *rnr = NULL; struct beacon_data *new, *old; int new_head_len, new_tail_len; int size, err; u64 _changed = BSS_CHANGED_BEACON; struct ieee80211_bss_conf *link_conf = link->conf; old = sdata_dereference(link->u.ap.beacon, sdata); /* Need to have a beacon head if we don't have one yet */ if (!params->head && !old) return -EINVAL; /* new or old head? */ if (params->head) new_head_len = params->head_len; else new_head_len = old->head_len; /* new or old tail? */ if (params->tail || !old) /* params->tail_len will be zero for !params->tail */ new_tail_len = params->tail_len; else new_tail_len = old->tail_len; size = sizeof(*new) + new_head_len + new_tail_len; /* new or old multiple BSSID elements? */ if (params->mbssid_ies) { mbssid = params->mbssid_ies; size += struct_size(new->mbssid_ies, elem, mbssid->cnt); if (params->rnr_ies) { rnr = params->rnr_ies; size += struct_size(new->rnr_ies, elem, rnr->cnt); } size += ieee80211_get_mbssid_beacon_len(mbssid, rnr, mbssid->cnt); } else if (old && old->mbssid_ies) { mbssid = old->mbssid_ies; size += struct_size(new->mbssid_ies, elem, mbssid->cnt); if (old && old->rnr_ies) { rnr = old->rnr_ies; size += struct_size(new->rnr_ies, elem, rnr->cnt); } size += ieee80211_get_mbssid_beacon_len(mbssid, rnr, mbssid->cnt); } new = kzalloc(size, GFP_KERNEL); if (!new) return -ENOMEM; /* start filling the new info now */ /* * pointers go into the block we allocated, * memory is | beacon_data | head | tail | mbssid_ies | rnr_ies */ new->head = ((u8 *) new) + sizeof(*new); new->tail = new->head + new_head_len; new->head_len = new_head_len; new->tail_len = new_tail_len; /* copy in optional mbssid_ies */ if (mbssid) { u8 *pos = new->tail + new->tail_len; new->mbssid_ies = (void *)pos; pos += struct_size(new->mbssid_ies, elem, mbssid->cnt); pos += ieee80211_copy_mbssid_beacon(pos, new->mbssid_ies, mbssid); if (rnr) { new->rnr_ies = (void *)pos; pos += struct_size(new->rnr_ies, elem, rnr->cnt); ieee80211_copy_rnr_beacon(pos, new->rnr_ies, rnr); } /* update bssid_indicator */ link_conf->bssid_indicator = ilog2(__roundup_pow_of_two(mbssid->cnt + 1)); } if (csa) { new->cntdwn_current_counter = csa->count; memcpy(new->cntdwn_counter_offsets, csa->counter_offsets_beacon, csa->n_counter_offsets_beacon * sizeof(new->cntdwn_counter_offsets[0])); } else if (cca) { new->cntdwn_current_counter = cca->count; new->cntdwn_counter_offsets[0] = cca->counter_offset_beacon; } /* copy in head */ if (params->head) memcpy(new->head, params->head, new_head_len); else memcpy(new->head, old->head, new_head_len); /* copy in optional tail */ if (params->tail) memcpy(new->tail, params->tail, new_tail_len); else if (old) memcpy(new->tail, old->tail, new_tail_len); err = ieee80211_set_probe_resp(sdata, params->probe_resp, params->probe_resp_len, csa, cca, link); if (err < 0) { kfree(new); return err; } if (err == 0) _changed |= BSS_CHANGED_AP_PROBE_RESP; if (params->ftm_responder != -1) { link_conf->ftm_responder = params->ftm_responder; err = ieee80211_set_ftm_responder_params(sdata, params->lci, params->lci_len, params->civicloc, params->civicloc_len, link_conf); if (err < 0) { kfree(new); return err; } _changed |= BSS_CHANGED_FTM_RESPONDER; } rcu_assign_pointer(link->u.ap.beacon, new); sdata->u.ap.active = true; if (old) kfree_rcu(old, rcu_head); *changed |= _changed; return 0; } static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_ap_settings *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct beacon_data *old; struct ieee80211_sub_if_data *vlan; u64 changed = BSS_CHANGED_BEACON_INT | BSS_CHANGED_BEACON_ENABLED | BSS_CHANGED_BEACON | BSS_CHANGED_P2P_PS | BSS_CHANGED_TXPOWER | BSS_CHANGED_TWT; int i, err; int prev_beacon_int; unsigned int link_id = params->beacon.link_id; struct ieee80211_link_data *link; struct ieee80211_bss_conf *link_conf; link = sdata_dereference(sdata->link[link_id], sdata); if (!link) return -ENOLINK; link_conf = link->conf; old = sdata_dereference(link->u.ap.beacon, sdata); if (old) return -EALREADY; if (params->smps_mode != NL80211_SMPS_OFF) return -ENOTSUPP; link->smps_mode = IEEE80211_SMPS_OFF; link->needed_rx_chains = sdata->local->rx_chains; prev_beacon_int = link_conf->beacon_int; link_conf->beacon_int = params->beacon_interval; if (params->ht_cap) link_conf->ht_ldpc = params->ht_cap->cap_info & cpu_to_le16(IEEE80211_HT_CAP_LDPC_CODING); if (params->vht_cap) { link_conf->vht_ldpc = params->vht_cap->vht_cap_info & cpu_to_le32(IEEE80211_VHT_CAP_RXLDPC); link_conf->vht_su_beamformer = params->vht_cap->vht_cap_info & cpu_to_le32(IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE); link_conf->vht_su_beamformee = params->vht_cap->vht_cap_info & cpu_to_le32(IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE); link_conf->vht_mu_beamformer = params->vht_cap->vht_cap_info & cpu_to_le32(IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE); link_conf->vht_mu_beamformee = params->vht_cap->vht_cap_info & cpu_to_le32(IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE); } if (params->he_cap && params->he_oper) { link_conf->he_support = true; link_conf->htc_trig_based_pkt_ext = le32_get_bits(params->he_oper->he_oper_params, IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK); link_conf->frame_time_rts_th = le32_get_bits(params->he_oper->he_oper_params, IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK); changed |= BSS_CHANGED_HE_OBSS_PD; if (params->beacon.he_bss_color.enabled) changed |= BSS_CHANGED_HE_BSS_COLOR; } if (params->he_cap) { link_conf->he_ldpc = params->he_cap->phy_cap_info[1] & IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD; link_conf->he_su_beamformer = params->he_cap->phy_cap_info[3] & IEEE80211_HE_PHY_CAP3_SU_BEAMFORMER; link_conf->he_su_beamformee = params->he_cap->phy_cap_info[4] & IEEE80211_HE_PHY_CAP4_SU_BEAMFORMEE; link_conf->he_mu_beamformer = params->he_cap->phy_cap_info[4] & IEEE80211_HE_PHY_CAP4_MU_BEAMFORMER; link_conf->he_full_ul_mumimo = params->he_cap->phy_cap_info[2] & IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO; } if (params->eht_cap) { if (!link_conf->he_support) return -EOPNOTSUPP; link_conf->eht_support = true; link_conf->eht_puncturing = params->punct_bitmap; changed |= BSS_CHANGED_EHT_PUNCTURING; link_conf->eht_su_beamformer = params->eht_cap->fixed.phy_cap_info[0] & IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMER; link_conf->eht_su_beamformee = params->eht_cap->fixed.phy_cap_info[0] & IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMEE; link_conf->eht_mu_beamformer = params->eht_cap->fixed.phy_cap_info[7] & (IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_80MHZ | IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_160MHZ | IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_320MHZ); } else { link_conf->eht_su_beamformer = false; link_conf->eht_su_beamformee = false; link_conf->eht_mu_beamformer = false; } if (sdata->vif.type == NL80211_IFTYPE_AP && params->mbssid_config.tx_wdev) { err = ieee80211_set_ap_mbssid_options(sdata, params->mbssid_config, link_conf); if (err) return err; } mutex_lock(&local->mtx); err = ieee80211_link_use_channel(link, &params->chandef, IEEE80211_CHANCTX_SHARED); if (!err) ieee80211_link_copy_chanctx_to_vlans(link, false); mutex_unlock(&local->mtx); if (err) { link_conf->beacon_int = prev_beacon_int; return err; } /* * Apply control port protocol, this allows us to * not encrypt dynamic WEP control frames. */ sdata->control_port_protocol = params->crypto.control_port_ethertype; sdata->control_port_no_encrypt = params->crypto.control_port_no_encrypt; sdata->control_port_over_nl80211 = params->crypto.control_port_over_nl80211; sdata->control_port_no_preauth = params->crypto.control_port_no_preauth; list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) { vlan->control_port_protocol = params->crypto.control_port_ethertype; vlan->control_port_no_encrypt = params->crypto.control_port_no_encrypt; vlan->control_port_over_nl80211 = params->crypto.control_port_over_nl80211; vlan->control_port_no_preauth = params->crypto.control_port_no_preauth; } link_conf->dtim_period = params->dtim_period; link_conf->enable_beacon = true; link_conf->allow_p2p_go_ps = sdata->vif.p2p; link_conf->twt_responder = params->twt_responder; link_conf->he_obss_pd = params->he_obss_pd; link_conf->he_bss_color = params->beacon.he_bss_color; sdata->vif.cfg.s1g = params->chandef.chan->band == NL80211_BAND_S1GHZ; sdata->vif.cfg.ssid_len = params->ssid_len; if (params->ssid_len) memcpy(sdata->vif.cfg.ssid, params->ssid, params->ssid_len); link_conf->hidden_ssid = (params->hidden_ssid != NL80211_HIDDEN_SSID_NOT_IN_USE); memset(&link_conf->p2p_noa_attr, 0, sizeof(link_conf->p2p_noa_attr)); link_conf->p2p_noa_attr.oppps_ctwindow = params->p2p_ctwindow & IEEE80211_P2P_OPPPS_CTWINDOW_MASK; if (params->p2p_opp_ps) link_conf->p2p_noa_attr.oppps_ctwindow |= IEEE80211_P2P_OPPPS_ENABLE_BIT; sdata->beacon_rate_set = false; if (wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_BEACON_RATE_LEGACY)) { for (i = 0; i < NUM_NL80211_BANDS; i++) { sdata->beacon_rateidx_mask[i] = params->beacon_rate.control[i].legacy; if (sdata->beacon_rateidx_mask[i]) sdata->beacon_rate_set = true; } } if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) link_conf->beacon_tx_rate = params->beacon_rate; err = ieee80211_assign_beacon(sdata, link, &params->beacon, NULL, NULL, &changed); if (err < 0) goto error; if (params->fils_discovery.max_interval) { err = ieee80211_set_fils_discovery(sdata, &params->fils_discovery, link, link_conf); if (err < 0) goto error; changed |= BSS_CHANGED_FILS_DISCOVERY; } if (params->unsol_bcast_probe_resp.interval) { err = ieee80211_set_unsol_bcast_probe_resp(sdata, &params->unsol_bcast_probe_resp, link, link_conf); if (err < 0) goto error; changed |= BSS_CHANGED_UNSOL_BCAST_PROBE_RESP; } err = drv_start_ap(sdata->local, sdata, link_conf); if (err) { old = sdata_dereference(link->u.ap.beacon, sdata); if (old) kfree_rcu(old, rcu_head); RCU_INIT_POINTER(link->u.ap.beacon, NULL); sdata->u.ap.active = false; goto error; } ieee80211_recalc_dtim(local, sdata); ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_SSID); ieee80211_link_info_change_notify(sdata, link, changed); netif_carrier_on(dev); list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) netif_carrier_on(vlan->dev); return 0; error: mutex_lock(&local->mtx); ieee80211_link_release_channel(link); mutex_unlock(&local->mtx); return err; } static int ieee80211_change_beacon(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_beacon_data *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_link_data *link; struct beacon_data *old; int err; struct ieee80211_bss_conf *link_conf; u64 changed = 0; sdata_assert_lock(sdata); link = sdata_dereference(sdata->link[params->link_id], sdata); if (!link) return -ENOLINK; link_conf = link->conf; /* don't allow changing the beacon while a countdown is in place - offset * of channel switch counter may change */ if (link_conf->csa_active || link_conf->color_change_active) return -EBUSY; old = sdata_dereference(link->u.ap.beacon, sdata); if (!old) return -ENOENT; err = ieee80211_assign_beacon(sdata, link, params, NULL, NULL, &changed); if (err < 0) return err; if (params->he_bss_color_valid && params->he_bss_color.enabled != link_conf->he_bss_color.enabled) { link_conf->he_bss_color.enabled = params->he_bss_color.enabled; changed |= BSS_CHANGED_HE_BSS_COLOR; } ieee80211_link_info_change_notify(sdata, link, changed); return 0; } static void ieee80211_free_next_beacon(struct ieee80211_link_data *link) { if (!link->u.ap.next_beacon) return; kfree(link->u.ap.next_beacon->mbssid_ies); kfree(link->u.ap.next_beacon->rnr_ies); kfree(link->u.ap.next_beacon); link->u.ap.next_beacon = NULL; } static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_sub_if_data *vlan; struct ieee80211_local *local = sdata->local; struct beacon_data *old_beacon; struct probe_resp *old_probe_resp; struct fils_discovery_data *old_fils_discovery; struct unsol_bcast_probe_resp_data *old_unsol_bcast_probe_resp; struct cfg80211_chan_def chandef; struct ieee80211_link_data *link = sdata_dereference(sdata->link[link_id], sdata); struct ieee80211_bss_conf *link_conf = link->conf; sdata_assert_lock(sdata); old_beacon = sdata_dereference(link->u.ap.beacon, sdata); if (!old_beacon) return -ENOENT; old_probe_resp = sdata_dereference(link->u.ap.probe_resp, sdata); old_fils_discovery = sdata_dereference(link->u.ap.fils_discovery, sdata); old_unsol_bcast_probe_resp = sdata_dereference(link->u.ap.unsol_bcast_probe_resp, sdata); /* abort any running channel switch or color change */ mutex_lock(&local->mtx); link_conf->csa_active = false; link_conf->color_change_active = false; if (link->csa_block_tx) { ieee80211_wake_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_CSA); link->csa_block_tx = false; } mutex_unlock(&local->mtx); ieee80211_free_next_beacon(link); /* turn off carrier for this interface and dependent VLANs */ list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) netif_carrier_off(vlan->dev); netif_carrier_off(dev); /* remove beacon and probe response */ sdata->u.ap.active = false; RCU_INIT_POINTER(link->u.ap.beacon, NULL); RCU_INIT_POINTER(link->u.ap.probe_resp, NULL); RCU_INIT_POINTER(link->u.ap.fils_discovery, NULL); RCU_INIT_POINTER(link->u.ap.unsol_bcast_probe_resp, NULL); kfree_rcu(old_beacon, rcu_head); if (old_probe_resp) kfree_rcu(old_probe_resp, rcu_head); if (old_fils_discovery) kfree_rcu(old_fils_discovery, rcu_head); if (old_unsol_bcast_probe_resp) kfree_rcu(old_unsol_bcast_probe_resp, rcu_head); kfree(link_conf->ftmr_params); link_conf->ftmr_params = NULL; sdata->vif.mbssid_tx_vif = NULL; link_conf->bssid_index = 0; link_conf->nontransmitted = false; link_conf->ema_ap = false; link_conf->bssid_indicator = 0; __sta_info_flush(sdata, true); ieee80211_free_keys(sdata, true); link_conf->enable_beacon = false; sdata->beacon_rate_set = false; sdata->vif.cfg.ssid_len = 0; clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state); ieee80211_link_info_change_notify(sdata, link, BSS_CHANGED_BEACON_ENABLED); if (sdata->wdev.cac_started) { chandef = link_conf->chandef; cancel_delayed_work_sync(&link->dfs_cac_timer_work); cfg80211_cac_event(sdata->dev, &chandef, NL80211_RADAR_CAC_ABORTED, GFP_KERNEL); } drv_stop_ap(sdata->local, sdata, link_conf); /* free all potentially still buffered bcast frames */ local->total_ps_buffered -= skb_queue_len(&sdata->u.ap.ps.bc_buf); ieee80211_purge_tx_queue(&local->hw, &sdata->u.ap.ps.bc_buf); mutex_lock(&local->mtx); ieee80211_link_copy_chanctx_to_vlans(link, true); ieee80211_link_release_channel(link); mutex_unlock(&local->mtx); return 0; } static int sta_apply_auth_flags(struct ieee80211_local *local, struct sta_info *sta, u32 mask, u32 set) { int ret; if (mask & BIT(NL80211_STA_FLAG_AUTHENTICATED) && set & BIT(NL80211_STA_FLAG_AUTHENTICATED) && !test_sta_flag(sta, WLAN_STA_AUTH)) { ret = sta_info_move_state(sta, IEEE80211_STA_AUTH); if (ret) return ret; } if (mask & BIT(NL80211_STA_FLAG_ASSOCIATED) && set & BIT(NL80211_STA_FLAG_ASSOCIATED) && !test_sta_flag(sta, WLAN_STA_ASSOC)) { /* * When peer becomes associated, init rate control as * well. Some drivers require rate control initialized * before drv_sta_state() is called. */ if (!test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) rate_control_rate_init(sta); ret = sta_info_move_state(sta, IEEE80211_STA_ASSOC); if (ret) return ret; } if (mask & BIT(NL80211_STA_FLAG_AUTHORIZED)) { if (set & BIT(NL80211_STA_FLAG_AUTHORIZED)) ret = sta_info_move_state(sta, IEEE80211_STA_AUTHORIZED); else if (test_sta_flag(sta, WLAN_STA_AUTHORIZED)) ret = sta_info_move_state(sta, IEEE80211_STA_ASSOC); else ret = 0; if (ret) return ret; } if (mask & BIT(NL80211_STA_FLAG_ASSOCIATED) && !(set & BIT(NL80211_STA_FLAG_ASSOCIATED)) && test_sta_flag(sta, WLAN_STA_ASSOC)) { ret = sta_info_move_state(sta, IEEE80211_STA_AUTH); if (ret) return ret; } if (mask & BIT(NL80211_STA_FLAG_AUTHENTICATED) && !(set & BIT(NL80211_STA_FLAG_AUTHENTICATED)) && test_sta_flag(sta, WLAN_STA_AUTH)) { ret = sta_info_move_state(sta, IEEE80211_STA_NONE); if (ret) return ret; } return 0; } static void sta_apply_mesh_params(struct ieee80211_local *local, struct sta_info *sta, struct station_parameters *params) { #ifdef CONFIG_MAC80211_MESH struct ieee80211_sub_if_data *sdata = sta->sdata; u64 changed = 0; if (params->sta_modify_mask & STATION_PARAM_APPLY_PLINK_STATE) { switch (params->plink_state) { case NL80211_PLINK_ESTAB: if (sta->mesh->plink_state != NL80211_PLINK_ESTAB) changed = mesh_plink_inc_estab_count(sdata); sta->mesh->plink_state = params->plink_state; sta->mesh->aid = params->peer_aid; ieee80211_mps_sta_status_update(sta); changed |= ieee80211_mps_set_sta_local_pm(sta, sdata->u.mesh.mshcfg.power_mode); ewma_mesh_tx_rate_avg_init(&sta->mesh->tx_rate_avg); /* init at low value */ ewma_mesh_tx_rate_avg_add(&sta->mesh->tx_rate_avg, 10); break; case NL80211_PLINK_LISTEN: case NL80211_PLINK_BLOCKED: case NL80211_PLINK_OPN_SNT: case NL80211_PLINK_OPN_RCVD: case NL80211_PLINK_CNF_RCVD: case NL80211_PLINK_HOLDING: if (sta->mesh->plink_state == NL80211_PLINK_ESTAB) changed = mesh_plink_dec_estab_count(sdata); sta->mesh->plink_state = params->plink_state; ieee80211_mps_sta_status_update(sta); changed |= ieee80211_mps_set_sta_local_pm(sta, NL80211_MESH_POWER_UNKNOWN); break; default: /* nothing */ break; } } switch (params->plink_action) { case NL80211_PLINK_ACTION_NO_ACTION: /* nothing */ break; case NL80211_PLINK_ACTION_OPEN: changed |= mesh_plink_open(sta); break; case NL80211_PLINK_ACTION_BLOCK: changed |= mesh_plink_block(sta); break; } if (params->local_pm) changed |= ieee80211_mps_set_sta_local_pm(sta, params->local_pm); ieee80211_mbss_info_change_notify(sdata, changed); #endif } static int sta_link_apply_parameters(struct ieee80211_local *local, struct sta_info *sta, bool new_link, struct link_station_parameters *params) { int ret = 0; struct ieee80211_supported_band *sband; struct ieee80211_sub_if_data *sdata = sta->sdata; u32 link_id = params->link_id < 0 ? 0 : params->link_id; struct ieee80211_link_data *link = sdata_dereference(sdata->link[link_id], sdata); struct link_sta_info *link_sta = rcu_dereference_protected(sta->link[link_id], lockdep_is_held(&local->sta_mtx)); /* * If there are no changes, then accept a link that doesn't exist, * unless it's a new link. */ if (params->link_id < 0 && !new_link && !params->link_mac && !params->txpwr_set && !params->supported_rates_len && !params->ht_capa && !params->vht_capa && !params->he_capa && !params->eht_capa && !params->opmode_notif_used) return 0; if (!link || !link_sta) return -EINVAL; sband = ieee80211_get_link_sband(link); if (!sband) return -EINVAL; if (params->link_mac) { if (new_link) { memcpy(link_sta->addr, params->link_mac, ETH_ALEN); memcpy(link_sta->pub->addr, params->link_mac, ETH_ALEN); } else if (!ether_addr_equal(link_sta->addr, params->link_mac)) { return -EINVAL; } } else if (new_link) { return -EINVAL; } if (params->txpwr_set) { link_sta->pub->txpwr.type = params->txpwr.type; if (params->txpwr.type == NL80211_TX_POWER_LIMITED) link_sta->pub->txpwr.power = params->txpwr.power; ret = drv_sta_set_txpwr(local, sdata, sta); if (ret) return ret; } if (params->supported_rates && params->supported_rates_len) { ieee80211_parse_bitrates(link->conf->chandef.width, sband, params->supported_rates, params->supported_rates_len, &link_sta->pub->supp_rates[sband->band]); } if (params->ht_capa) ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband, params->ht_capa, link_sta); /* VHT can override some HT caps such as the A-MSDU max length */ if (params->vht_capa) ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, params->vht_capa, NULL, link_sta); if (params->he_capa) ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, (void *)params->he_capa, params->he_capa_len, (void *)params->he_6ghz_capa, link_sta); if (params->he_capa && params->eht_capa) ieee80211_eht_cap_ie_to_sta_eht_cap(sdata, sband, (u8 *)params->he_capa, params->he_capa_len, params->eht_capa, params->eht_capa_len, link_sta); if (params->opmode_notif_used) { /* returned value is only needed for rc update, but the * rc isn't initialized here yet, so ignore it */ __ieee80211_vht_handle_opmode(sdata, link_sta, params->opmode_notif, sband->band); } return ret; } static int sta_apply_parameters(struct ieee80211_local *local, struct sta_info *sta, struct station_parameters *params) { struct ieee80211_sub_if_data *sdata = sta->sdata; u32 mask, set; int ret = 0; mask = params->sta_flags_mask; set = params->sta_flags_set; if (ieee80211_vif_is_mesh(&sdata->vif)) { /* * In mesh mode, ASSOCIATED isn't part of the nl80211 * API but must follow AUTHENTICATED for driver state. */ if (mask & BIT(NL80211_STA_FLAG_AUTHENTICATED)) mask |= BIT(NL80211_STA_FLAG_ASSOCIATED); if (set & BIT(NL80211_STA_FLAG_AUTHENTICATED)) set |= BIT(NL80211_STA_FLAG_ASSOCIATED); } else if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) { /* * TDLS -- everything follows authorized, but * only becoming authorized is possible, not * going back */ if (set & BIT(NL80211_STA_FLAG_AUTHORIZED)) { set |= BIT(NL80211_STA_FLAG_AUTHENTICATED) | BIT(NL80211_STA_FLAG_ASSOCIATED); mask |= BIT(NL80211_STA_FLAG_AUTHENTICATED) | BIT(NL80211_STA_FLAG_ASSOCIATED); } } if (mask & BIT(NL80211_STA_FLAG_WME) && local->hw.queues >= IEEE80211_NUM_ACS) sta->sta.wme = set & BIT(NL80211_STA_FLAG_WME); /* auth flags will be set later for TDLS, * and for unassociated stations that move to associated */ if (!test_sta_flag(sta, WLAN_STA_TDLS_PEER) && !((mask & BIT(NL80211_STA_FLAG_ASSOCIATED)) && (set & BIT(NL80211_STA_FLAG_ASSOCIATED)))) { ret = sta_apply_auth_flags(local, sta, mask, set); if (ret) return ret; } if (mask & BIT(NL80211_STA_FLAG_SHORT_PREAMBLE)) { if (set & BIT(NL80211_STA_FLAG_SHORT_PREAMBLE)) set_sta_flag(sta, WLAN_STA_SHORT_PREAMBLE); else clear_sta_flag(sta, WLAN_STA_SHORT_PREAMBLE); } if (mask & BIT(NL80211_STA_FLAG_MFP)) { sta->sta.mfp = !!(set & BIT(NL80211_STA_FLAG_MFP)); if (set & BIT(NL80211_STA_FLAG_MFP)) set_sta_flag(sta, WLAN_STA_MFP); else clear_sta_flag(sta, WLAN_STA_MFP); } if (mask & BIT(NL80211_STA_FLAG_TDLS_PEER)) { if (set & BIT(NL80211_STA_FLAG_TDLS_PEER)) set_sta_flag(sta, WLAN_STA_TDLS_PEER); else clear_sta_flag(sta, WLAN_STA_TDLS_PEER); } /* mark TDLS channel switch support, if the AP allows it */ if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) && !sdata->deflink.u.mgd.tdls_chan_switch_prohibited && params->ext_capab_len >= 4 && params->ext_capab[3] & WLAN_EXT_CAPA4_TDLS_CHAN_SWITCH) set_sta_flag(sta, WLAN_STA_TDLS_CHAN_SWITCH); if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) && !sdata->u.mgd.tdls_wider_bw_prohibited && ieee80211_hw_check(&local->hw, TDLS_WIDER_BW) && params->ext_capab_len >= 8 && params->ext_capab[7] & WLAN_EXT_CAPA8_TDLS_WIDE_BW_ENABLED) set_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW); if (params->sta_modify_mask & STATION_PARAM_APPLY_UAPSD) { sta->sta.uapsd_queues = params->uapsd_queues; sta->sta.max_sp = params->max_sp; } ieee80211_sta_set_max_amsdu_subframes(sta, params->ext_capab, params->ext_capab_len); /* * cfg80211 validates this (1-2007) and allows setting the AID * only when creating a new station entry */ if (params->aid) sta->sta.aid = params->aid; /* * Some of the following updates would be racy if called on an * existing station, via ieee80211_change_station(). However, * all such changes are rejected by cfg80211 except for updates * changing the supported rates on an existing but not yet used * TDLS peer. */ if (params->listen_interval >= 0) sta->listen_interval = params->listen_interval; ret = sta_link_apply_parameters(local, sta, false, &params->link_sta_params); if (ret) return ret; if (params->support_p2p_ps >= 0) sta->sta.support_p2p_ps = params->support_p2p_ps; if (ieee80211_vif_is_mesh(&sdata->vif)) sta_apply_mesh_params(local, sta, params); if (params->airtime_weight) sta->airtime_weight = params->airtime_weight; /* set the STA state after all sta info from usermode has been set */ if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) || set & BIT(NL80211_STA_FLAG_ASSOCIATED)) { ret = sta_apply_auth_flags(local, sta, mask, set); if (ret) return ret; } /* Mark the STA as MLO if MLD MAC address is available */ if (params->link_sta_params.mld_mac) sta->sta.mlo = true; return 0; } static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev, const u8 *mac, struct station_parameters *params) { struct ieee80211_local *local = wiphy_priv(wiphy); struct sta_info *sta; struct ieee80211_sub_if_data *sdata; int err; if (params->vlan) { sdata = IEEE80211_DEV_TO_SUB_IF(params->vlan); if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN && sdata->vif.type != NL80211_IFTYPE_AP) return -EINVAL; } else sdata = IEEE80211_DEV_TO_SUB_IF(dev); if (ether_addr_equal(mac, sdata->vif.addr)) return -EINVAL; if (!is_valid_ether_addr(mac)) return -EINVAL; if (params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER) && sdata->vif.type == NL80211_IFTYPE_STATION && !sdata->u.mgd.associated) return -EINVAL; /* * If we have a link ID, it can be a non-MLO station on an AP MLD, * but we need to have a link_mac in that case as well, so use the * STA's MAC address in that case. */ if (params->link_sta_params.link_id >= 0) sta = sta_info_alloc_with_link(sdata, mac, params->link_sta_params.link_id, params->link_sta_params.link_mac ?: mac, GFP_KERNEL); else sta = sta_info_alloc(sdata, mac, GFP_KERNEL); if (!sta) return -ENOMEM; if (params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER)) sta->sta.tdls = true; /* Though the mutex is not needed here (since the station is not * visible yet), sta_apply_parameters (and inner functions) require * the mutex due to other paths. */ mutex_lock(&local->sta_mtx); err = sta_apply_parameters(local, sta, params); mutex_unlock(&local->sta_mtx); if (err) { sta_info_free(local, sta); return err; } /* * for TDLS and for unassociated station, rate control should be * initialized only when rates are known and station is marked * authorized/associated */ if (!test_sta_flag(sta, WLAN_STA_TDLS_PEER) && test_sta_flag(sta, WLAN_STA_ASSOC)) rate_control_rate_init(sta); return sta_info_insert(sta); } static int ieee80211_del_station(struct wiphy *wiphy, struct net_device *dev, struct station_del_parameters *params) { struct ieee80211_sub_if_data *sdata; sdata = IEEE80211_DEV_TO_SUB_IF(dev); if (params->mac) return sta_info_destroy_addr_bss(sdata, params->mac); sta_info_flush(sdata); return 0; } static int ieee80211_change_station(struct wiphy *wiphy, struct net_device *dev, const u8 *mac, struct station_parameters *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = wiphy_priv(wiphy); struct sta_info *sta; struct ieee80211_sub_if_data *vlansdata; enum cfg80211_station_type statype; int err; mutex_lock(&local->sta_mtx); sta = sta_info_get_bss(sdata, mac); if (!sta) { err = -ENOENT; goto out_err; } switch (sdata->vif.type) { case NL80211_IFTYPE_MESH_POINT: if (sdata->u.mesh.user_mpm) statype = CFG80211_STA_MESH_PEER_USER; else statype = CFG80211_STA_MESH_PEER_KERNEL; break; case NL80211_IFTYPE_ADHOC: statype = CFG80211_STA_IBSS; break; case NL80211_IFTYPE_STATION: if (!test_sta_flag(sta, WLAN_STA_TDLS_PEER)) { statype = CFG80211_STA_AP_STA; break; } if (test_sta_flag(sta, WLAN_STA_AUTHORIZED)) statype = CFG80211_STA_TDLS_PEER_ACTIVE; else statype = CFG80211_STA_TDLS_PEER_SETUP; break; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: if (test_sta_flag(sta, WLAN_STA_ASSOC)) statype = CFG80211_STA_AP_CLIENT; else statype = CFG80211_STA_AP_CLIENT_UNASSOC; break; default: err = -EOPNOTSUPP; goto out_err; } err = cfg80211_check_station_change(wiphy, params, statype); if (err) goto out_err; if (params->vlan && params->vlan != sta->sdata->dev) { vlansdata = IEEE80211_DEV_TO_SUB_IF(params->vlan); if (params->vlan->ieee80211_ptr->use_4addr) { if (vlansdata->u.vlan.sta) { err = -EBUSY; goto out_err; } rcu_assign_pointer(vlansdata->u.vlan.sta, sta); __ieee80211_check_fast_rx_iface(vlansdata); drv_sta_set_4addr(local, sta->sdata, &sta->sta, true); } if (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN && sta->sdata->u.vlan.sta) { ieee80211_clear_fast_rx(sta); RCU_INIT_POINTER(sta->sdata->u.vlan.sta, NULL); } if (test_sta_flag(sta, WLAN_STA_AUTHORIZED)) ieee80211_vif_dec_num_mcast(sta->sdata); sta->sdata = vlansdata; ieee80211_check_fast_xmit(sta); if (test_sta_flag(sta, WLAN_STA_AUTHORIZED)) { ieee80211_vif_inc_num_mcast(sta->sdata); cfg80211_send_layer2_update(sta->sdata->dev, sta->sta.addr); } } /* we use sta_info_get_bss() so this might be different */ if (sdata != sta->sdata) { mutex_lock_nested(&sta->sdata->wdev.mtx, 1); err = sta_apply_parameters(local, sta, params); mutex_unlock(&sta->sdata->wdev.mtx); } else { err = sta_apply_parameters(local, sta, params); } if (err) goto out_err; mutex_unlock(&local->sta_mtx); if (sdata->vif.type == NL80211_IFTYPE_STATION && params->sta_flags_mask & BIT(NL80211_STA_FLAG_AUTHORIZED)) { ieee80211_recalc_ps(local); ieee80211_recalc_ps_vif(sdata); } return 0; out_err: mutex_unlock(&local->sta_mtx); return err; } #ifdef CONFIG_MAC80211_MESH static int ieee80211_add_mpath(struct wiphy *wiphy, struct net_device *dev, const u8 *dst, const u8 *next_hop) { struct ieee80211_sub_if_data *sdata; struct mesh_path *mpath; struct sta_info *sta; sdata = IEEE80211_DEV_TO_SUB_IF(dev); rcu_read_lock(); sta = sta_info_get(sdata, next_hop); if (!sta) { rcu_read_unlock(); return -ENOENT; } mpath = mesh_path_add(sdata, dst); if (IS_ERR(mpath)) { rcu_read_unlock(); return PTR_ERR(mpath); } mesh_path_fix_nexthop(mpath, sta); rcu_read_unlock(); return 0; } static int ieee80211_del_mpath(struct wiphy *wiphy, struct net_device *dev, const u8 *dst) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); if (dst) return mesh_path_del(sdata, dst); mesh_path_flush_by_iface(sdata); return 0; } static int ieee80211_change_mpath(struct wiphy *wiphy, struct net_device *dev, const u8 *dst, const u8 *next_hop) { struct ieee80211_sub_if_data *sdata; struct mesh_path *mpath; struct sta_info *sta; sdata = IEEE80211_DEV_TO_SUB_IF(dev); rcu_read_lock(); sta = sta_info_get(sdata, next_hop); if (!sta) { rcu_read_unlock(); return -ENOENT; } mpath = mesh_path_lookup(sdata, dst); if (!mpath) { rcu_read_unlock(); return -ENOENT; } mesh_path_fix_nexthop(mpath, sta); rcu_read_unlock(); return 0; } static void mpath_set_pinfo(struct mesh_path *mpath, u8 *next_hop, struct mpath_info *pinfo) { struct sta_info *next_hop_sta = rcu_dereference(mpath->next_hop); if (next_hop_sta) memcpy(next_hop, next_hop_sta->sta.addr, ETH_ALEN); else eth_zero_addr(next_hop); memset(pinfo, 0, sizeof(*pinfo)); pinfo->generation = mpath->sdata->u.mesh.mesh_paths_generation; pinfo->filled = MPATH_INFO_FRAME_QLEN | MPATH_INFO_SN | MPATH_INFO_METRIC | MPATH_INFO_EXPTIME | MPATH_INFO_DISCOVERY_TIMEOUT | MPATH_INFO_DISCOVERY_RETRIES | MPATH_INFO_FLAGS | MPATH_INFO_HOP_COUNT | MPATH_INFO_PATH_CHANGE; pinfo->frame_qlen = mpath->frame_queue.qlen; pinfo->sn = mpath->sn; pinfo->metric = mpath->metric; if (time_before(jiffies, mpath->exp_time)) pinfo->exptime = jiffies_to_msecs(mpath->exp_time - jiffies); pinfo->discovery_timeout = jiffies_to_msecs(mpath->discovery_timeout); pinfo->discovery_retries = mpath->discovery_retries; if (mpath->flags & MESH_PATH_ACTIVE) pinfo->flags |= NL80211_MPATH_FLAG_ACTIVE; if (mpath->flags & MESH_PATH_RESOLVING) pinfo->flags |= NL80211_MPATH_FLAG_RESOLVING; if (mpath->flags & MESH_PATH_SN_VALID) pinfo->flags |= NL80211_MPATH_FLAG_SN_VALID; if (mpath->flags & MESH_PATH_FIXED) pinfo->flags |= NL80211_MPATH_FLAG_FIXED; if (mpath->flags & MESH_PATH_RESOLVED) pinfo->flags |= NL80211_MPATH_FLAG_RESOLVED; pinfo->hop_count = mpath->hop_count; pinfo->path_change_count = mpath->path_change_count; } static int ieee80211_get_mpath(struct wiphy *wiphy, struct net_device *dev, u8 *dst, u8 *next_hop, struct mpath_info *pinfo) { struct ieee80211_sub_if_data *sdata; struct mesh_path *mpath; sdata = IEEE80211_DEV_TO_SUB_IF(dev); rcu_read_lock(); mpath = mesh_path_lookup(sdata, dst); if (!mpath) { rcu_read_unlock(); return -ENOENT; } memcpy(dst, mpath->dst, ETH_ALEN); mpath_set_pinfo(mpath, next_hop, pinfo); rcu_read_unlock(); return 0; } static int ieee80211_dump_mpath(struct wiphy *wiphy, struct net_device *dev, int idx, u8 *dst, u8 *next_hop, struct mpath_info *pinfo) { struct ieee80211_sub_if_data *sdata; struct mesh_path *mpath; sdata = IEEE80211_DEV_TO_SUB_IF(dev); rcu_read_lock(); mpath = mesh_path_lookup_by_idx(sdata, idx); if (!mpath) { rcu_read_unlock(); return -ENOENT; } memcpy(dst, mpath->dst, ETH_ALEN); mpath_set_pinfo(mpath, next_hop, pinfo); rcu_read_unlock(); return 0; } static void mpp_set_pinfo(struct mesh_path *mpath, u8 *mpp, struct mpath_info *pinfo) { memset(pinfo, 0, sizeof(*pinfo)); memcpy(mpp, mpath->mpp, ETH_ALEN); pinfo->generation = mpath->sdata->u.mesh.mpp_paths_generation; } static int ieee80211_get_mpp(struct wiphy *wiphy, struct net_device *dev, u8 *dst, u8 *mpp, struct mpath_info *pinfo) { struct ieee80211_sub_if_data *sdata; struct mesh_path *mpath; sdata = IEEE80211_DEV_TO_SUB_IF(dev); rcu_read_lock(); mpath = mpp_path_lookup(sdata, dst); if (!mpath) { rcu_read_unlock(); return -ENOENT; } memcpy(dst, mpath->dst, ETH_ALEN); mpp_set_pinfo(mpath, mpp, pinfo); rcu_read_unlock(); return 0; } static int ieee80211_dump_mpp(struct wiphy *wiphy, struct net_device *dev, int idx, u8 *dst, u8 *mpp, struct mpath_info *pinfo) { struct ieee80211_sub_if_data *sdata; struct mesh_path *mpath; sdata = IEEE80211_DEV_TO_SUB_IF(dev); rcu_read_lock(); mpath = mpp_path_lookup_by_idx(sdata, idx); if (!mpath) { rcu_read_unlock(); return -ENOENT; } memcpy(dst, mpath->dst, ETH_ALEN); mpp_set_pinfo(mpath, mpp, pinfo); rcu_read_unlock(); return 0; } static int ieee80211_get_mesh_config(struct wiphy *wiphy, struct net_device *dev, struct mesh_config *conf) { struct ieee80211_sub_if_data *sdata; sdata = IEEE80211_DEV_TO_SUB_IF(dev); memcpy(conf, &(sdata->u.mesh.mshcfg), sizeof(struct mesh_config)); return 0; } static inline bool _chg_mesh_attr(enum nl80211_meshconf_params parm, u32 mask) { return (mask >> (parm-1)) & 0x1; } static int copy_mesh_setup(struct ieee80211_if_mesh *ifmsh, const struct mesh_setup *setup) { u8 *new_ie; struct ieee80211_sub_if_data *sdata = container_of(ifmsh, struct ieee80211_sub_if_data, u.mesh); int i; /* allocate information elements */ new_ie = NULL; if (setup->ie_len) { new_ie = kmemdup(setup->ie, setup->ie_len, GFP_KERNEL); if (!new_ie) return -ENOMEM; } ifmsh->ie_len = setup->ie_len; ifmsh->ie = new_ie; /* now copy the rest of the setup parameters */ ifmsh->mesh_id_len = setup->mesh_id_len; memcpy(ifmsh->mesh_id, setup->mesh_id, ifmsh->mesh_id_len); ifmsh->mesh_sp_id = setup->sync_method; ifmsh->mesh_pp_id = setup->path_sel_proto; ifmsh->mesh_pm_id = setup->path_metric; ifmsh->user_mpm = setup->user_mpm; ifmsh->mesh_auth_id = setup->auth_id; ifmsh->security = IEEE80211_MESH_SEC_NONE; ifmsh->userspace_handles_dfs = setup->userspace_handles_dfs; if (setup->is_authenticated) ifmsh->security |= IEEE80211_MESH_SEC_AUTHED; if (setup->is_secure) ifmsh->security |= IEEE80211_MESH_SEC_SECURED; /* mcast rate setting in Mesh Node */ memcpy(sdata->vif.bss_conf.mcast_rate, setup->mcast_rate, sizeof(setup->mcast_rate)); sdata->vif.bss_conf.basic_rates = setup->basic_rates; sdata->vif.bss_conf.beacon_int = setup->beacon_interval; sdata->vif.bss_conf.dtim_period = setup->dtim_period; sdata->beacon_rate_set = false; if (wiphy_ext_feature_isset(sdata->local->hw.wiphy, NL80211_EXT_FEATURE_BEACON_RATE_LEGACY)) { for (i = 0; i < NUM_NL80211_BANDS; i++) { sdata->beacon_rateidx_mask[i] = setup->beacon_rate.control[i].legacy; if (sdata->beacon_rateidx_mask[i]) sdata->beacon_rate_set = true; } } return 0; } static int ieee80211_update_mesh_config(struct wiphy *wiphy, struct net_device *dev, u32 mask, const struct mesh_config *nconf) { struct mesh_config *conf; struct ieee80211_sub_if_data *sdata; struct ieee80211_if_mesh *ifmsh; sdata = IEEE80211_DEV_TO_SUB_IF(dev); ifmsh = &sdata->u.mesh; /* Set the config options which we are interested in setting */ conf = &(sdata->u.mesh.mshcfg); if (_chg_mesh_attr(NL80211_MESHCONF_RETRY_TIMEOUT, mask)) conf->dot11MeshRetryTimeout = nconf->dot11MeshRetryTimeout; if (_chg_mesh_attr(NL80211_MESHCONF_CONFIRM_TIMEOUT, mask)) conf->dot11MeshConfirmTimeout = nconf->dot11MeshConfirmTimeout; if (_chg_mesh_attr(NL80211_MESHCONF_HOLDING_TIMEOUT, mask)) conf->dot11MeshHoldingTimeout = nconf->dot11MeshHoldingTimeout; if (_chg_mesh_attr(NL80211_MESHCONF_MAX_PEER_LINKS, mask)) conf->dot11MeshMaxPeerLinks = nconf->dot11MeshMaxPeerLinks; if (_chg_mesh_attr(NL80211_MESHCONF_MAX_RETRIES, mask)) conf->dot11MeshMaxRetries = nconf->dot11MeshMaxRetries; if (_chg_mesh_attr(NL80211_MESHCONF_TTL, mask)) conf->dot11MeshTTL = nconf->dot11MeshTTL; if (_chg_mesh_attr(NL80211_MESHCONF_ELEMENT_TTL, mask)) conf->element_ttl = nconf->element_ttl; if (_chg_mesh_attr(NL80211_MESHCONF_AUTO_OPEN_PLINKS, mask)) { if (ifmsh->user_mpm) return -EBUSY; conf->auto_open_plinks = nconf->auto_open_plinks; } if (_chg_mesh_attr(NL80211_MESHCONF_SYNC_OFFSET_MAX_NEIGHBOR, mask)) conf->dot11MeshNbrOffsetMaxNeighbor = nconf->dot11MeshNbrOffsetMaxNeighbor; if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES, mask)) conf->dot11MeshHWMPmaxPREQretries = nconf->dot11MeshHWMPmaxPREQretries; if (_chg_mesh_attr(NL80211_MESHCONF_PATH_REFRESH_TIME, mask)) conf->path_refresh_time = nconf->path_refresh_time; if (_chg_mesh_attr(NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT, mask)) conf->min_discovery_timeout = nconf->min_discovery_timeout; if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT, mask)) conf->dot11MeshHWMPactivePathTimeout = nconf->dot11MeshHWMPactivePathTimeout; if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL, mask)) conf->dot11MeshHWMPpreqMinInterval = nconf->dot11MeshHWMPpreqMinInterval; if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_PERR_MIN_INTERVAL, mask)) conf->dot11MeshHWMPperrMinInterval = nconf->dot11MeshHWMPperrMinInterval; if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME, mask)) conf->dot11MeshHWMPnetDiameterTraversalTime = nconf->dot11MeshHWMPnetDiameterTraversalTime; if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_ROOTMODE, mask)) { conf->dot11MeshHWMPRootMode = nconf->dot11MeshHWMPRootMode; ieee80211_mesh_root_setup(ifmsh); } if (_chg_mesh_attr(NL80211_MESHCONF_GATE_ANNOUNCEMENTS, mask)) { /* our current gate announcement implementation rides on root * announcements, so require this ifmsh to also be a root node * */ if (nconf->dot11MeshGateAnnouncementProtocol && !(conf->dot11MeshHWMPRootMode > IEEE80211_ROOTMODE_ROOT)) { conf->dot11MeshHWMPRootMode = IEEE80211_PROACTIVE_RANN; ieee80211_mesh_root_setup(ifmsh); } conf->dot11MeshGateAnnouncementProtocol = nconf->dot11MeshGateAnnouncementProtocol; } if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_RANN_INTERVAL, mask)) conf->dot11MeshHWMPRannInterval = nconf->dot11MeshHWMPRannInterval; if (_chg_mesh_attr(NL80211_MESHCONF_FORWARDING, mask)) conf->dot11MeshForwarding = nconf->dot11MeshForwarding; if (_chg_mesh_attr(NL80211_MESHCONF_RSSI_THRESHOLD, mask)) { /* our RSSI threshold implementation is supported only for * devices that report signal in dBm. */ if (!ieee80211_hw_check(&sdata->local->hw, SIGNAL_DBM)) return -ENOTSUPP; conf->rssi_threshold = nconf->rssi_threshold; } if (_chg_mesh_attr(NL80211_MESHCONF_HT_OPMODE, mask)) { conf->ht_opmode = nconf->ht_opmode; sdata->vif.bss_conf.ht_operation_mode = nconf->ht_opmode; ieee80211_link_info_change_notify(sdata, &sdata->deflink, BSS_CHANGED_HT); } if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_PATH_TO_ROOT_TIMEOUT, mask)) conf->dot11MeshHWMPactivePathToRootTimeout = nconf->dot11MeshHWMPactivePathToRootTimeout; if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_ROOT_INTERVAL, mask)) conf->dot11MeshHWMProotInterval = nconf->dot11MeshHWMProotInterval; if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_CONFIRMATION_INTERVAL, mask)) conf->dot11MeshHWMPconfirmationInterval = nconf->dot11MeshHWMPconfirmationInterval; if (_chg_mesh_attr(NL80211_MESHCONF_POWER_MODE, mask)) { conf->power_mode = nconf->power_mode; ieee80211_mps_local_status_update(sdata); } if (_chg_mesh_attr(NL80211_MESHCONF_AWAKE_WINDOW, mask)) conf->dot11MeshAwakeWindowDuration = nconf->dot11MeshAwakeWindowDuration; if (_chg_mesh_attr(NL80211_MESHCONF_PLINK_TIMEOUT, mask)) conf->plink_timeout = nconf->plink_timeout; if (_chg_mesh_attr(NL80211_MESHCONF_CONNECTED_TO_GATE, mask)) conf->dot11MeshConnectedToMeshGate = nconf->dot11MeshConnectedToMeshGate; if (_chg_mesh_attr(NL80211_MESHCONF_NOLEARN, mask)) conf->dot11MeshNolearn = nconf->dot11MeshNolearn; if (_chg_mesh_attr(NL80211_MESHCONF_CONNECTED_TO_AS, mask)) conf->dot11MeshConnectedToAuthServer = nconf->dot11MeshConnectedToAuthServer; ieee80211_mbss_info_change_notify(sdata, BSS_CHANGED_BEACON); return 0; } static int ieee80211_join_mesh(struct wiphy *wiphy, struct net_device *dev, const struct mesh_config *conf, const struct mesh_setup *setup) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; int err; memcpy(&ifmsh->mshcfg, conf, sizeof(struct mesh_config)); err = copy_mesh_setup(ifmsh, setup); if (err) return err; sdata->control_port_over_nl80211 = setup->control_port_over_nl80211; /* can mesh use other SMPS modes? */ sdata->deflink.smps_mode = IEEE80211_SMPS_OFF; sdata->deflink.needed_rx_chains = sdata->local->rx_chains; mutex_lock(&sdata->local->mtx); err = ieee80211_link_use_channel(&sdata->deflink, &setup->chandef, IEEE80211_CHANCTX_SHARED); mutex_unlock(&sdata->local->mtx); if (err) return err; return ieee80211_start_mesh(sdata); } static int ieee80211_leave_mesh(struct wiphy *wiphy, struct net_device *dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); ieee80211_stop_mesh(sdata); mutex_lock(&sdata->local->mtx); ieee80211_link_release_channel(&sdata->deflink); kfree(sdata->u.mesh.ie); mutex_unlock(&sdata->local->mtx); return 0; } #endif static int ieee80211_change_bss(struct wiphy *wiphy, struct net_device *dev, struct bss_parameters *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_link_data *link; struct ieee80211_supported_band *sband; u64 changed = 0; link = ieee80211_link_or_deflink(sdata, params->link_id, true); if (IS_ERR(link)) return PTR_ERR(link); if (!sdata_dereference(link->u.ap.beacon, sdata)) return -ENOENT; sband = ieee80211_get_link_sband(link); if (!sband) return -EINVAL; if (params->basic_rates) { if (!ieee80211_parse_bitrates(link->conf->chandef.width, wiphy->bands[sband->band], params->basic_rates, params->basic_rates_len, &link->conf->basic_rates)) return -EINVAL; changed |= BSS_CHANGED_BASIC_RATES; ieee80211_check_rate_mask(link); } if (params->use_cts_prot >= 0) { link->conf->use_cts_prot = params->use_cts_prot; changed |= BSS_CHANGED_ERP_CTS_PROT; } if (params->use_short_preamble >= 0) { link->conf->use_short_preamble = params->use_short_preamble; changed |= BSS_CHANGED_ERP_PREAMBLE; } if (!link->conf->use_short_slot && (sband->band == NL80211_BAND_5GHZ || sband->band == NL80211_BAND_6GHZ)) { link->conf->use_short_slot = true; changed |= BSS_CHANGED_ERP_SLOT; } if (params->use_short_slot_time >= 0) { link->conf->use_short_slot = params->use_short_slot_time; changed |= BSS_CHANGED_ERP_SLOT; } if (params->ap_isolate >= 0) { if (params->ap_isolate) sdata->flags |= IEEE80211_SDATA_DONT_BRIDGE_PACKETS; else sdata->flags &= ~IEEE80211_SDATA_DONT_BRIDGE_PACKETS; ieee80211_check_fast_rx_iface(sdata); } if (params->ht_opmode >= 0) { link->conf->ht_operation_mode = (u16)params->ht_opmode; changed |= BSS_CHANGED_HT; } if (params->p2p_ctwindow >= 0) { link->conf->p2p_noa_attr.oppps_ctwindow &= ~IEEE80211_P2P_OPPPS_CTWINDOW_MASK; link->conf->p2p_noa_attr.oppps_ctwindow |= params->p2p_ctwindow & IEEE80211_P2P_OPPPS_CTWINDOW_MASK; changed |= BSS_CHANGED_P2P_PS; } if (params->p2p_opp_ps > 0) { link->conf->p2p_noa_attr.oppps_ctwindow |= IEEE80211_P2P_OPPPS_ENABLE_BIT; changed |= BSS_CHANGED_P2P_PS; } else if (params->p2p_opp_ps == 0) { link->conf->p2p_noa_attr.oppps_ctwindow &= ~IEEE80211_P2P_OPPPS_ENABLE_BIT; changed |= BSS_CHANGED_P2P_PS; } ieee80211_link_info_change_notify(sdata, link, changed); return 0; } static int ieee80211_set_txq_params(struct wiphy *wiphy, struct net_device *dev, struct ieee80211_txq_params *params) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_link_data *link = ieee80211_link_or_deflink(sdata, params->link_id, true); struct ieee80211_tx_queue_params p; if (!local->ops->conf_tx) return -EOPNOTSUPP; if (local->hw.queues < IEEE80211_NUM_ACS) return -EOPNOTSUPP; if (IS_ERR(link)) return PTR_ERR(link); memset(&p, 0, sizeof(p)); p.aifs = params->aifs; p.cw_max = params->cwmax; p.cw_min = params->cwmin; p.txop = params->txop; /* * Setting tx queue params disables u-apsd because it's only * called in master mode. */ p.uapsd = false; ieee80211_regulatory_limit_wmm_params(sdata, &p, params->ac); link->tx_conf[params->ac] = p; if (drv_conf_tx(local, link, params->ac, &p)) { wiphy_debug(local->hw.wiphy, "failed to set TX queue parameters for AC %d\n", params->ac); return -EINVAL; } ieee80211_link_info_change_notify(sdata, link, BSS_CHANGED_QOS); return 0; } #ifdef CONFIG_PM static int ieee80211_suspend(struct wiphy *wiphy, struct cfg80211_wowlan *wowlan) { return __ieee80211_suspend(wiphy_priv(wiphy), wowlan); } static int ieee80211_resume(struct wiphy *wiphy) { return __ieee80211_resume(wiphy_priv(wiphy)); } #else #define ieee80211_suspend NULL #define ieee80211_resume NULL #endif static int ieee80211_scan(struct wiphy *wiphy, struct cfg80211_scan_request *req) { struct ieee80211_sub_if_data *sdata; sdata = IEEE80211_WDEV_TO_SUB_IF(req->wdev); switch (ieee80211_vif_type_p2p(&sdata->vif)) { case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_ADHOC: case NL80211_IFTYPE_MESH_POINT: case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_P2P_DEVICE: break; case NL80211_IFTYPE_P2P_GO: if (sdata->local->ops->hw_scan) break; /* * FIXME: implement NoA while scanning in software, * for now fall through to allow scanning only when * beaconing hasn't been configured yet */ fallthrough; case NL80211_IFTYPE_AP: /* * If the scan has been forced (and the driver supports * forcing), don't care about being beaconing already. * This will create problems to the attached stations (e.g. all * the frames sent while scanning on other channel will be * lost) */ if (sdata->deflink.u.ap.beacon && (!(wiphy->features & NL80211_FEATURE_AP_SCAN) || !(req->flags & NL80211_SCAN_FLAG_AP))) return -EOPNOTSUPP; break; case NL80211_IFTYPE_NAN: default: return -EOPNOTSUPP; } return ieee80211_request_scan(sdata, req); } static void ieee80211_abort_scan(struct wiphy *wiphy, struct wireless_dev *wdev) { ieee80211_scan_cancel(wiphy_priv(wiphy)); } static int ieee80211_sched_scan_start(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_sched_scan_request *req) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); if (!sdata->local->ops->sched_scan_start) return -EOPNOTSUPP; return ieee80211_request_sched_scan_start(sdata, req); } static int ieee80211_sched_scan_stop(struct wiphy *wiphy, struct net_device *dev, u64 reqid) { struct ieee80211_local *local = wiphy_priv(wiphy); if (!local->ops->sched_scan_stop) return -EOPNOTSUPP; return ieee80211_request_sched_scan_stop(local); } static int ieee80211_auth(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_auth_request *req) { return ieee80211_mgd_auth(IEEE80211_DEV_TO_SUB_IF(dev), req); } static int ieee80211_assoc(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_assoc_request *req) { return ieee80211_mgd_assoc(IEEE80211_DEV_TO_SUB_IF(dev), req); } static int ieee80211_deauth(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_deauth_request *req) { return ieee80211_mgd_deauth(IEEE80211_DEV_TO_SUB_IF(dev), req); } static int ieee80211_disassoc(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_disassoc_request *req) { return ieee80211_mgd_disassoc(IEEE80211_DEV_TO_SUB_IF(dev), req); } static int ieee80211_join_ibss(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_ibss_params *params) { return ieee80211_ibss_join(IEEE80211_DEV_TO_SUB_IF(dev), params); } static int ieee80211_leave_ibss(struct wiphy *wiphy, struct net_device *dev) { return ieee80211_ibss_leave(IEEE80211_DEV_TO_SUB_IF(dev)); } static int ieee80211_join_ocb(struct wiphy *wiphy, struct net_device *dev, struct ocb_setup *setup) { return ieee80211_ocb_join(IEEE80211_DEV_TO_SUB_IF(dev), setup); } static int ieee80211_leave_ocb(struct wiphy *wiphy, struct net_device *dev) { return ieee80211_ocb_leave(IEEE80211_DEV_TO_SUB_IF(dev)); } static int ieee80211_set_mcast_rate(struct wiphy *wiphy, struct net_device *dev, int rate[NUM_NL80211_BANDS]) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); memcpy(sdata->vif.bss_conf.mcast_rate, rate, sizeof(int) * NUM_NL80211_BANDS); ieee80211_link_info_change_notify(sdata, &sdata->deflink, BSS_CHANGED_MCAST_RATE); return 0; } static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed) { struct ieee80211_local *local = wiphy_priv(wiphy); int err; if (changed & WIPHY_PARAM_FRAG_THRESHOLD) { ieee80211_check_fast_xmit_all(local); err = drv_set_frag_threshold(local, wiphy->frag_threshold); if (err) { ieee80211_check_fast_xmit_all(local); return err; } } if ((changed & WIPHY_PARAM_COVERAGE_CLASS) || (changed & WIPHY_PARAM_DYN_ACK)) { s16 coverage_class; coverage_class = changed & WIPHY_PARAM_COVERAGE_CLASS ? wiphy->coverage_class : -1; err = drv_set_coverage_class(local, coverage_class); if (err) return err; } if (changed & WIPHY_PARAM_RTS_THRESHOLD) { err = drv_set_rts_threshold(local, wiphy->rts_threshold); if (err) return err; } if (changed & WIPHY_PARAM_RETRY_SHORT) { if (wiphy->retry_short > IEEE80211_MAX_TX_RETRY) return -EINVAL; local->hw.conf.short_frame_max_tx_count = wiphy->retry_short; } if (changed & WIPHY_PARAM_RETRY_LONG) { if (wiphy->retry_long > IEEE80211_MAX_TX_RETRY) return -EINVAL; local->hw.conf.long_frame_max_tx_count = wiphy->retry_long; } if (changed & (WIPHY_PARAM_RETRY_SHORT | WIPHY_PARAM_RETRY_LONG)) ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_RETRY_LIMITS); if (changed & (WIPHY_PARAM_TXQ_LIMIT | WIPHY_PARAM_TXQ_MEMORY_LIMIT | WIPHY_PARAM_TXQ_QUANTUM)) ieee80211_txq_set_params(local); return 0; } static int ieee80211_set_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, enum nl80211_tx_power_setting type, int mbm) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata; enum nl80211_tx_power_setting txp_type = type; bool update_txp_type = false; bool has_monitor = false; if (wdev) { sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); if (sdata->vif.type == NL80211_IFTYPE_MONITOR) { sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata); if (!sdata) return -EOPNOTSUPP; } switch (type) { case NL80211_TX_POWER_AUTOMATIC: sdata->deflink.user_power_level = IEEE80211_UNSET_POWER_LEVEL; txp_type = NL80211_TX_POWER_LIMITED; break; case NL80211_TX_POWER_LIMITED: case NL80211_TX_POWER_FIXED: if (mbm < 0 || (mbm % 100)) return -EOPNOTSUPP; sdata->deflink.user_power_level = MBM_TO_DBM(mbm); break; } if (txp_type != sdata->vif.bss_conf.txpower_type) { update_txp_type = true; sdata->vif.bss_conf.txpower_type = txp_type; } ieee80211_recalc_txpower(sdata, update_txp_type); return 0; } switch (type) { case NL80211_TX_POWER_AUTOMATIC: local->user_power_level = IEEE80211_UNSET_POWER_LEVEL; txp_type = NL80211_TX_POWER_LIMITED; break; case NL80211_TX_POWER_LIMITED: case NL80211_TX_POWER_FIXED: if (mbm < 0 || (mbm % 100)) return -EOPNOTSUPP; local->user_power_level = MBM_TO_DBM(mbm); break; } mutex_lock(&local->iflist_mtx); list_for_each_entry(sdata, &local->interfaces, list) { if (sdata->vif.type == NL80211_IFTYPE_MONITOR) { has_monitor = true; continue; } sdata->deflink.user_power_level = local->user_power_level; if (txp_type != sdata->vif.bss_conf.txpower_type) update_txp_type = true; sdata->vif.bss_conf.txpower_type = txp_type; } list_for_each_entry(sdata, &local->interfaces, list) { if (sdata->vif.type == NL80211_IFTYPE_MONITOR) continue; ieee80211_recalc_txpower(sdata, update_txp_type); } mutex_unlock(&local->iflist_mtx); if (has_monitor) { sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata); if (sdata) { sdata->deflink.user_power_level = local->user_power_level; if (txp_type != sdata->vif.bss_conf.txpower_type) update_txp_type = true; sdata->vif.bss_conf.txpower_type = txp_type; ieee80211_recalc_txpower(sdata, update_txp_type); } } return 0; } static int ieee80211_get_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, int *dbm) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); if (local->ops->get_txpower) return drv_get_txpower(local, sdata, dbm); if (!local->use_chanctx) *dbm = local->hw.conf.power_level; else *dbm = sdata->vif.bss_conf.txpower; return 0; } static void ieee80211_rfkill_poll(struct wiphy *wiphy) { struct ieee80211_local *local = wiphy_priv(wiphy); drv_rfkill_poll(local); } #ifdef CONFIG_NL80211_TESTMODE static int ieee80211_testmode_cmd(struct wiphy *wiphy, struct wireless_dev *wdev, void *data, int len) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_vif *vif = NULL; if (!local->ops->testmode_cmd) return -EOPNOTSUPP; if (wdev) { struct ieee80211_sub_if_data *sdata; sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); if (sdata->flags & IEEE80211_SDATA_IN_DRIVER) vif = &sdata->vif; } return local->ops->testmode_cmd(&local->hw, vif, data, len); } static int ieee80211_testmode_dump(struct wiphy *wiphy, struct sk_buff *skb, struct netlink_callback *cb, void *data, int len) { struct ieee80211_local *local = wiphy_priv(wiphy); if (!local->ops->testmode_dump) return -EOPNOTSUPP; return local->ops->testmode_dump(&local->hw, skb, cb, data, len); } #endif int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, enum ieee80211_smps_mode smps_mode) { const u8 *ap; enum ieee80211_smps_mode old_req; int err; struct sta_info *sta; bool tdls_peer_found = false; lockdep_assert_held(&sdata->wdev.mtx); if (WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION)) return -EINVAL; old_req = link->u.mgd.req_smps; link->u.mgd.req_smps = smps_mode; if (old_req == smps_mode && smps_mode != IEEE80211_SMPS_AUTOMATIC) return 0; /* * If not associated, or current association is not an HT * association, there's no need to do anything, just store * the new value until we associate. */ if (!sdata->u.mgd.associated || link->conf->chandef.width == NL80211_CHAN_WIDTH_20_NOHT) return 0; ap = link->u.mgd.bssid; rcu_read_lock(); list_for_each_entry_rcu(sta, &sdata->local->sta_list, list) { if (!sta->sta.tdls || sta->sdata != sdata || !sta->uploaded || !test_sta_flag(sta, WLAN_STA_AUTHORIZED)) continue; tdls_peer_found = true; break; } rcu_read_unlock(); if (smps_mode == IEEE80211_SMPS_AUTOMATIC) { if (tdls_peer_found || !sdata->u.mgd.powersave) smps_mode = IEEE80211_SMPS_OFF; else smps_mode = IEEE80211_SMPS_DYNAMIC; } /* send SM PS frame to AP */ err = ieee80211_send_smps_action(sdata, smps_mode, ap, ap); if (err) link->u.mgd.req_smps = old_req; else if (smps_mode != IEEE80211_SMPS_OFF && tdls_peer_found) ieee80211_teardown_tdls_peers(sdata); return err; } static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev, bool enabled, int timeout) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); unsigned int link_id; if (sdata->vif.type != NL80211_IFTYPE_STATION) return -EOPNOTSUPP; if (!ieee80211_hw_check(&local->hw, SUPPORTS_PS)) return -EOPNOTSUPP; if (enabled == sdata->u.mgd.powersave && timeout == local->dynamic_ps_forced_timeout) return 0; sdata->u.mgd.powersave = enabled; local->dynamic_ps_forced_timeout = timeout; /* no change, but if automatic follow powersave */ sdata_lock(sdata); for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { struct ieee80211_link_data *link; link = sdata_dereference(sdata->link[link_id], sdata); if (!link) continue; __ieee80211_request_smps_mgd(sdata, link, link->u.mgd.req_smps); } sdata_unlock(sdata); if (ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS)) ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); ieee80211_recalc_ps(local); ieee80211_recalc_ps_vif(sdata); ieee80211_check_fast_rx_iface(sdata); return 0; } static int ieee80211_set_cqm_rssi_config(struct wiphy *wiphy, struct net_device *dev, s32 rssi_thold, u32 rssi_hyst) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_vif *vif = &sdata->vif; struct ieee80211_bss_conf *bss_conf = &vif->bss_conf; if (rssi_thold == bss_conf->cqm_rssi_thold && rssi_hyst == bss_conf->cqm_rssi_hyst) return 0; if (sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER && !(sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI)) return -EOPNOTSUPP; bss_conf->cqm_rssi_thold = rssi_thold; bss_conf->cqm_rssi_hyst = rssi_hyst; bss_conf->cqm_rssi_low = 0; bss_conf->cqm_rssi_high = 0; sdata->deflink.u.mgd.last_cqm_event_signal = 0; /* tell the driver upon association, unless already associated */ if (sdata->u.mgd.associated && sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI) ieee80211_link_info_change_notify(sdata, &sdata->deflink, BSS_CHANGED_CQM); return 0; } static int ieee80211_set_cqm_rssi_range_config(struct wiphy *wiphy, struct net_device *dev, s32 rssi_low, s32 rssi_high) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_vif *vif = &sdata->vif; struct ieee80211_bss_conf *bss_conf = &vif->bss_conf; if (sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER) return -EOPNOTSUPP; bss_conf->cqm_rssi_low = rssi_low; bss_conf->cqm_rssi_high = rssi_high; bss_conf->cqm_rssi_thold = 0; bss_conf->cqm_rssi_hyst = 0; sdata->deflink.u.mgd.last_cqm_event_signal = 0; /* tell the driver upon association, unless already associated */ if (sdata->u.mgd.associated && sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI) ieee80211_link_info_change_notify(sdata, &sdata->deflink, BSS_CHANGED_CQM); return 0; } static int ieee80211_set_bitrate_mask(struct wiphy *wiphy, struct net_device *dev, unsigned int link_id, const u8 *addr, const struct cfg80211_bitrate_mask *mask) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); int i, ret; if (!ieee80211_sdata_running(sdata)) return -ENETDOWN; /* * If active validate the setting and reject it if it doesn't leave * at least one basic rate usable, since we really have to be able * to send something, and if we're an AP we have to be able to do * so at a basic rate so that all clients can receive it. */ if (rcu_access_pointer(sdata->vif.bss_conf.chanctx_conf) && sdata->vif.bss_conf.chandef.chan) { u32 basic_rates = sdata->vif.bss_conf.basic_rates; enum nl80211_band band = sdata->vif.bss_conf.chandef.chan->band; if (!(mask->control[band].legacy & basic_rates)) return -EINVAL; } if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) { ret = drv_set_bitrate_mask(local, sdata, mask); if (ret) return ret; } for (i = 0; i < NUM_NL80211_BANDS; i++) { struct ieee80211_supported_band *sband = wiphy->bands[i]; int j; sdata->rc_rateidx_mask[i] = mask->control[i].legacy; memcpy(sdata->rc_rateidx_mcs_mask[i], mask->control[i].ht_mcs, sizeof(mask->control[i].ht_mcs)); memcpy(sdata->rc_rateidx_vht_mcs_mask[i], mask->control[i].vht_mcs, sizeof(mask->control[i].vht_mcs)); sdata->rc_has_mcs_mask[i] = false; sdata->rc_has_vht_mcs_mask[i] = false; if (!sband) continue; for (j = 0; j < IEEE80211_HT_MCS_MASK_LEN; j++) { if (sdata->rc_rateidx_mcs_mask[i][j] != 0xff) { sdata->rc_has_mcs_mask[i] = true; break; } } for (j = 0; j < NL80211_VHT_NSS_MAX; j++) { if (sdata->rc_rateidx_vht_mcs_mask[i][j] != 0xffff) { sdata->rc_has_vht_mcs_mask[i] = true; break; } } } return 0; } static int ieee80211_start_radar_detection(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_chan_def *chandef, u32 cac_time_ms) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; int err; mutex_lock(&local->mtx); if (!list_empty(&local->roc_list) || local->scanning) { err = -EBUSY; goto out_unlock; } /* whatever, but channel contexts should not complain about that one */ sdata->deflink.smps_mode = IEEE80211_SMPS_OFF; sdata->deflink.needed_rx_chains = local->rx_chains; err = ieee80211_link_use_channel(&sdata->deflink, chandef, IEEE80211_CHANCTX_SHARED); if (err) goto out_unlock; ieee80211_queue_delayed_work(&sdata->local->hw, &sdata->deflink.dfs_cac_timer_work, msecs_to_jiffies(cac_time_ms)); out_unlock: mutex_unlock(&local->mtx); return err; } static void ieee80211_end_cac(struct wiphy *wiphy, struct net_device *dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; mutex_lock(&local->mtx); list_for_each_entry(sdata, &local->interfaces, list) { /* it might be waiting for the local->mtx, but then * by the time it gets it, sdata->wdev.cac_started * will no longer be true */ cancel_delayed_work(&sdata->deflink.dfs_cac_timer_work); if (sdata->wdev.cac_started) { ieee80211_link_release_channel(&sdata->deflink); sdata->wdev.cac_started = false; } } mutex_unlock(&local->mtx); } static struct cfg80211_beacon_data * cfg80211_beacon_dup(struct cfg80211_beacon_data *beacon) { struct cfg80211_beacon_data *new_beacon; u8 *pos; int len; len = beacon->head_len + beacon->tail_len + beacon->beacon_ies_len + beacon->proberesp_ies_len + beacon->assocresp_ies_len + beacon->probe_resp_len + beacon->lci_len + beacon->civicloc_len; if (beacon->mbssid_ies) len += ieee80211_get_mbssid_beacon_len(beacon->mbssid_ies, beacon->rnr_ies, beacon->mbssid_ies->cnt); new_beacon = kzalloc(sizeof(*new_beacon) + len, GFP_KERNEL); if (!new_beacon) return NULL; if (beacon->mbssid_ies && beacon->mbssid_ies->cnt) { new_beacon->mbssid_ies = kzalloc(struct_size(new_beacon->mbssid_ies, elem, beacon->mbssid_ies->cnt), GFP_KERNEL); if (!new_beacon->mbssid_ies) { kfree(new_beacon); return NULL; } if (beacon->rnr_ies && beacon->rnr_ies->cnt) { new_beacon->rnr_ies = kzalloc(struct_size(new_beacon->rnr_ies, elem, beacon->rnr_ies->cnt), GFP_KERNEL); if (!new_beacon->rnr_ies) { kfree(new_beacon->mbssid_ies); kfree(new_beacon); return NULL; } } } pos = (u8 *)(new_beacon + 1); if (beacon->head_len) { new_beacon->head_len = beacon->head_len; new_beacon->head = pos; memcpy(pos, beacon->head, beacon->head_len); pos += beacon->head_len; } if (beacon->tail_len) { new_beacon->tail_len = beacon->tail_len; new_beacon->tail = pos; memcpy(pos, beacon->tail, beacon->tail_len); pos += beacon->tail_len; } if (beacon->beacon_ies_len) { new_beacon->beacon_ies_len = beacon->beacon_ies_len; new_beacon->beacon_ies = pos; memcpy(pos, beacon->beacon_ies, beacon->beacon_ies_len); pos += beacon->beacon_ies_len; } if (beacon->proberesp_ies_len) { new_beacon->proberesp_ies_len = beacon->proberesp_ies_len; new_beacon->proberesp_ies = pos; memcpy(pos, beacon->proberesp_ies, beacon->proberesp_ies_len); pos += beacon->proberesp_ies_len; } if (beacon->assocresp_ies_len) { new_beacon->assocresp_ies_len = beacon->assocresp_ies_len; new_beacon->assocresp_ies = pos; memcpy(pos, beacon->assocresp_ies, beacon->assocresp_ies_len); pos += beacon->assocresp_ies_len; } if (beacon->probe_resp_len) { new_beacon->probe_resp_len = beacon->probe_resp_len; new_beacon->probe_resp = pos; memcpy(pos, beacon->probe_resp, beacon->probe_resp_len); pos += beacon->probe_resp_len; } if (beacon->mbssid_ies && beacon->mbssid_ies->cnt) { pos += ieee80211_copy_mbssid_beacon(pos, new_beacon->mbssid_ies, beacon->mbssid_ies); if (beacon->rnr_ies && beacon->rnr_ies->cnt) pos += ieee80211_copy_rnr_beacon(pos, new_beacon->rnr_ies, beacon->rnr_ies); } /* might copy -1, meaning no changes requested */ new_beacon->ftm_responder = beacon->ftm_responder; if (beacon->lci) { new_beacon->lci_len = beacon->lci_len; new_beacon->lci = pos; memcpy(pos, beacon->lci, beacon->lci_len); pos += beacon->lci_len; } if (beacon->civicloc) { new_beacon->civicloc_len = beacon->civicloc_len; new_beacon->civicloc = pos; memcpy(pos, beacon->civicloc, beacon->civicloc_len); pos += beacon->civicloc_len; } return new_beacon; } void ieee80211_csa_finish(struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_local *local = sdata->local; rcu_read_lock(); if (vif->mbssid_tx_vif == vif) { /* Trigger ieee80211_csa_finish() on the non-transmitting * interfaces when channel switch is received on * transmitting interface */ struct ieee80211_sub_if_data *iter; list_for_each_entry_rcu(iter, &local->interfaces, list) { if (!ieee80211_sdata_running(iter)) continue; if (iter == sdata || iter->vif.mbssid_tx_vif != vif) continue; ieee80211_queue_work(&iter->local->hw, &iter->deflink.csa_finalize_work); } } ieee80211_queue_work(&local->hw, &sdata->deflink.csa_finalize_work); rcu_read_unlock(); } EXPORT_SYMBOL(ieee80211_csa_finish); void ieee80211_channel_switch_disconnect(struct ieee80211_vif *vif, bool block_tx) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; sdata->deflink.csa_block_tx = block_tx; sdata_info(sdata, "channel switch failed, disconnecting\n"); wiphy_work_queue(local->hw.wiphy, &ifmgd->csa_connection_drop_work); } EXPORT_SYMBOL(ieee80211_channel_switch_disconnect); static int ieee80211_set_after_csa_beacon(struct ieee80211_sub_if_data *sdata, u64 *changed) { int err; switch (sdata->vif.type) { case NL80211_IFTYPE_AP: if (!sdata->deflink.u.ap.next_beacon) return -EINVAL; err = ieee80211_assign_beacon(sdata, &sdata->deflink, sdata->deflink.u.ap.next_beacon, NULL, NULL, changed); ieee80211_free_next_beacon(&sdata->deflink); if (err < 0) return err; break; case NL80211_IFTYPE_ADHOC: err = ieee80211_ibss_finish_csa(sdata, changed); if (err < 0) return err; break; #ifdef CONFIG_MAC80211_MESH case NL80211_IFTYPE_MESH_POINT: err = ieee80211_mesh_finish_csa(sdata, changed); if (err < 0) return err; break; #endif default: WARN_ON(1); return -EINVAL; } return 0; } static int __ieee80211_csa_finalize(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; u64 changed = 0; int err; sdata_assert_lock(sdata); lockdep_assert_held(&local->mtx); lockdep_assert_held(&local->chanctx_mtx); /* * using reservation isn't immediate as it may be deferred until later * with multi-vif. once reservation is complete it will re-schedule the * work with no reserved_chanctx so verify chandef to check if it * completed successfully */ if (sdata->deflink.reserved_chanctx) { /* * with multi-vif csa driver may call ieee80211_csa_finish() * many times while waiting for other interfaces to use their * reservations */ if (sdata->deflink.reserved_ready) return 0; return ieee80211_link_use_reserved_context(&sdata->deflink); } if (!cfg80211_chandef_identical(&sdata->vif.bss_conf.chandef, &sdata->deflink.csa_chandef)) return -EINVAL; sdata->vif.bss_conf.csa_active = false; err = ieee80211_set_after_csa_beacon(sdata, &changed); if (err) return err; if (sdata->vif.bss_conf.eht_puncturing != sdata->vif.bss_conf.csa_punct_bitmap) { sdata->vif.bss_conf.eht_puncturing = sdata->vif.bss_conf.csa_punct_bitmap; changed |= BSS_CHANGED_EHT_PUNCTURING; } ieee80211_link_info_change_notify(sdata, &sdata->deflink, changed); if (sdata->deflink.csa_block_tx) { ieee80211_wake_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_CSA); sdata->deflink.csa_block_tx = false; } err = drv_post_channel_switch(sdata); if (err) return err; cfg80211_ch_switch_notify(sdata->dev, &sdata->deflink.csa_chandef, 0, sdata->vif.bss_conf.eht_puncturing); return 0; } static void ieee80211_csa_finalize(struct ieee80211_sub_if_data *sdata) { if (__ieee80211_csa_finalize(sdata)) { sdata_info(sdata, "failed to finalize CSA, disconnecting\n"); cfg80211_stop_iface(sdata->local->hw.wiphy, &sdata->wdev, GFP_KERNEL); } } void ieee80211_csa_finalize_work(struct work_struct *work) { struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, deflink.csa_finalize_work); struct ieee80211_local *local = sdata->local; sdata_lock(sdata); mutex_lock(&local->mtx); mutex_lock(&local->chanctx_mtx); /* AP might have been stopped while waiting for the lock. */ if (!sdata->vif.bss_conf.csa_active) goto unlock; if (!ieee80211_sdata_running(sdata)) goto unlock; ieee80211_csa_finalize(sdata); unlock: mutex_unlock(&local->chanctx_mtx); mutex_unlock(&local->mtx); sdata_unlock(sdata); } static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata, struct cfg80211_csa_settings *params, u64 *changed) { struct ieee80211_csa_settings csa = {}; int err; switch (sdata->vif.type) { case NL80211_IFTYPE_AP: sdata->deflink.u.ap.next_beacon = cfg80211_beacon_dup(&params->beacon_after); if (!sdata->deflink.u.ap.next_beacon) return -ENOMEM; /* * With a count of 0, we don't have to wait for any * TBTT before switching, so complete the CSA * immediately. In theory, with a count == 1 we * should delay the switch until just before the next * TBTT, but that would complicate things so we switch * immediately too. If we would delay the switch * until the next TBTT, we would have to set the probe * response here. * * TODO: A channel switch with count <= 1 without * sending a CSA action frame is kind of useless, * because the clients won't know we're changing * channels. The action frame must be implemented * either here or in the userspace. */ if (params->count <= 1) break; if ((params->n_counter_offsets_beacon > IEEE80211_MAX_CNTDWN_COUNTERS_NUM) || (params->n_counter_offsets_presp > IEEE80211_MAX_CNTDWN_COUNTERS_NUM)) { ieee80211_free_next_beacon(&sdata->deflink); return -EINVAL; } csa.counter_offsets_beacon = params->counter_offsets_beacon; csa.counter_offsets_presp = params->counter_offsets_presp; csa.n_counter_offsets_beacon = params->n_counter_offsets_beacon; csa.n_counter_offsets_presp = params->n_counter_offsets_presp; csa.count = params->count; err = ieee80211_assign_beacon(sdata, &sdata->deflink, &params->beacon_csa, &csa, NULL, changed); if (err < 0) { ieee80211_free_next_beacon(&sdata->deflink); return err; } break; case NL80211_IFTYPE_ADHOC: if (!sdata->vif.cfg.ibss_joined) return -EINVAL; if (params->chandef.width != sdata->u.ibss.chandef.width) return -EINVAL; switch (params->chandef.width) { case NL80211_CHAN_WIDTH_40: if (cfg80211_get_chandef_type(&params->chandef) != cfg80211_get_chandef_type(&sdata->u.ibss.chandef)) return -EINVAL; break; case NL80211_CHAN_WIDTH_5: case NL80211_CHAN_WIDTH_10: case NL80211_CHAN_WIDTH_20_NOHT: case NL80211_CHAN_WIDTH_20: break; default: return -EINVAL; } /* changes into another band are not supported */ if (sdata->u.ibss.chandef.chan->band != params->chandef.chan->band) return -EINVAL; /* see comments in the NL80211_IFTYPE_AP block */ if (params->count > 1) { err = ieee80211_ibss_csa_beacon(sdata, params, changed); if (err < 0) return err; } ieee80211_send_action_csa(sdata, params); break; #ifdef CONFIG_MAC80211_MESH case NL80211_IFTYPE_MESH_POINT: { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; /* changes into another band are not supported */ if (sdata->vif.bss_conf.chandef.chan->band != params->chandef.chan->band) return -EINVAL; if (ifmsh->csa_role == IEEE80211_MESH_CSA_ROLE_NONE) { ifmsh->csa_role = IEEE80211_MESH_CSA_ROLE_INIT; if (!ifmsh->pre_value) ifmsh->pre_value = 1; else ifmsh->pre_value++; } /* see comments in the NL80211_IFTYPE_AP block */ if (params->count > 1) { err = ieee80211_mesh_csa_beacon(sdata, params, changed); if (err < 0) { ifmsh->csa_role = IEEE80211_MESH_CSA_ROLE_NONE; return err; } } if (ifmsh->csa_role == IEEE80211_MESH_CSA_ROLE_INIT) ieee80211_send_action_csa(sdata, params); break; } #endif default: return -EOPNOTSUPP; } return 0; } static void ieee80211_color_change_abort(struct ieee80211_sub_if_data *sdata) { sdata->vif.bss_conf.color_change_active = false; ieee80211_free_next_beacon(&sdata->deflink); cfg80211_color_change_aborted_notify(sdata->dev); } static int __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_csa_settings *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct ieee80211_channel_switch ch_switch; struct ieee80211_chanctx_conf *conf; struct ieee80211_chanctx *chanctx; u64 changed = 0; int err; sdata_assert_lock(sdata); lockdep_assert_held(&local->mtx); if (!list_empty(&local->roc_list) || local->scanning) return -EBUSY; if (sdata->wdev.cac_started) return -EBUSY; if (cfg80211_chandef_identical(&params->chandef, &sdata->vif.bss_conf.chandef)) return -EINVAL; /* don't allow another channel switch if one is already active. */ if (sdata->vif.bss_conf.csa_active) return -EBUSY; mutex_lock(&local->chanctx_mtx); conf = rcu_dereference_protected(sdata->vif.bss_conf.chanctx_conf, lockdep_is_held(&local->chanctx_mtx)); if (!conf) { err = -EBUSY; goto out; } if (params->chandef.chan->freq_offset) { /* this may work, but is untested */ err = -EOPNOTSUPP; goto out; } chanctx = container_of(conf, struct ieee80211_chanctx, conf); ch_switch.timestamp = 0; ch_switch.device_timestamp = 0; ch_switch.block_tx = params->block_tx; ch_switch.chandef = params->chandef; ch_switch.count = params->count; err = drv_pre_channel_switch(sdata, &ch_switch); if (err) goto out; err = ieee80211_link_reserve_chanctx(&sdata->deflink, &params->chandef, chanctx->mode, params->radar_required); if (err) goto out; /* if reservation is invalid then this will fail */ err = ieee80211_check_combinations(sdata, NULL, chanctx->mode, 0); if (err) { ieee80211_link_unreserve_chanctx(&sdata->deflink); goto out; } /* if there is a color change in progress, abort it */ if (sdata->vif.bss_conf.color_change_active) ieee80211_color_change_abort(sdata); err = ieee80211_set_csa_beacon(sdata, params, &changed); if (err) { ieee80211_link_unreserve_chanctx(&sdata->deflink); goto out; } if (params->punct_bitmap && !sdata->vif.bss_conf.eht_support) goto out; sdata->deflink.csa_chandef = params->chandef; sdata->deflink.csa_block_tx = params->block_tx; sdata->vif.bss_conf.csa_active = true; sdata->vif.bss_conf.csa_punct_bitmap = params->punct_bitmap; if (sdata->deflink.csa_block_tx) ieee80211_stop_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_CSA); cfg80211_ch_switch_started_notify(sdata->dev, &sdata->deflink.csa_chandef, 0, params->count, params->block_tx, sdata->vif.bss_conf.csa_punct_bitmap); if (changed) { ieee80211_link_info_change_notify(sdata, &sdata->deflink, changed); drv_channel_switch_beacon(sdata, &params->chandef); } else { /* if the beacon didn't change, we can finalize immediately */ ieee80211_csa_finalize(sdata); } out: mutex_unlock(&local->chanctx_mtx); return err; } int ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_csa_settings *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; int err; mutex_lock(&local->mtx); err = __ieee80211_channel_switch(wiphy, dev, params); mutex_unlock(&local->mtx); return err; } u64 ieee80211_mgmt_tx_cookie(struct ieee80211_local *local) { lockdep_assert_held(&local->mtx); local->roc_cookie_counter++; /* wow, you wrapped 64 bits ... more likely a bug */ if (WARN_ON(local->roc_cookie_counter == 0)) local->roc_cookie_counter++; return local->roc_cookie_counter; } int ieee80211_attach_ack_skb(struct ieee80211_local *local, struct sk_buff *skb, u64 *cookie, gfp_t gfp) { unsigned long spin_flags; struct sk_buff *ack_skb; int id; ack_skb = skb_copy(skb, gfp); if (!ack_skb) return -ENOMEM; spin_lock_irqsave(&local->ack_status_lock, spin_flags); id = idr_alloc(&local->ack_status_frames, ack_skb, 1, 0x2000, GFP_ATOMIC); spin_unlock_irqrestore(&local->ack_status_lock, spin_flags); if (id < 0) { kfree_skb(ack_skb); return -ENOMEM; } IEEE80211_SKB_CB(skb)->ack_frame_id = id; *cookie = ieee80211_mgmt_tx_cookie(local); IEEE80211_SKB_CB(ack_skb)->ack.cookie = *cookie; return 0; } static void ieee80211_update_mgmt_frame_registrations(struct wiphy *wiphy, struct wireless_dev *wdev, struct mgmt_frame_regs *upd) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); u32 preq_mask = BIT(IEEE80211_STYPE_PROBE_REQ >> 4); u32 action_mask = BIT(IEEE80211_STYPE_ACTION >> 4); bool global_change, intf_change; global_change = (local->probe_req_reg != !!(upd->global_stypes & preq_mask)) || (local->rx_mcast_action_reg != !!(upd->global_mcast_stypes & action_mask)); local->probe_req_reg = upd->global_stypes & preq_mask; local->rx_mcast_action_reg = upd->global_mcast_stypes & action_mask; intf_change = (sdata->vif.probe_req_reg != !!(upd->interface_stypes & preq_mask)) || (sdata->vif.rx_mcast_action_reg != !!(upd->interface_mcast_stypes & action_mask)); sdata->vif.probe_req_reg = upd->interface_stypes & preq_mask; sdata->vif.rx_mcast_action_reg = upd->interface_mcast_stypes & action_mask; if (!local->open_count) return; if (intf_change && ieee80211_sdata_running(sdata)) drv_config_iface_filter(local, sdata, sdata->vif.probe_req_reg ? FIF_PROBE_REQ : 0, FIF_PROBE_REQ); if (global_change) ieee80211_configure_filter(local); } static int ieee80211_set_antenna(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant) { struct ieee80211_local *local = wiphy_priv(wiphy); if (local->started) return -EOPNOTSUPP; return drv_set_antenna(local, tx_ant, rx_ant); } static int ieee80211_get_antenna(struct wiphy *wiphy, u32 *tx_ant, u32 *rx_ant) { struct ieee80211_local *local = wiphy_priv(wiphy); return drv_get_antenna(local, tx_ant, rx_ant); } static int ieee80211_set_rekey_data(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_gtk_rekey_data *data) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); if (!local->ops->set_rekey_data) return -EOPNOTSUPP; drv_set_rekey_data(local, sdata, data); return 0; } static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev, const u8 *peer, u64 *cookie) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct ieee80211_qos_hdr *nullfunc; struct sk_buff *skb; int size = sizeof(*nullfunc); __le16 fc; bool qos; struct ieee80211_tx_info *info; struct sta_info *sta; struct ieee80211_chanctx_conf *chanctx_conf; enum nl80211_band band; int ret; /* the lock is needed to assign the cookie later */ mutex_lock(&local->mtx); rcu_read_lock(); sta = sta_info_get_bss(sdata, peer); if (!sta) { ret = -ENOLINK; goto unlock; } qos = sta->sta.wme; chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (WARN_ON(!chanctx_conf)) { ret = -EINVAL; goto unlock; } band = chanctx_conf->def.chan->band; if (qos) { fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_QOS_NULLFUNC | IEEE80211_FCTL_FROMDS); } else { size -= 2; fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC | IEEE80211_FCTL_FROMDS); } skb = dev_alloc_skb(local->hw.extra_tx_headroom + size); if (!skb) { ret = -ENOMEM; goto unlock; } skb->dev = dev; skb_reserve(skb, local->hw.extra_tx_headroom); nullfunc = skb_put(skb, size); nullfunc->frame_control = fc; nullfunc->duration_id = 0; memcpy(nullfunc->addr1, sta->sta.addr, ETH_ALEN); memcpy(nullfunc->addr2, sdata->vif.addr, ETH_ALEN); memcpy(nullfunc->addr3, sdata->vif.addr, ETH_ALEN); nullfunc->seq_ctrl = 0; info = IEEE80211_SKB_CB(skb); info->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS | IEEE80211_TX_INTFL_NL80211_FRAME_TX; info->band = band; skb_set_queue_mapping(skb, IEEE80211_AC_VO); skb->priority = 7; if (qos) nullfunc->qos_ctrl = cpu_to_le16(7); ret = ieee80211_attach_ack_skb(local, skb, cookie, GFP_ATOMIC); if (ret) { kfree_skb(skb); goto unlock; } local_bh_disable(); ieee80211_xmit(sdata, sta, skb); local_bh_enable(); ret = 0; unlock: rcu_read_unlock(); mutex_unlock(&local->mtx); return ret; } static int ieee80211_cfg_get_channel(struct wiphy *wiphy, struct wireless_dev *wdev, unsigned int link_id, struct cfg80211_chan_def *chandef) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_link_data *link; int ret = -ENODATA; rcu_read_lock(); link = rcu_dereference(sdata->link[link_id]); if (!link) { ret = -ENOLINK; goto out; } chanctx_conf = rcu_dereference(link->conf->chanctx_conf); if (chanctx_conf) { *chandef = link->conf->chandef; ret = 0; } else if (local->open_count > 0 && local->open_count == local->monitors && sdata->vif.type == NL80211_IFTYPE_MONITOR) { if (local->use_chanctx) *chandef = local->monitor_chandef; else *chandef = local->_oper_chandef; ret = 0; } out: rcu_read_unlock(); return ret; } #ifdef CONFIG_PM static void ieee80211_set_wakeup(struct wiphy *wiphy, bool enabled) { drv_set_wakeup(wiphy_priv(wiphy), enabled); } #endif static int ieee80211_set_qos_map(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_qos_map *qos_map) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct mac80211_qos_map *new_qos_map, *old_qos_map; if (qos_map) { new_qos_map = kzalloc(sizeof(*new_qos_map), GFP_KERNEL); if (!new_qos_map) return -ENOMEM; memcpy(&new_qos_map->qos_map, qos_map, sizeof(*qos_map)); } else { /* A NULL qos_map was passed to disable QoS mapping */ new_qos_map = NULL; } old_qos_map = sdata_dereference(sdata->qos_map, sdata); rcu_assign_pointer(sdata->qos_map, new_qos_map); if (old_qos_map) kfree_rcu(old_qos_map, rcu_head); return 0; } static int ieee80211_set_ap_chanwidth(struct wiphy *wiphy, struct net_device *dev, unsigned int link_id, struct cfg80211_chan_def *chandef) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_link_data *link; int ret; u64 changed = 0; link = sdata_dereference(sdata->link[link_id], sdata); ret = ieee80211_link_change_bandwidth(link, chandef, &changed); if (ret == 0) ieee80211_link_info_change_notify(sdata, link, changed); return ret; } static int ieee80211_add_tx_ts(struct wiphy *wiphy, struct net_device *dev, u8 tsid, const u8 *peer, u8 up, u16 admitted_time) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; int ac = ieee802_1d_to_ac[up]; if (sdata->vif.type != NL80211_IFTYPE_STATION) return -EOPNOTSUPP; if (!(sdata->wmm_acm & BIT(up))) return -EINVAL; if (ifmgd->tx_tspec[ac].admitted_time) return -EBUSY; if (admitted_time) { ifmgd->tx_tspec[ac].admitted_time = 32 * admitted_time; ifmgd->tx_tspec[ac].tsid = tsid; ifmgd->tx_tspec[ac].up = up; } return 0; } static int ieee80211_del_tx_ts(struct wiphy *wiphy, struct net_device *dev, u8 tsid, const u8 *peer) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = wiphy_priv(wiphy); int ac; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { struct ieee80211_sta_tx_tspec *tx_tspec = &ifmgd->tx_tspec[ac]; /* skip unused entries */ if (!tx_tspec->admitted_time) continue; if (tx_tspec->tsid != tsid) continue; /* due to this new packets will be reassigned to non-ACM ACs */ tx_tspec->up = -1; /* Make sure that all packets have been sent to avoid to * restore the QoS params on packets that are still on the * queues. */ synchronize_net(); ieee80211_flush_queues(local, sdata, false); /* restore the normal QoS parameters * (unconditionally to avoid races) */ tx_tspec->action = TX_TSPEC_ACTION_STOP_DOWNGRADE; tx_tspec->downgraded = false; ieee80211_sta_handle_tspec_ac_params(sdata); /* finally clear all the data */ memset(tx_tspec, 0, sizeof(*tx_tspec)); return 0; } return -ENOENT; } void ieee80211_nan_func_terminated(struct ieee80211_vif *vif, u8 inst_id, enum nl80211_nan_func_term_reason reason, gfp_t gfp) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct cfg80211_nan_func *func; u64 cookie; if (WARN_ON(vif->type != NL80211_IFTYPE_NAN)) return; spin_lock_bh(&sdata->u.nan.func_lock); func = idr_find(&sdata->u.nan.function_inst_ids, inst_id); if (WARN_ON(!func)) { spin_unlock_bh(&sdata->u.nan.func_lock); return; } cookie = func->cookie; idr_remove(&sdata->u.nan.function_inst_ids, inst_id); spin_unlock_bh(&sdata->u.nan.func_lock); cfg80211_free_nan_func(func); cfg80211_nan_func_terminated(ieee80211_vif_to_wdev(vif), inst_id, reason, cookie, gfp); } EXPORT_SYMBOL(ieee80211_nan_func_terminated); void ieee80211_nan_func_match(struct ieee80211_vif *vif, struct cfg80211_nan_match_params *match, gfp_t gfp) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct cfg80211_nan_func *func; if (WARN_ON(vif->type != NL80211_IFTYPE_NAN)) return; spin_lock_bh(&sdata->u.nan.func_lock); func = idr_find(&sdata->u.nan.function_inst_ids, match->inst_id); if (WARN_ON(!func)) { spin_unlock_bh(&sdata->u.nan.func_lock); return; } match->cookie = func->cookie; spin_unlock_bh(&sdata->u.nan.func_lock); cfg80211_nan_match(ieee80211_vif_to_wdev(vif), match, gfp); } EXPORT_SYMBOL(ieee80211_nan_func_match); static int ieee80211_set_multicast_to_unicast(struct wiphy *wiphy, struct net_device *dev, const bool enabled) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); sdata->u.ap.multicast_to_unicast = enabled; return 0; } void ieee80211_fill_txq_stats(struct cfg80211_txq_stats *txqstats, struct txq_info *txqi) { if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_BACKLOG_BYTES))) { txqstats->filled |= BIT(NL80211_TXQ_STATS_BACKLOG_BYTES); txqstats->backlog_bytes = txqi->tin.backlog_bytes; } if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_BACKLOG_PACKETS))) { txqstats->filled |= BIT(NL80211_TXQ_STATS_BACKLOG_PACKETS); txqstats->backlog_packets = txqi->tin.backlog_packets; } if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_FLOWS))) { txqstats->filled |= BIT(NL80211_TXQ_STATS_FLOWS); txqstats->flows = txqi->tin.flows; } if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_DROPS))) { txqstats->filled |= BIT(NL80211_TXQ_STATS_DROPS); txqstats->drops = txqi->cstats.drop_count; } if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_ECN_MARKS))) { txqstats->filled |= BIT(NL80211_TXQ_STATS_ECN_MARKS); txqstats->ecn_marks = txqi->cstats.ecn_mark; } if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_OVERLIMIT))) { txqstats->filled |= BIT(NL80211_TXQ_STATS_OVERLIMIT); txqstats->overlimit = txqi->tin.overlimit; } if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_COLLISIONS))) { txqstats->filled |= BIT(NL80211_TXQ_STATS_COLLISIONS); txqstats->collisions = txqi->tin.collisions; } if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_TX_BYTES))) { txqstats->filled |= BIT(NL80211_TXQ_STATS_TX_BYTES); txqstats->tx_bytes = txqi->tin.tx_bytes; } if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_TX_PACKETS))) { txqstats->filled |= BIT(NL80211_TXQ_STATS_TX_PACKETS); txqstats->tx_packets = txqi->tin.tx_packets; } } static int ieee80211_get_txq_stats(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_txq_stats *txqstats) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata; int ret = 0; spin_lock_bh(&local->fq.lock); rcu_read_lock(); if (wdev) { sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); if (!sdata->vif.txq) { ret = 1; goto out; } ieee80211_fill_txq_stats(txqstats, to_txq_info(sdata->vif.txq)); } else { /* phy stats */ txqstats->filled |= BIT(NL80211_TXQ_STATS_BACKLOG_PACKETS) | BIT(NL80211_TXQ_STATS_BACKLOG_BYTES) | BIT(NL80211_TXQ_STATS_OVERLIMIT) | BIT(NL80211_TXQ_STATS_OVERMEMORY) | BIT(NL80211_TXQ_STATS_COLLISIONS) | BIT(NL80211_TXQ_STATS_MAX_FLOWS); txqstats->backlog_packets = local->fq.backlog; txqstats->backlog_bytes = local->fq.memory_usage; txqstats->overlimit = local->fq.overlimit; txqstats->overmemory = local->fq.overmemory; txqstats->collisions = local->fq.collisions; txqstats->max_flows = local->fq.flows_cnt; } out: rcu_read_unlock(); spin_unlock_bh(&local->fq.lock); return ret; } static int ieee80211_get_ftm_responder_stats(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_ftm_responder_stats *ftm_stats) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); return drv_get_ftm_responder_stats(local, sdata, ftm_stats); } static int ieee80211_start_pmsr(struct wiphy *wiphy, struct wireless_dev *dev, struct cfg80211_pmsr_request *request) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(dev); return drv_start_pmsr(local, sdata, request); } static void ieee80211_abort_pmsr(struct wiphy *wiphy, struct wireless_dev *dev, struct cfg80211_pmsr_request *request) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(dev); return drv_abort_pmsr(local, sdata, request); } static int ieee80211_set_tid_config(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_tid_config *tid_conf) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct sta_info *sta; int ret; if (!sdata->local->ops->set_tid_config) return -EOPNOTSUPP; if (!tid_conf->peer) return drv_set_tid_config(sdata->local, sdata, NULL, tid_conf); mutex_lock(&sdata->local->sta_mtx); sta = sta_info_get_bss(sdata, tid_conf->peer); if (!sta) { mutex_unlock(&sdata->local->sta_mtx); return -ENOENT; } ret = drv_set_tid_config(sdata->local, sdata, &sta->sta, tid_conf); mutex_unlock(&sdata->local->sta_mtx); return ret; } static int ieee80211_reset_tid_config(struct wiphy *wiphy, struct net_device *dev, const u8 *peer, u8 tids) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct sta_info *sta; int ret; if (!sdata->local->ops->reset_tid_config) return -EOPNOTSUPP; if (!peer) return drv_reset_tid_config(sdata->local, sdata, NULL, tids); mutex_lock(&sdata->local->sta_mtx); sta = sta_info_get_bss(sdata, peer); if (!sta) { mutex_unlock(&sdata->local->sta_mtx); return -ENOENT; } ret = drv_reset_tid_config(sdata->local, sdata, &sta->sta, tids); mutex_unlock(&sdata->local->sta_mtx); return ret; } static int ieee80211_set_sar_specs(struct wiphy *wiphy, struct cfg80211_sar_specs *sar) { struct ieee80211_local *local = wiphy_priv(wiphy); if (!local->ops->set_sar_specs) return -EOPNOTSUPP; return local->ops->set_sar_specs(&local->hw, sar); } static int ieee80211_set_after_color_change_beacon(struct ieee80211_sub_if_data *sdata, u64 *changed) { switch (sdata->vif.type) { case NL80211_IFTYPE_AP: { int ret; if (!sdata->deflink.u.ap.next_beacon) return -EINVAL; ret = ieee80211_assign_beacon(sdata, &sdata->deflink, sdata->deflink.u.ap.next_beacon, NULL, NULL, changed); ieee80211_free_next_beacon(&sdata->deflink); if (ret < 0) return ret; break; } default: WARN_ON_ONCE(1); return -EINVAL; } return 0; } static int ieee80211_set_color_change_beacon(struct ieee80211_sub_if_data *sdata, struct cfg80211_color_change_settings *params, u64 *changed) { struct ieee80211_color_change_settings color_change = {}; int err; switch (sdata->vif.type) { case NL80211_IFTYPE_AP: sdata->deflink.u.ap.next_beacon = cfg80211_beacon_dup(&params->beacon_next); if (!sdata->deflink.u.ap.next_beacon) return -ENOMEM; if (params->count <= 1) break; color_change.counter_offset_beacon = params->counter_offset_beacon; color_change.counter_offset_presp = params->counter_offset_presp; color_change.count = params->count; err = ieee80211_assign_beacon(sdata, &sdata->deflink, &params->beacon_color_change, NULL, &color_change, changed); if (err < 0) { ieee80211_free_next_beacon(&sdata->deflink); return err; } break; default: return -EOPNOTSUPP; } return 0; } static void ieee80211_color_change_bss_config_notify(struct ieee80211_sub_if_data *sdata, u8 color, int enable, u64 changed) { sdata->vif.bss_conf.he_bss_color.color = color; sdata->vif.bss_conf.he_bss_color.enabled = enable; changed |= BSS_CHANGED_HE_BSS_COLOR; ieee80211_link_info_change_notify(sdata, &sdata->deflink, changed); if (!sdata->vif.bss_conf.nontransmitted && sdata->vif.mbssid_tx_vif) { struct ieee80211_sub_if_data *child; mutex_lock(&sdata->local->iflist_mtx); list_for_each_entry(child, &sdata->local->interfaces, list) { if (child != sdata && child->vif.mbssid_tx_vif == &sdata->vif) { child->vif.bss_conf.he_bss_color.color = color; child->vif.bss_conf.he_bss_color.enabled = enable; ieee80211_link_info_change_notify(child, &child->deflink, BSS_CHANGED_HE_BSS_COLOR); } } mutex_unlock(&sdata->local->iflist_mtx); } } static int ieee80211_color_change_finalize(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; u64 changed = 0; int err; sdata_assert_lock(sdata); lockdep_assert_held(&local->mtx); sdata->vif.bss_conf.color_change_active = false; err = ieee80211_set_after_color_change_beacon(sdata, &changed); if (err) { cfg80211_color_change_aborted_notify(sdata->dev); return err; } ieee80211_color_change_bss_config_notify(sdata, sdata->vif.bss_conf.color_change_color, 1, changed); cfg80211_color_change_notify(sdata->dev); return 0; } void ieee80211_color_change_finalize_work(struct work_struct *work) { struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, deflink.color_change_finalize_work); struct ieee80211_local *local = sdata->local; sdata_lock(sdata); mutex_lock(&local->mtx); /* AP might have been stopped while waiting for the lock. */ if (!sdata->vif.bss_conf.color_change_active) goto unlock; if (!ieee80211_sdata_running(sdata)) goto unlock; ieee80211_color_change_finalize(sdata); unlock: mutex_unlock(&local->mtx); sdata_unlock(sdata); } void ieee80211_color_collision_detection_work(struct work_struct *work) { struct delayed_work *delayed_work = to_delayed_work(work); struct ieee80211_link_data *link = container_of(delayed_work, struct ieee80211_link_data, color_collision_detect_work); struct ieee80211_sub_if_data *sdata = link->sdata; sdata_lock(sdata); cfg80211_obss_color_collision_notify(sdata->dev, link->color_bitmap); sdata_unlock(sdata); } void ieee80211_color_change_finish(struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); ieee80211_queue_work(&sdata->local->hw, &sdata->deflink.color_change_finalize_work); } EXPORT_SYMBOL_GPL(ieee80211_color_change_finish); void ieee80211_obss_color_collision_notify(struct ieee80211_vif *vif, u64 color_bitmap, gfp_t gfp) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_link_data *link = &sdata->deflink; if (sdata->vif.bss_conf.color_change_active || sdata->vif.bss_conf.csa_active) return; if (delayed_work_pending(&link->color_collision_detect_work)) return; link->color_bitmap = color_bitmap; /* queue the color collision detection event every 500 ms in order to * avoid sending too much netlink messages to userspace. */ ieee80211_queue_delayed_work(&sdata->local->hw, &link->color_collision_detect_work, msecs_to_jiffies(500)); } EXPORT_SYMBOL_GPL(ieee80211_obss_color_collision_notify); static int ieee80211_color_change(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_color_change_settings *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; u64 changed = 0; int err; sdata_assert_lock(sdata); if (sdata->vif.bss_conf.nontransmitted) return -EINVAL; mutex_lock(&local->mtx); /* don't allow another color change if one is already active or if csa * is active */ if (sdata->vif.bss_conf.color_change_active || sdata->vif.bss_conf.csa_active) { err = -EBUSY; goto out; } err = ieee80211_set_color_change_beacon(sdata, params, &changed); if (err) goto out; sdata->vif.bss_conf.color_change_active = true; sdata->vif.bss_conf.color_change_color = params->color; cfg80211_color_change_started_notify(sdata->dev, params->count); if (changed) ieee80211_color_change_bss_config_notify(sdata, 0, 0, changed); else /* if the beacon didn't change, we can finalize immediately */ ieee80211_color_change_finalize(sdata); out: mutex_unlock(&local->mtx); return err; } static int ieee80211_set_radar_background(struct wiphy *wiphy, struct cfg80211_chan_def *chandef) { struct ieee80211_local *local = wiphy_priv(wiphy); if (!local->ops->set_radar_background) return -EOPNOTSUPP; return local->ops->set_radar_background(&local->hw, chandef); } static int ieee80211_add_intf_link(struct wiphy *wiphy, struct wireless_dev *wdev, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); int res; if (wdev->use_4addr) return -EOPNOTSUPP; mutex_lock(&sdata->local->mtx); res = ieee80211_vif_set_links(sdata, wdev->valid_links, 0); mutex_unlock(&sdata->local->mtx); return res; } static void ieee80211_del_intf_link(struct wiphy *wiphy, struct wireless_dev *wdev, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); mutex_lock(&sdata->local->mtx); ieee80211_vif_set_links(sdata, wdev->valid_links, 0); mutex_unlock(&sdata->local->mtx); } static int sta_add_link_station(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct link_station_parameters *params) { struct sta_info *sta; int ret; sta = sta_info_get_bss(sdata, params->mld_mac); if (!sta) return -ENOENT; if (!sta->sta.valid_links) return -EINVAL; if (sta->sta.valid_links & BIT(params->link_id)) return -EALREADY; ret = ieee80211_sta_allocate_link(sta, params->link_id); if (ret) return ret; ret = sta_link_apply_parameters(local, sta, true, params); if (ret) { ieee80211_sta_free_link(sta, params->link_id); return ret; } /* ieee80211_sta_activate_link frees the link upon failure */ return ieee80211_sta_activate_link(sta, params->link_id); } static int ieee80211_add_link_station(struct wiphy *wiphy, struct net_device *dev, struct link_station_parameters *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = wiphy_priv(wiphy); int ret; mutex_lock(&sdata->local->sta_mtx); ret = sta_add_link_station(local, sdata, params); mutex_unlock(&sdata->local->sta_mtx); return ret; } static int sta_mod_link_station(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct link_station_parameters *params) { struct sta_info *sta; sta = sta_info_get_bss(sdata, params->mld_mac); if (!sta) return -ENOENT; if (!(sta->sta.valid_links & BIT(params->link_id))) return -EINVAL; return sta_link_apply_parameters(local, sta, false, params); } static int ieee80211_mod_link_station(struct wiphy *wiphy, struct net_device *dev, struct link_station_parameters *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = wiphy_priv(wiphy); int ret; mutex_lock(&sdata->local->sta_mtx); ret = sta_mod_link_station(local, sdata, params); mutex_unlock(&sdata->local->sta_mtx); return ret; } static int sta_del_link_station(struct ieee80211_sub_if_data *sdata, struct link_station_del_parameters *params) { struct sta_info *sta; sta = sta_info_get_bss(sdata, params->mld_mac); if (!sta) return -ENOENT; if (!(sta->sta.valid_links & BIT(params->link_id))) return -EINVAL; /* must not create a STA without links */ if (sta->sta.valid_links == BIT(params->link_id)) return -EINVAL; ieee80211_sta_remove_link(sta, params->link_id); return 0; } static int ieee80211_del_link_station(struct wiphy *wiphy, struct net_device *dev, struct link_station_del_parameters *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); int ret; mutex_lock(&sdata->local->sta_mtx); ret = sta_del_link_station(sdata, params); mutex_unlock(&sdata->local->sta_mtx); return ret; } static int ieee80211_set_hw_timestamp(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_set_hw_timestamp *hwts) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; if (!local->ops->set_hw_timestamp) return -EOPNOTSUPP; if (!check_sdata_in_driver(sdata)) return -EIO; return local->ops->set_hw_timestamp(&local->hw, &sdata->vif, hwts); } const struct cfg80211_ops mac80211_config_ops = { .add_virtual_intf = ieee80211_add_iface, .del_virtual_intf = ieee80211_del_iface, .change_virtual_intf = ieee80211_change_iface, .start_p2p_device = ieee80211_start_p2p_device, .stop_p2p_device = ieee80211_stop_p2p_device, .add_key = ieee80211_add_key, .del_key = ieee80211_del_key, .get_key = ieee80211_get_key, .set_default_key = ieee80211_config_default_key, .set_default_mgmt_key = ieee80211_config_default_mgmt_key, .set_default_beacon_key = ieee80211_config_default_beacon_key, .start_ap = ieee80211_start_ap, .change_beacon = ieee80211_change_beacon, .stop_ap = ieee80211_stop_ap, .add_station = ieee80211_add_station, .del_station = ieee80211_del_station, .change_station = ieee80211_change_station, .get_station = ieee80211_get_station, .dump_station = ieee80211_dump_station, .dump_survey = ieee80211_dump_survey, #ifdef CONFIG_MAC80211_MESH .add_mpath = ieee80211_add_mpath, .del_mpath = ieee80211_del_mpath, .change_mpath = ieee80211_change_mpath, .get_mpath = ieee80211_get_mpath, .dump_mpath = ieee80211_dump_mpath, .get_mpp = ieee80211_get_mpp, .dump_mpp = ieee80211_dump_mpp, .update_mesh_config = ieee80211_update_mesh_config, .get_mesh_config = ieee80211_get_mesh_config, .join_mesh = ieee80211_join_mesh, .leave_mesh = ieee80211_leave_mesh, #endif .join_ocb = ieee80211_join_ocb, .leave_ocb = ieee80211_leave_ocb, .change_bss = ieee80211_change_bss, .inform_bss = ieee80211_inform_bss, .set_txq_params = ieee80211_set_txq_params, .set_monitor_channel = ieee80211_set_monitor_channel, .suspend = ieee80211_suspend, .resume = ieee80211_resume, .scan = ieee80211_scan, .abort_scan = ieee80211_abort_scan, .sched_scan_start = ieee80211_sched_scan_start, .sched_scan_stop = ieee80211_sched_scan_stop, .auth = ieee80211_auth, .assoc = ieee80211_assoc, .deauth = ieee80211_deauth, .disassoc = ieee80211_disassoc, .join_ibss = ieee80211_join_ibss, .leave_ibss = ieee80211_leave_ibss, .set_mcast_rate = ieee80211_set_mcast_rate, .set_wiphy_params = ieee80211_set_wiphy_params, .set_tx_power = ieee80211_set_tx_power, .get_tx_power = ieee80211_get_tx_power, .rfkill_poll = ieee80211_rfkill_poll, CFG80211_TESTMODE_CMD(ieee80211_testmode_cmd) CFG80211_TESTMODE_DUMP(ieee80211_testmode_dump) .set_power_mgmt = ieee80211_set_power_mgmt, .set_bitrate_mask = ieee80211_set_bitrate_mask, .remain_on_channel = ieee80211_remain_on_channel, .cancel_remain_on_channel = ieee80211_cancel_remain_on_channel, .mgmt_tx = ieee80211_mgmt_tx, .mgmt_tx_cancel_wait = ieee80211_mgmt_tx_cancel_wait, .set_cqm_rssi_config = ieee80211_set_cqm_rssi_config, .set_cqm_rssi_range_config = ieee80211_set_cqm_rssi_range_config, .update_mgmt_frame_registrations = ieee80211_update_mgmt_frame_registrations, .set_antenna = ieee80211_set_antenna, .get_antenna = ieee80211_get_antenna, .set_rekey_data = ieee80211_set_rekey_data, .tdls_oper = ieee80211_tdls_oper, .tdls_mgmt = ieee80211_tdls_mgmt, .tdls_channel_switch = ieee80211_tdls_channel_switch, .tdls_cancel_channel_switch = ieee80211_tdls_cancel_channel_switch, .probe_client = ieee80211_probe_client, .set_noack_map = ieee80211_set_noack_map, #ifdef CONFIG_PM .set_wakeup = ieee80211_set_wakeup, #endif .get_channel = ieee80211_cfg_get_channel, .start_radar_detection = ieee80211_start_radar_detection, .end_cac = ieee80211_end_cac, .channel_switch = ieee80211_channel_switch, .set_qos_map = ieee80211_set_qos_map, .set_ap_chanwidth = ieee80211_set_ap_chanwidth, .add_tx_ts = ieee80211_add_tx_ts, .del_tx_ts = ieee80211_del_tx_ts, .start_nan = ieee80211_start_nan, .stop_nan = ieee80211_stop_nan, .nan_change_conf = ieee80211_nan_change_conf, .add_nan_func = ieee80211_add_nan_func, .del_nan_func = ieee80211_del_nan_func, .set_multicast_to_unicast = ieee80211_set_multicast_to_unicast, .tx_control_port = ieee80211_tx_control_port, .get_txq_stats = ieee80211_get_txq_stats, .get_ftm_responder_stats = ieee80211_get_ftm_responder_stats, .start_pmsr = ieee80211_start_pmsr, .abort_pmsr = ieee80211_abort_pmsr, .probe_mesh_link = ieee80211_probe_mesh_link, .set_tid_config = ieee80211_set_tid_config, .reset_tid_config = ieee80211_reset_tid_config, .set_sar_specs = ieee80211_set_sar_specs, .color_change = ieee80211_color_change, .set_radar_background = ieee80211_set_radar_background, .add_intf_link = ieee80211_add_intf_link, .del_intf_link = ieee80211_del_intf_link, .add_link_station = ieee80211_add_link_station, .mod_link_station = ieee80211_mod_link_station, .del_link_station = ieee80211_del_link_station, .set_hw_timestamp = ieee80211_set_hw_timestamp, };
1135 610 610 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 // SPDX-License-Identifier: GPL-2.0-or-later /* * x86 instruction attribute tables * * Written by Masami Hiramatsu <mhiramat@redhat.com> */ #include <asm/insn.h> /* __ignore_sync_check__ */ /* Attribute tables are generated from opcode map */ #include "inat-tables.c" /* Attribute search APIs */ insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode) { return inat_primary_table[opcode]; } int inat_get_last_prefix_id(insn_byte_t last_pfx) { insn_attr_t lpfx_attr; lpfx_attr = inat_get_opcode_attribute(last_pfx); return inat_last_prefix_id(lpfx_attr); } insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, int lpfx_id, insn_attr_t esc_attr) { const insn_attr_t *table; int n; n = inat_escape_id(esc_attr); table = inat_escape_tables[n][0]; if (!table) return 0; if (inat_has_variant(table[opcode]) && lpfx_id) { table = inat_escape_tables[n][lpfx_id]; if (!table) return 0; } return table[opcode]; } insn_attr_t inat_get_group_attribute(insn_byte_t modrm, int lpfx_id, insn_attr_t grp_attr) { const insn_attr_t *table; int n; n = inat_group_id(grp_attr); table = inat_group_tables[n][0]; if (!table) return inat_group_common_attribute(grp_attr); if (inat_has_variant(table[X86_MODRM_REG(modrm)]) && lpfx_id) { table = inat_group_tables[n][lpfx_id]; if (!table) return inat_group_common_attribute(grp_attr); } return table[X86_MODRM_REG(modrm)] | inat_group_common_attribute(grp_attr); } insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m, insn_byte_t vex_p) { const insn_attr_t *table; if (vex_m > X86_VEX_M_MAX || vex_p > INAT_LSTPFX_MAX) return 0; /* At first, this checks the master table */ table = inat_avx_tables[vex_m][0]; if (!table) return 0; if (!inat_is_group(table[opcode]) && vex_p) { /* If this is not a group, get attribute directly */ table = inat_avx_tables[vex_m][vex_p]; if (!table) return 0; } return table[opcode]; }
5910 4482 5812 2301 4402 4400 4401 4400 4483 4483 4482 2304 4483 2240 4397 3081 2301 970 969 2298 2298 136 2301 1340 4399 201 4400 4400 4402 4401 2496 4402 4402 4402 4270 185 2304 2305 2305 2303 2302 2302 2301 2301 2300 2301 5940 5929 5930 5928 5929 5585 5916 5920 2522 2520 354 354 2336 2340 2341 44 2305 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 // SPDX-License-Identifier: GPL-2.0 /* * KFENCE guarded object allocator and fault handling. * * Copyright (C) 2020, Google LLC. */ #define pr_fmt(fmt) "kfence: " fmt #include <linux/atomic.h> #include <linux/bug.h> #include <linux/debugfs.h> #include <linux/hash.h> #include <linux/irq_work.h> #include <linux/jhash.h> #include <linux/kcsan-checks.h> #include <linux/kfence.h> #include <linux/kmemleak.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/log2.h> #include <linux/memblock.h> #include <linux/moduleparam.h> #include <linux/notifier.h> #include <linux/panic_notifier.h> #include <linux/random.h> #include <linux/rcupdate.h> #include <linux/sched/clock.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/string.h> #include <asm/kfence.h> #include "kfence.h" /* Disables KFENCE on the first warning assuming an irrecoverable error. */ #define KFENCE_WARN_ON(cond) \ ({ \ const bool __cond = WARN_ON(cond); \ if (unlikely(__cond)) { \ WRITE_ONCE(kfence_enabled, false); \ disabled_by_warn = true; \ } \ __cond; \ }) /* === Data ================================================================= */ static bool kfence_enabled __read_mostly; static bool disabled_by_warn __read_mostly; unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL; EXPORT_SYMBOL_GPL(kfence_sample_interval); /* Export for test modules. */ #ifdef MODULE_PARAM_PREFIX #undef MODULE_PARAM_PREFIX #endif #define MODULE_PARAM_PREFIX "kfence." static int kfence_enable_late(void); static int param_set_sample_interval(const char *val, const struct kernel_param *kp) { unsigned long num; int ret = kstrtoul(val, 0, &num); if (ret < 0) return ret; /* Using 0 to indicate KFENCE is disabled. */ if (!num && READ_ONCE(kfence_enabled)) { pr_info("disabled\n"); WRITE_ONCE(kfence_enabled, false); } *((unsigned long *)kp->arg) = num; if (num && !READ_ONCE(kfence_enabled) && system_state != SYSTEM_BOOTING) return disabled_by_warn ? -EINVAL : kfence_enable_late(); return 0; } static int param_get_sample_interval(char *buffer, const struct kernel_param *kp) { if (!READ_ONCE(kfence_enabled)) return sprintf(buffer, "0\n"); return param_get_ulong(buffer, kp); } static const struct kernel_param_ops sample_interval_param_ops = { .set = param_set_sample_interval, .get = param_get_sample_interval, }; module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600); /* Pool usage% threshold when currently covered allocations are skipped. */ static unsigned long kfence_skip_covered_thresh __read_mostly = 75; module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644); /* If true, use a deferrable timer. */ static bool kfence_deferrable __read_mostly = IS_ENABLED(CONFIG_KFENCE_DEFERRABLE); module_param_named(deferrable, kfence_deferrable, bool, 0444); /* If true, check all canary bytes on panic. */ static bool kfence_check_on_panic __read_mostly; module_param_named(check_on_panic, kfence_check_on_panic, bool, 0444); /* The pool of pages used for guard pages and objects. */ char *__kfence_pool __read_mostly; EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */ /* * Per-object metadata, with one-to-one mapping of object metadata to * backing pages (in __kfence_pool). */ static_assert(CONFIG_KFENCE_NUM_OBJECTS > 0); struct kfence_metadata *kfence_metadata __read_mostly; /* * If kfence_metadata is not NULL, it may be accessed by kfence_shutdown_cache(). * So introduce kfence_metadata_init to initialize metadata, and then make * kfence_metadata visible after initialization is successful. This prevents * potential UAF or access to uninitialized metadata. */ static struct kfence_metadata *kfence_metadata_init __read_mostly; /* Freelist with available objects. */ static struct list_head kfence_freelist = LIST_HEAD_INIT(kfence_freelist); static DEFINE_RAW_SPINLOCK(kfence_freelist_lock); /* Lock protecting freelist. */ /* * The static key to set up a KFENCE allocation; or if static keys are not used * to gate allocations, to avoid a load and compare if KFENCE is disabled. */ DEFINE_STATIC_KEY_FALSE(kfence_allocation_key); /* Gates the allocation, ensuring only one succeeds in a given period. */ atomic_t kfence_allocation_gate = ATOMIC_INIT(1); /* * A Counting Bloom filter of allocation coverage: limits currently covered * allocations of the same source filling up the pool. * * Assuming a range of 15%-85% unique allocations in the pool at any point in * time, the below parameters provide a probablity of 0.02-0.33 for false * positive hits respectively: * * P(alloc_traces) = (1 - e^(-HNUM * (alloc_traces / SIZE)) ^ HNUM */ #define ALLOC_COVERED_HNUM 2 #define ALLOC_COVERED_ORDER (const_ilog2(CONFIG_KFENCE_NUM_OBJECTS) + 2) #define ALLOC_COVERED_SIZE (1 << ALLOC_COVERED_ORDER) #define ALLOC_COVERED_HNEXT(h) hash_32(h, ALLOC_COVERED_ORDER) #define ALLOC_COVERED_MASK (ALLOC_COVERED_SIZE - 1) static atomic_t alloc_covered[ALLOC_COVERED_SIZE]; /* Stack depth used to determine uniqueness of an allocation. */ #define UNIQUE_ALLOC_STACK_DEPTH ((size_t)8) /* * Randomness for stack hashes, making the same collisions across reboots and * different machines less likely. */ static u32 stack_hash_seed __ro_after_init; /* Statistics counters for debugfs. */ enum kfence_counter_id { KFENCE_COUNTER_ALLOCATED, KFENCE_COUNTER_ALLOCS, KFENCE_COUNTER_FREES, KFENCE_COUNTER_ZOMBIES, KFENCE_COUNTER_BUGS, KFENCE_COUNTER_SKIP_INCOMPAT, KFENCE_COUNTER_SKIP_CAPACITY, KFENCE_COUNTER_SKIP_COVERED, KFENCE_COUNTER_COUNT, }; static atomic_long_t counters[KFENCE_COUNTER_COUNT]; static const char *const counter_names[] = { [KFENCE_COUNTER_ALLOCATED] = "currently allocated", [KFENCE_COUNTER_ALLOCS] = "total allocations", [KFENCE_COUNTER_FREES] = "total frees", [KFENCE_COUNTER_ZOMBIES] = "zombie allocations", [KFENCE_COUNTER_BUGS] = "total bugs", [KFENCE_COUNTER_SKIP_INCOMPAT] = "skipped allocations (incompatible)", [KFENCE_COUNTER_SKIP_CAPACITY] = "skipped allocations (capacity)", [KFENCE_COUNTER_SKIP_COVERED] = "skipped allocations (covered)", }; static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT); /* === Internals ============================================================ */ static inline bool should_skip_covered(void) { unsigned long thresh = (CONFIG_KFENCE_NUM_OBJECTS * kfence_skip_covered_thresh) / 100; return atomic_long_read(&counters[KFENCE_COUNTER_ALLOCATED]) > thresh; } static u32 get_alloc_stack_hash(unsigned long *stack_entries, size_t num_entries) { num_entries = min(num_entries, UNIQUE_ALLOC_STACK_DEPTH); num_entries = filter_irq_stacks(stack_entries, num_entries); return jhash(stack_entries, num_entries * sizeof(stack_entries[0]), stack_hash_seed); } /* * Adds (or subtracts) count @val for allocation stack trace hash * @alloc_stack_hash from Counting Bloom filter. */ static void alloc_covered_add(u32 alloc_stack_hash, int val) { int i; for (i = 0; i < ALLOC_COVERED_HNUM; i++) { atomic_add(val, &alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]); alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash); } } /* * Returns true if the allocation stack trace hash @alloc_stack_hash is * currently contained (non-zero count) in Counting Bloom filter. */ static bool alloc_covered_contains(u32 alloc_stack_hash) { int i; for (i = 0; i < ALLOC_COVERED_HNUM; i++) { if (!atomic_read(&alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK])) return false; alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash); } return true; } static bool kfence_protect(unsigned long addr) { return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true)); } static bool kfence_unprotect(unsigned long addr) { return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), false)); } static inline unsigned long metadata_to_pageaddr(const struct kfence_metadata *meta) { unsigned long offset = (meta - kfence_metadata + 1) * PAGE_SIZE * 2; unsigned long pageaddr = (unsigned long)&__kfence_pool[offset]; /* The checks do not affect performance; only called from slow-paths. */ /* Only call with a pointer into kfence_metadata. */ if (KFENCE_WARN_ON(meta < kfence_metadata || meta >= kfence_metadata + CONFIG_KFENCE_NUM_OBJECTS)) return 0; /* * This metadata object only ever maps to 1 page; verify that the stored * address is in the expected range. */ if (KFENCE_WARN_ON(ALIGN_DOWN(meta->addr, PAGE_SIZE) != pageaddr)) return 0; return pageaddr; } /* * Update the object's metadata state, including updating the alloc/free stacks * depending on the state transition. */ static noinline void metadata_update_state(struct kfence_metadata *meta, enum kfence_object_state next, unsigned long *stack_entries, size_t num_stack_entries) { struct kfence_track *track = next == KFENCE_OBJECT_FREED ? &meta->free_track : &meta->alloc_track; lockdep_assert_held(&meta->lock); if (stack_entries) { memcpy(track->stack_entries, stack_entries, num_stack_entries * sizeof(stack_entries[0])); } else { /* * Skip over 1 (this) functions; noinline ensures we do not * accidentally skip over the caller by never inlining. */ num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1); } track->num_stack_entries = num_stack_entries; track->pid = task_pid_nr(current); track->cpu = raw_smp_processor_id(); track->ts_nsec = local_clock(); /* Same source as printk timestamps. */ /* * Pairs with READ_ONCE() in * kfence_shutdown_cache(), * kfence_handle_page_fault(). */ WRITE_ONCE(meta->state, next); } /* Check canary byte at @addr. */ static inline bool check_canary_byte(u8 *addr) { struct kfence_metadata *meta; unsigned long flags; if (likely(*addr == KFENCE_CANARY_PATTERN_U8(addr))) return true; atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); meta = addr_to_metadata((unsigned long)addr); raw_spin_lock_irqsave(&meta->lock, flags); kfence_report_error((unsigned long)addr, false, NULL, meta, KFENCE_ERROR_CORRUPTION); raw_spin_unlock_irqrestore(&meta->lock, flags); return false; } static inline void set_canary(const struct kfence_metadata *meta) { const unsigned long pageaddr = ALIGN_DOWN(meta->addr, PAGE_SIZE); unsigned long addr = pageaddr; /* * The canary may be written to part of the object memory, but it does * not affect it. The user should initialize the object before using it. */ for (; addr < meta->addr; addr += sizeof(u64)) *((u64 *)addr) = KFENCE_CANARY_PATTERN_U64; addr = ALIGN_DOWN(meta->addr + meta->size, sizeof(u64)); for (; addr - pageaddr < PAGE_SIZE; addr += sizeof(u64)) *((u64 *)addr) = KFENCE_CANARY_PATTERN_U64; } static inline void check_canary(const struct kfence_metadata *meta) { const unsigned long pageaddr = ALIGN_DOWN(meta->addr, PAGE_SIZE); unsigned long addr = pageaddr; /* * We'll iterate over each canary byte per-side until a corrupted byte * is found. However, we'll still iterate over the canary bytes to the * right of the object even if there was an error in the canary bytes to * the left of the object. Specifically, if check_canary_byte() * generates an error, showing both sides might give more clues as to * what the error is about when displaying which bytes were corrupted. */ /* Apply to left of object. */ for (; meta->addr - addr >= sizeof(u64); addr += sizeof(u64)) { if (unlikely(*((u64 *)addr) != KFENCE_CANARY_PATTERN_U64)) break; } /* * If the canary is corrupted in a certain 64 bytes, or the canary * memory cannot be completely covered by multiple consecutive 64 bytes, * it needs to be checked one by one. */ for (; addr < meta->addr; addr++) { if (unlikely(!check_canary_byte((u8 *)addr))) break; } /* Apply to right of object. */ for (addr = meta->addr + meta->size; addr % sizeof(u64) != 0; addr++) { if (unlikely(!check_canary_byte((u8 *)addr))) return; } for (; addr - pageaddr < PAGE_SIZE; addr += sizeof(u64)) { if (unlikely(*((u64 *)addr) != KFENCE_CANARY_PATTERN_U64)) { for (; addr - pageaddr < PAGE_SIZE; addr++) { if (!check_canary_byte((u8 *)addr)) return; } } } } static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp, unsigned long *stack_entries, size_t num_stack_entries, u32 alloc_stack_hash) { struct kfence_metadata *meta = NULL; unsigned long flags; struct slab *slab; void *addr; const bool random_right_allocate = get_random_u32_below(2); const bool random_fault = CONFIG_KFENCE_STRESS_TEST_FAULTS && !get_random_u32_below(CONFIG_KFENCE_STRESS_TEST_FAULTS); /* Try to obtain a free object. */ raw_spin_lock_irqsave(&kfence_freelist_lock, flags); if (!list_empty(&kfence_freelist)) { meta = list_entry(kfence_freelist.next, struct kfence_metadata, list); list_del_init(&meta->list); } raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags); if (!meta) { atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_CAPACITY]); return NULL; } if (unlikely(!raw_spin_trylock_irqsave(&meta->lock, flags))) { /* * This is extremely unlikely -- we are reporting on a * use-after-free, which locked meta->lock, and the reporting * code via printk calls kmalloc() which ends up in * kfence_alloc() and tries to grab the same object that we're * reporting on. While it has never been observed, lockdep does * report that there is a possibility of deadlock. Fix it by * using trylock and bailing out gracefully. */ raw_spin_lock_irqsave(&kfence_freelist_lock, flags); /* Put the object back on the freelist. */ list_add_tail(&meta->list, &kfence_freelist); raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags); return NULL; } meta->addr = metadata_to_pageaddr(meta); /* Unprotect if we're reusing this page. */ if (meta->state == KFENCE_OBJECT_FREED) kfence_unprotect(meta->addr); /* * Note: for allocations made before RNG initialization, will always * return zero. We still benefit from enabling KFENCE as early as * possible, even when the RNG is not yet available, as this will allow * KFENCE to detect bugs due to earlier allocations. The only downside * is that the out-of-bounds accesses detected are deterministic for * such allocations. */ if (random_right_allocate) { /* Allocate on the "right" side, re-calculate address. */ meta->addr += PAGE_SIZE - size; meta->addr = ALIGN_DOWN(meta->addr, cache->align); } addr = (void *)meta->addr; /* Update remaining metadata. */ metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED, stack_entries, num_stack_entries); /* Pairs with READ_ONCE() in kfence_shutdown_cache(). */ WRITE_ONCE(meta->cache, cache); meta->size = size; meta->alloc_stack_hash = alloc_stack_hash; raw_spin_unlock_irqrestore(&meta->lock, flags); alloc_covered_add(alloc_stack_hash, 1); /* Set required slab fields. */ slab = virt_to_slab((void *)meta->addr); slab->slab_cache = cache; #if defined(CONFIG_SLUB) slab->objects = 1; #elif defined(CONFIG_SLAB) slab->s_mem = addr; #endif /* Memory initialization. */ set_canary(meta); /* * We check slab_want_init_on_alloc() ourselves, rather than letting * SL*B do the initialization, as otherwise we might overwrite KFENCE's * redzone. */ if (unlikely(slab_want_init_on_alloc(gfp, cache))) memzero_explicit(addr, size); if (cache->ctor) cache->ctor(addr); if (random_fault) kfence_protect(meta->addr); /* Random "faults" by protecting the object. */ atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCATED]); atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCS]); return addr; } static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool zombie) { struct kcsan_scoped_access assert_page_exclusive; unsigned long flags; bool init; raw_spin_lock_irqsave(&meta->lock, flags); if (meta->state != KFENCE_OBJECT_ALLOCATED || meta->addr != (unsigned long)addr) { /* Invalid or double-free, bail out. */ atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); kfence_report_error((unsigned long)addr, false, NULL, meta, KFENCE_ERROR_INVALID_FREE); raw_spin_unlock_irqrestore(&meta->lock, flags); return; } /* Detect racy use-after-free, or incorrect reallocation of this page by KFENCE. */ kcsan_begin_scoped_access((void *)ALIGN_DOWN((unsigned long)addr, PAGE_SIZE), PAGE_SIZE, KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT, &assert_page_exclusive); if (CONFIG_KFENCE_STRESS_TEST_FAULTS) kfence_unprotect((unsigned long)addr); /* To check canary bytes. */ /* Restore page protection if there was an OOB access. */ if (meta->unprotected_page) { memzero_explicit((void *)ALIGN_DOWN(meta->unprotected_page, PAGE_SIZE), PAGE_SIZE); kfence_protect(meta->unprotected_page); meta->unprotected_page = 0; } /* Mark the object as freed. */ metadata_update_state(meta, KFENCE_OBJECT_FREED, NULL, 0); init = slab_want_init_on_free(meta->cache); raw_spin_unlock_irqrestore(&meta->lock, flags); alloc_covered_add(meta->alloc_stack_hash, -1); /* Check canary bytes for memory corruption. */ check_canary(meta); /* * Clear memory if init-on-free is set. While we protect the page, the * data is still there, and after a use-after-free is detected, we * unprotect the page, so the data is still accessible. */ if (!zombie && unlikely(init)) memzero_explicit(addr, meta->size); /* Protect to detect use-after-frees. */ kfence_protect((unsigned long)addr); kcsan_end_scoped_access(&assert_page_exclusive); if (!zombie) { /* Add it to the tail of the freelist for reuse. */ raw_spin_lock_irqsave(&kfence_freelist_lock, flags); KFENCE_WARN_ON(!list_empty(&meta->list)); list_add_tail(&meta->list, &kfence_freelist); raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags); atomic_long_dec(&counters[KFENCE_COUNTER_ALLOCATED]); atomic_long_inc(&counters[KFENCE_COUNTER_FREES]); } else { /* See kfence_shutdown_cache(). */ atomic_long_inc(&counters[KFENCE_COUNTER_ZOMBIES]); } } static void rcu_guarded_free(struct rcu_head *h) { struct kfence_metadata *meta = container_of(h, struct kfence_metadata, rcu_head); kfence_guarded_free((void *)meta->addr, meta, false); } /* * Initialization of the KFENCE pool after its allocation. * Returns 0 on success; otherwise returns the address up to * which partial initialization succeeded. */ static unsigned long kfence_init_pool(void) { unsigned long addr; struct page *pages; int i; if (!arch_kfence_init_pool()) return (unsigned long)__kfence_pool; addr = (unsigned long)__kfence_pool; pages = virt_to_page(__kfence_pool); /* * Set up object pages: they must have PG_slab set, to avoid freeing * these as real pages. * * We also want to avoid inserting kfence_free() in the kfree() * fast-path in SLUB, and therefore need to ensure kfree() correctly * enters __slab_free() slow-path. */ for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { struct slab *slab = page_slab(nth_page(pages, i)); if (!i || (i % 2)) continue; __folio_set_slab(slab_folio(slab)); #ifdef CONFIG_MEMCG slab->memcg_data = (unsigned long)&kfence_metadata_init[i / 2 - 1].objcg | MEMCG_DATA_OBJCGS; #endif } /* * Protect the first 2 pages. The first page is mostly unnecessary, and * merely serves as an extended guard page. However, adding one * additional page in the beginning gives us an even number of pages, * which simplifies the mapping of address to metadata index. */ for (i = 0; i < 2; i++) { if (unlikely(!kfence_protect(addr))) return addr; addr += PAGE_SIZE; } for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { struct kfence_metadata *meta = &kfence_metadata_init[i]; /* Initialize metadata. */ INIT_LIST_HEAD(&meta->list); raw_spin_lock_init(&meta->lock); meta->state = KFENCE_OBJECT_UNUSED; meta->addr = addr; /* Initialize for validation in metadata_to_pageaddr(). */ list_add_tail(&meta->list, &kfence_freelist); /* Protect the right redzone. */ if (unlikely(!kfence_protect(addr + PAGE_SIZE))) goto reset_slab; addr += 2 * PAGE_SIZE; } /* * Make kfence_metadata visible only when initialization is successful. * Otherwise, if the initialization fails and kfence_metadata is freed, * it may cause UAF in kfence_shutdown_cache(). */ smp_store_release(&kfence_metadata, kfence_metadata_init); return 0; reset_slab: for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { struct slab *slab = page_slab(nth_page(pages, i)); if (!i || (i % 2)) continue; #ifdef CONFIG_MEMCG slab->memcg_data = 0; #endif __folio_clear_slab(slab_folio(slab)); } return addr; } static bool __init kfence_init_pool_early(void) { unsigned long addr; if (!__kfence_pool) return false; addr = kfence_init_pool(); if (!addr) { /* * The pool is live and will never be deallocated from this point on. * Ignore the pool object from the kmemleak phys object tree, as it would * otherwise overlap with allocations returned by kfence_alloc(), which * are registered with kmemleak through the slab post-alloc hook. */ kmemleak_ignore_phys(__pa(__kfence_pool)); return true; } /* * Only release unprotected pages, and do not try to go back and change * page attributes due to risk of failing to do so as well. If changing * page attributes for some pages fails, it is very likely that it also * fails for the first page, and therefore expect addr==__kfence_pool in * most failure cases. */ memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool)); __kfence_pool = NULL; memblock_free_late(__pa(kfence_metadata_init), KFENCE_METADATA_SIZE); kfence_metadata_init = NULL; return false; } /* === DebugFS Interface ==================================================== */ static int stats_show(struct seq_file *seq, void *v) { int i; seq_printf(seq, "enabled: %i\n", READ_ONCE(kfence_enabled)); for (i = 0; i < KFENCE_COUNTER_COUNT; i++) seq_printf(seq, "%s: %ld\n", counter_names[i], atomic_long_read(&counters[i])); return 0; } DEFINE_SHOW_ATTRIBUTE(stats); /* * debugfs seq_file operations for /sys/kernel/debug/kfence/objects. * start_object() and next_object() return the object index + 1, because NULL is used * to stop iteration. */ static void *start_object(struct seq_file *seq, loff_t *pos) { if (*pos < CONFIG_KFENCE_NUM_OBJECTS) return (void *)((long)*pos + 1); return NULL; } static void stop_object(struct seq_file *seq, void *v) { } static void *next_object(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; if (*pos < CONFIG_KFENCE_NUM_OBJECTS) return (void *)((long)*pos + 1); return NULL; } static int show_object(struct seq_file *seq, void *v) { struct kfence_metadata *meta = &kfence_metadata[(long)v - 1]; unsigned long flags; raw_spin_lock_irqsave(&meta->lock, flags); kfence_print_object(seq, meta); raw_spin_unlock_irqrestore(&meta->lock, flags); seq_puts(seq, "---------------------------------\n"); return 0; } static const struct seq_operations objects_sops = { .start = start_object, .next = next_object, .stop = stop_object, .show = show_object, }; DEFINE_SEQ_ATTRIBUTE(objects); static int kfence_debugfs_init(void) { struct dentry *kfence_dir; if (!READ_ONCE(kfence_enabled)) return 0; kfence_dir = debugfs_create_dir("kfence", NULL); debugfs_create_file("stats", 0444, kfence_dir, NULL, &stats_fops); debugfs_create_file("objects", 0400, kfence_dir, NULL, &objects_fops); return 0; } late_initcall(kfence_debugfs_init); /* === Panic Notifier ====================================================== */ static void kfence_check_all_canary(void) { int i; for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { struct kfence_metadata *meta = &kfence_metadata[i]; if (meta->state == KFENCE_OBJECT_ALLOCATED) check_canary(meta); } } static int kfence_check_canary_callback(struct notifier_block *nb, unsigned long reason, void *arg) { kfence_check_all_canary(); return NOTIFY_OK; } static struct notifier_block kfence_check_canary_notifier = { .notifier_call = kfence_check_canary_callback, }; /* === Allocation Gate Timer ================================================ */ static struct delayed_work kfence_timer; #ifdef CONFIG_KFENCE_STATIC_KEYS /* Wait queue to wake up allocation-gate timer task. */ static DECLARE_WAIT_QUEUE_HEAD(allocation_wait); static void wake_up_kfence_timer(struct irq_work *work) { wake_up(&allocation_wait); } static DEFINE_IRQ_WORK(wake_up_kfence_timer_work, wake_up_kfence_timer); #endif /* * Set up delayed work, which will enable and disable the static key. We need to * use a work queue (rather than a simple timer), since enabling and disabling a * static key cannot be done from an interrupt. * * Note: Toggling a static branch currently causes IPIs, and here we'll end up * with a total of 2 IPIs to all CPUs. If this ends up a problem in future (with * more aggressive sampling intervals), we could get away with a variant that * avoids IPIs, at the cost of not immediately capturing allocations if the * instructions remain cached. */ static void toggle_allocation_gate(struct work_struct *work) { if (!READ_ONCE(kfence_enabled)) return; atomic_set(&kfence_allocation_gate, 0); #ifdef CONFIG_KFENCE_STATIC_KEYS /* Enable static key, and await allocation to happen. */ static_branch_enable(&kfence_allocation_key); wait_event_idle(allocation_wait, atomic_read(&kfence_allocation_gate)); /* Disable static key and reset timer. */ static_branch_disable(&kfence_allocation_key); #endif queue_delayed_work(system_unbound_wq, &kfence_timer, msecs_to_jiffies(kfence_sample_interval)); } /* === Public interface ===================================================== */ void __init kfence_alloc_pool_and_metadata(void) { if (!kfence_sample_interval) return; /* * If the pool has already been initialized by arch, there is no need to * re-allocate the memory pool. */ if (!__kfence_pool) __kfence_pool = memblock_alloc(KFENCE_POOL_SIZE, PAGE_SIZE); if (!__kfence_pool) { pr_err("failed to allocate pool\n"); return; } /* The memory allocated by memblock has been zeroed out. */ kfence_metadata_init = memblock_alloc(KFENCE_METADATA_SIZE, PAGE_SIZE); if (!kfence_metadata_init) { pr_err("failed to allocate metadata\n"); memblock_free(__kfence_pool, KFENCE_POOL_SIZE); __kfence_pool = NULL; } } static void kfence_init_enable(void) { if (!IS_ENABLED(CONFIG_KFENCE_STATIC_KEYS)) static_branch_enable(&kfence_allocation_key); if (kfence_deferrable) INIT_DEFERRABLE_WORK(&kfence_timer, toggle_allocation_gate); else INIT_DELAYED_WORK(&kfence_timer, toggle_allocation_gate); if (kfence_check_on_panic) atomic_notifier_chain_register(&panic_notifier_list, &kfence_check_canary_notifier); WRITE_ONCE(kfence_enabled, true); queue_delayed_work(system_unbound_wq, &kfence_timer, 0); pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE, CONFIG_KFENCE_NUM_OBJECTS, (void *)__kfence_pool, (void *)(__kfence_pool + KFENCE_POOL_SIZE)); } void __init kfence_init(void) { stack_hash_seed = get_random_u32(); /* Setting kfence_sample_interval to 0 on boot disables KFENCE. */ if (!kfence_sample_interval) return; if (!kfence_init_pool_early()) { pr_err("%s failed\n", __func__); return; } kfence_init_enable(); } static int kfence_init_late(void) { const unsigned long nr_pages_pool = KFENCE_POOL_SIZE / PAGE_SIZE; const unsigned long nr_pages_meta = KFENCE_METADATA_SIZE / PAGE_SIZE; unsigned long addr = (unsigned long)__kfence_pool; unsigned long free_size = KFENCE_POOL_SIZE; int err = -ENOMEM; #ifdef CONFIG_CONTIG_ALLOC struct page *pages; pages = alloc_contig_pages(nr_pages_pool, GFP_KERNEL, first_online_node, NULL); if (!pages) return -ENOMEM; __kfence_pool = page_to_virt(pages); pages = alloc_contig_pages(nr_pages_meta, GFP_KERNEL, first_online_node, NULL); if (pages) kfence_metadata_init = page_to_virt(pages); #else if (nr_pages_pool > MAX_ORDER_NR_PAGES || nr_pages_meta > MAX_ORDER_NR_PAGES) { pr_warn("KFENCE_NUM_OBJECTS too large for buddy allocator\n"); return -EINVAL; } __kfence_pool = alloc_pages_exact(KFENCE_POOL_SIZE, GFP_KERNEL); if (!__kfence_pool) return -ENOMEM; kfence_metadata_init = alloc_pages_exact(KFENCE_METADATA_SIZE, GFP_KERNEL); #endif if (!kfence_metadata_init) goto free_pool; memzero_explicit(kfence_metadata_init, KFENCE_METADATA_SIZE); addr = kfence_init_pool(); if (!addr) { kfence_init_enable(); kfence_debugfs_init(); return 0; } pr_err("%s failed\n", __func__); free_size = KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool); err = -EBUSY; #ifdef CONFIG_CONTIG_ALLOC free_contig_range(page_to_pfn(virt_to_page((void *)kfence_metadata_init)), nr_pages_meta); free_pool: free_contig_range(page_to_pfn(virt_to_page((void *)addr)), free_size / PAGE_SIZE); #else free_pages_exact((void *)kfence_metadata_init, KFENCE_METADATA_SIZE); free_pool: free_pages_exact((void *)addr, free_size); #endif kfence_metadata_init = NULL; __kfence_pool = NULL; return err; } static int kfence_enable_late(void) { if (!__kfence_pool) return kfence_init_late(); WRITE_ONCE(kfence_enabled, true); queue_delayed_work(system_unbound_wq, &kfence_timer, 0); pr_info("re-enabled\n"); return 0; } void kfence_shutdown_cache(struct kmem_cache *s) { unsigned long flags; struct kfence_metadata *meta; int i; /* Pairs with release in kfence_init_pool(). */ if (!smp_load_acquire(&kfence_metadata)) return; for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { bool in_use; meta = &kfence_metadata[i]; /* * If we observe some inconsistent cache and state pair where we * should have returned false here, cache destruction is racing * with either kmem_cache_alloc() or kmem_cache_free(). Taking * the lock will not help, as different critical section * serialization will have the same outcome. */ if (READ_ONCE(meta->cache) != s || READ_ONCE(meta->state) != KFENCE_OBJECT_ALLOCATED) continue; raw_spin_lock_irqsave(&meta->lock, flags); in_use = meta->cache == s && meta->state == KFENCE_OBJECT_ALLOCATED; raw_spin_unlock_irqrestore(&meta->lock, flags); if (in_use) { /* * This cache still has allocations, and we should not * release them back into the freelist so they can still * safely be used and retain the kernel's default * behaviour of keeping the allocations alive (leak the * cache); however, they effectively become "zombie * allocations" as the KFENCE objects are the only ones * still in use and the owning cache is being destroyed. * * We mark them freed, so that any subsequent use shows * more useful error messages that will include stack * traces of the user of the object, the original * allocation, and caller to shutdown_cache(). */ kfence_guarded_free((void *)meta->addr, meta, /*zombie=*/true); } } for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { meta = &kfence_metadata[i]; /* See above. */ if (READ_ONCE(meta->cache) != s || READ_ONCE(meta->state) != KFENCE_OBJECT_FREED) continue; raw_spin_lock_irqsave(&meta->lock, flags); if (meta->cache == s && meta->state == KFENCE_OBJECT_FREED) meta->cache = NULL; raw_spin_unlock_irqrestore(&meta->lock, flags); } } void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) { unsigned long stack_entries[KFENCE_STACK_DEPTH]; size_t num_stack_entries; u32 alloc_stack_hash; /* * Perform size check before switching kfence_allocation_gate, so that * we don't disable KFENCE without making an allocation. */ if (size > PAGE_SIZE) { atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]); return NULL; } /* * Skip allocations from non-default zones, including DMA. We cannot * guarantee that pages in the KFENCE pool will have the requested * properties (e.g. reside in DMAable memory). */ if ((flags & GFP_ZONEMASK) || (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) { atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]); return NULL; } /* * Skip allocations for this slab, if KFENCE has been disabled for * this slab. */ if (s->flags & SLAB_SKIP_KFENCE) return NULL; if (atomic_inc_return(&kfence_allocation_gate) > 1) return NULL; #ifdef CONFIG_KFENCE_STATIC_KEYS /* * waitqueue_active() is fully ordered after the update of * kfence_allocation_gate per atomic_inc_return(). */ if (waitqueue_active(&allocation_wait)) { /* * Calling wake_up() here may deadlock when allocations happen * from within timer code. Use an irq_work to defer it. */ irq_work_queue(&wake_up_kfence_timer_work); } #endif if (!READ_ONCE(kfence_enabled)) return NULL; num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 0); /* * Do expensive check for coverage of allocation in slow-path after * allocation_gate has already become non-zero, even though it might * mean not making any allocation within a given sample interval. * * This ensures reasonable allocation coverage when the pool is almost * full, including avoiding long-lived allocations of the same source * filling up the pool (e.g. pagecache allocations). */ alloc_stack_hash = get_alloc_stack_hash(stack_entries, num_stack_entries); if (should_skip_covered() && alloc_covered_contains(alloc_stack_hash)) { atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_COVERED]); return NULL; } return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries, alloc_stack_hash); } size_t kfence_ksize(const void *addr) { const struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr); /* * Read locklessly -- if there is a race with __kfence_alloc(), this is * either a use-after-free or invalid access. */ return meta ? meta->size : 0; } void *kfence_object_start(const void *addr) { const struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr); /* * Read locklessly -- if there is a race with __kfence_alloc(), this is * either a use-after-free or invalid access. */ return meta ? (void *)meta->addr : NULL; } void __kfence_free(void *addr) { struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr); #ifdef CONFIG_MEMCG KFENCE_WARN_ON(meta->objcg); #endif /* * If the objects of the cache are SLAB_TYPESAFE_BY_RCU, defer freeing * the object, as the object page may be recycled for other-typed * objects once it has been freed. meta->cache may be NULL if the cache * was destroyed. */ if (unlikely(meta->cache && (meta->cache->flags & SLAB_TYPESAFE_BY_RCU))) call_rcu(&meta->rcu_head, rcu_guarded_free); else kfence_guarded_free(addr, meta, false); } bool kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs *regs) { const int page_index = (addr - (unsigned long)__kfence_pool) / PAGE_SIZE; struct kfence_metadata *to_report = NULL; enum kfence_error_type error_type; unsigned long flags; if (!is_kfence_address((void *)addr)) return false; if (!READ_ONCE(kfence_enabled)) /* If disabled at runtime ... */ return kfence_unprotect(addr); /* ... unprotect and proceed. */ atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); if (page_index % 2) { /* This is a redzone, report a buffer overflow. */ struct kfence_metadata *meta; int distance = 0; meta = addr_to_metadata(addr - PAGE_SIZE); if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) { to_report = meta; /* Data race ok; distance calculation approximate. */ distance = addr - data_race(meta->addr + meta->size); } meta = addr_to_metadata(addr + PAGE_SIZE); if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) { /* Data race ok; distance calculation approximate. */ if (!to_report || distance > data_race(meta->addr) - addr) to_report = meta; } if (!to_report) goto out; raw_spin_lock_irqsave(&to_report->lock, flags); to_report->unprotected_page = addr; error_type = KFENCE_ERROR_OOB; /* * If the object was freed before we took the look we can still * report this as an OOB -- the report will simply show the * stacktrace of the free as well. */ } else { to_report = addr_to_metadata(addr); if (!to_report) goto out; raw_spin_lock_irqsave(&to_report->lock, flags); error_type = KFENCE_ERROR_UAF; /* * We may race with __kfence_alloc(), and it is possible that a * freed object may be reallocated. We simply report this as a * use-after-free, with the stack trace showing the place where * the object was re-allocated. */ } out: if (to_report) { kfence_report_error(addr, is_write, regs, to_report, error_type); raw_spin_unlock_irqrestore(&to_report->lock, flags); } else { /* This may be a UAF or OOB access, but we can't be sure. */ kfence_report_error(addr, is_write, regs, NULL, KFENCE_ERROR_INVALID); } return kfence_unprotect(addr); /* Unprotect and let access proceed. */ }
19 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 /* SPDX-License-Identifier: GPL-2.0 */ /* * linux/cgroup-defs.h - basic definitions for cgroup * * This file provides basic type and interface. Include this file directly * only if necessary to avoid cyclic dependencies. */ #ifndef _LINUX_CGROUP_DEFS_H #define _LINUX_CGROUP_DEFS_H #include <linux/limits.h> #include <linux/list.h> #include <linux/idr.h> #include <linux/wait.h> #include <linux/mutex.h> #include <linux/rcupdate.h> #include <linux/refcount.h> #include <linux/percpu-refcount.h> #include <linux/percpu-rwsem.h> #include <linux/u64_stats_sync.h> #include <linux/workqueue.h> #include <linux/bpf-cgroup-defs.h> #include <linux/psi_types.h> #ifdef CONFIG_CGROUPS struct cgroup; struct cgroup_root; struct cgroup_subsys; struct cgroup_taskset; struct kernfs_node; struct kernfs_ops; struct kernfs_open_file; struct seq_file; struct poll_table_struct; #define MAX_CGROUP_TYPE_NAMELEN 32 #define MAX_CGROUP_ROOT_NAMELEN 64 #define MAX_CFTYPE_NAME 64 /* define the enumeration of all cgroup subsystems */ #define SUBSYS(_x) _x ## _cgrp_id, enum cgroup_subsys_id { #include <linux/cgroup_subsys.h> CGROUP_SUBSYS_COUNT, }; #undef SUBSYS /* bits in struct cgroup_subsys_state flags field */ enum { CSS_NO_REF = (1 << 0), /* no reference counting for this css */ CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */ CSS_RELEASED = (1 << 2), /* refcnt reached zero, released */ CSS_VISIBLE = (1 << 3), /* css is visible to userland */ CSS_DYING = (1 << 4), /* css is dying */ }; /* bits in struct cgroup flags field */ enum { /* Control Group requires release notifications to userspace */ CGRP_NOTIFY_ON_RELEASE, /* * Clone the parent's configuration when creating a new child * cpuset cgroup. For historical reasons, this option can be * specified at mount time and thus is implemented here. */ CGRP_CPUSET_CLONE_CHILDREN, /* Control group has to be frozen. */ CGRP_FREEZE, /* Cgroup is frozen. */ CGRP_FROZEN, /* Control group has to be killed. */ CGRP_KILL, }; /* cgroup_root->flags */ enum { CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ /* * Consider namespaces as delegation boundaries. If this flag is * set, controller specific interface files in a namespace root * aren't writeable from inside the namespace. */ CGRP_ROOT_NS_DELEGATE = (1 << 3), /* * Reduce latencies on dynamic cgroup modifications such as task * migrations and controller on/offs by disabling percpu operation on * cgroup_threadgroup_rwsem. This makes hot path operations such as * forks and exits into the slow path and more expensive. * * The static usage pattern of creating a cgroup, enabling controllers, * and then seeding it with CLONE_INTO_CGROUP doesn't require write * locking cgroup_threadgroup_rwsem and thus doesn't benefit from * favordynmod. */ CGRP_ROOT_FAVOR_DYNMODS = (1 << 4), /* * Enable cpuset controller in v1 cgroup to use v2 behavior. */ CGRP_ROOT_CPUSET_V2_MODE = (1 << 16), /* * Enable legacy local memory.events. */ CGRP_ROOT_MEMORY_LOCAL_EVENTS = (1 << 17), /* * Enable recursive subtree protection */ CGRP_ROOT_MEMORY_RECURSIVE_PROT = (1 << 18), }; /* cftype->flags */ enum { CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */ CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ CFTYPE_NS_DELEGATABLE = (1 << 2), /* writeable beyond delegation boundaries */ CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */ CFTYPE_DEBUG = (1 << 5), /* create when cgroup_debug */ /* internal flags, do not use outside cgroup core proper */ __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ __CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */ __CFTYPE_ADDED = (1 << 18), }; /* * cgroup_file is the handle for a file instance created in a cgroup which * is used, for example, to generate file changed notifications. This can * be obtained by setting cftype->file_offset. */ struct cgroup_file { /* do not access any fields from outside cgroup core */ struct kernfs_node *kn; unsigned long notified_at; struct timer_list notify_timer; }; /* * Per-subsystem/per-cgroup state maintained by the system. This is the * fundamental structural building block that controllers deal with. * * Fields marked with "PI:" are public and immutable and may be accessed * directly without synchronization. */ struct cgroup_subsys_state { /* PI: the cgroup that this css is attached to */ struct cgroup *cgroup; /* PI: the cgroup subsystem that this css is attached to */ struct cgroup_subsys *ss; /* reference count - access via css_[try]get() and css_put() */ struct percpu_ref refcnt; /* siblings list anchored at the parent's ->children */ struct list_head sibling; struct list_head children; /* flush target list anchored at cgrp->rstat_css_list */ struct list_head rstat_css_node; /* * PI: Subsys-unique ID. 0 is unused and root is always 1. The * matching css can be looked up using css_from_id(). */ int id; unsigned int flags; /* * Monotonically increasing unique serial number which defines a * uniform order among all csses. It's guaranteed that all * ->children lists are in the ascending order of ->serial_nr and * used to allow interrupting and resuming iterations. */ u64 serial_nr; /* * Incremented by online self and children. Used to guarantee that * parents are not offlined before their children. */ atomic_t online_cnt; /* percpu_ref killing and RCU release */ struct work_struct destroy_work; struct rcu_work destroy_rwork; /* * PI: the parent css. Placed here for cache proximity to following * fields of the containing structure. */ struct cgroup_subsys_state *parent; }; /* * A css_set is a structure holding pointers to a set of * cgroup_subsys_state objects. This saves space in the task struct * object and speeds up fork()/exit(), since a single inc/dec and a * list_add()/del() can bump the reference count on the entire cgroup * set for a task. */ struct css_set { /* * Set of subsystem states, one for each subsystem. This array is * immutable after creation apart from the init_css_set during * subsystem registration (at boot time). */ struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; /* reference count */ refcount_t refcount; /* * For a domain cgroup, the following points to self. If threaded, * to the matching cset of the nearest domain ancestor. The * dom_cset provides access to the domain cgroup and its csses to * which domain level resource consumptions should be charged. */ struct css_set *dom_cset; /* the default cgroup associated with this css_set */ struct cgroup *dfl_cgrp; /* internal task count, protected by css_set_lock */ int nr_tasks; /* * Lists running through all tasks using this cgroup group. * mg_tasks lists tasks which belong to this cset but are in the * process of being migrated out or in. Protected by * css_set_rwsem, but, during migration, once tasks are moved to * mg_tasks, it can be read safely while holding cgroup_mutex. */ struct list_head tasks; struct list_head mg_tasks; struct list_head dying_tasks; /* all css_task_iters currently walking this cset */ struct list_head task_iters; /* * On the default hierarchy, ->subsys[ssid] may point to a css * attached to an ancestor instead of the cgroup this css_set is * associated with. The following node is anchored at * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to * iterate through all css's attached to a given cgroup. */ struct list_head e_cset_node[CGROUP_SUBSYS_COUNT]; /* all threaded csets whose ->dom_cset points to this cset */ struct list_head threaded_csets; struct list_head threaded_csets_node; /* * List running through all cgroup groups in the same hash * slot. Protected by css_set_lock */ struct hlist_node hlist; /* * List of cgrp_cset_links pointing at cgroups referenced from this * css_set. Protected by css_set_lock. */ struct list_head cgrp_links; /* * List of csets participating in the on-going migration either as * source or destination. Protected by cgroup_mutex. */ struct list_head mg_src_preload_node; struct list_head mg_dst_preload_node; struct list_head mg_node; /* * If this cset is acting as the source of migration the following * two fields are set. mg_src_cgrp and mg_dst_cgrp are * respectively the source and destination cgroups of the on-going * migration. mg_dst_cset is the destination cset the target tasks * on this cset should be migrated to. Protected by cgroup_mutex. */ struct cgroup *mg_src_cgrp; struct cgroup *mg_dst_cgrp; struct css_set *mg_dst_cset; /* dead and being drained, ignore for migration */ bool dead; /* For RCU-protected deletion */ struct rcu_head rcu_head; }; struct cgroup_base_stat { struct task_cputime cputime; #ifdef CONFIG_SCHED_CORE u64 forceidle_sum; #endif }; /* * rstat - cgroup scalable recursive statistics. Accounting is done * per-cpu in cgroup_rstat_cpu which is then lazily propagated up the * hierarchy on reads. * * When a stat gets updated, the cgroup_rstat_cpu and its ancestors are * linked into the updated tree. On the following read, propagation only * considers and consumes the updated tree. This makes reading O(the * number of descendants which have been active since last read) instead of * O(the total number of descendants). * * This is important because there can be a lot of (draining) cgroups which * aren't active and stat may be read frequently. The combination can * become very expensive. By propagating selectively, increasing reading * frequency decreases the cost of each read. * * This struct hosts both the fields which implement the above - * updated_children and updated_next - and the fields which track basic * resource statistics on top of it - bsync, bstat and last_bstat. */ struct cgroup_rstat_cpu { /* * ->bsync protects ->bstat. These are the only fields which get * updated in the hot path. */ struct u64_stats_sync bsync; struct cgroup_base_stat bstat; /* * Snapshots at the last reading. These are used to calculate the * deltas to propagate to the global counters. */ struct cgroup_base_stat last_bstat; /* * This field is used to record the cumulative per-cpu time of * the cgroup and its descendants. Currently it can be read via * eBPF/drgn etc, and we are still trying to determine how to * expose it in the cgroupfs interface. */ struct cgroup_base_stat subtree_bstat; /* * Snapshots at the last reading. These are used to calculate the * deltas to propagate to the per-cpu subtree_bstat. */ struct cgroup_base_stat last_subtree_bstat; /* * Child cgroups with stat updates on this cpu since the last read * are linked on the parent's ->updated_children through * ->updated_next. * * In addition to being more compact, singly-linked list pointing * to the cgroup makes it unnecessary for each per-cpu struct to * point back to the associated cgroup. * * Protected by per-cpu cgroup_rstat_cpu_lock. */ struct cgroup *updated_children; /* terminated by self cgroup */ struct cgroup *updated_next; /* NULL iff not on the list */ }; struct cgroup_freezer_state { /* Should the cgroup and its descendants be frozen. */ bool freeze; /* Should the cgroup actually be frozen? */ int e_freeze; /* Fields below are protected by css_set_lock */ /* Number of frozen descendant cgroups */ int nr_frozen_descendants; /* * Number of tasks, which are counted as frozen: * frozen, SIGSTOPped, and PTRACEd. */ int nr_frozen_tasks; }; struct cgroup { /* self css with NULL ->ss, points back to this cgroup */ struct cgroup_subsys_state self; unsigned long flags; /* "unsigned long" so bitops work */ /* * The depth this cgroup is at. The root is at depth zero and each * step down the hierarchy increments the level. This along with * ancestors[] can determine whether a given cgroup is a * descendant of another without traversing the hierarchy. */ int level; /* Maximum allowed descent tree depth */ int max_depth; /* * Keep track of total numbers of visible and dying descent cgroups. * Dying cgroups are cgroups which were deleted by a user, * but are still existing because someone else is holding a reference. * max_descendants is a maximum allowed number of descent cgroups. * * nr_descendants and nr_dying_descendants are protected * by cgroup_mutex and css_set_lock. It's fine to read them holding * any of cgroup_mutex and css_set_lock; for writing both locks * should be held. */ int nr_descendants; int nr_dying_descendants; int max_descendants; /* * Each non-empty css_set associated with this cgroup contributes * one to nr_populated_csets. The counter is zero iff this cgroup * doesn't have any tasks. * * All children which have non-zero nr_populated_csets and/or * nr_populated_children of their own contribute one to either * nr_populated_domain_children or nr_populated_threaded_children * depending on their type. Each counter is zero iff all cgroups * of the type in the subtree proper don't have any tasks. */ int nr_populated_csets; int nr_populated_domain_children; int nr_populated_threaded_children; int nr_threaded_children; /* # of live threaded child cgroups */ struct kernfs_node *kn; /* cgroup kernfs entry */ struct cgroup_file procs_file; /* handle for "cgroup.procs" */ struct cgroup_file events_file; /* handle for "cgroup.events" */ /* handles for "{cpu,memory,io,irq}.pressure" */ struct cgroup_file psi_files[NR_PSI_RESOURCES]; /* * The bitmask of subsystems enabled on the child cgroups. * ->subtree_control is the one configured through * "cgroup.subtree_control" while ->subtree_ss_mask is the effective * one which may have more subsystems enabled. Controller knobs * are made available iff it's enabled in ->subtree_control. */ u16 subtree_control; u16 subtree_ss_mask; u16 old_subtree_control; u16 old_subtree_ss_mask; /* Private pointers for each registered subsystem */ struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; struct cgroup_root *root; /* * List of cgrp_cset_links pointing at css_sets with tasks in this * cgroup. Protected by css_set_lock. */ struct list_head cset_links; /* * On the default hierarchy, a css_set for a cgroup with some * susbsys disabled will point to css's which are associated with * the closest ancestor which has the subsys enabled. The * following lists all css_sets which point to this cgroup's css * for the given subsystem. */ struct list_head e_csets[CGROUP_SUBSYS_COUNT]; /* * If !threaded, self. If threaded, it points to the nearest * domain ancestor. Inside a threaded subtree, cgroups are exempt * from process granularity and no-internal-task constraint. * Domain level resource consumptions which aren't tied to a * specific task are charged to the dom_cgrp. */ struct cgroup *dom_cgrp; struct cgroup *old_dom_cgrp; /* used while enabling threaded */ /* per-cpu recursive resource statistics */ struct cgroup_rstat_cpu __percpu *rstat_cpu; struct list_head rstat_css_list; /* cgroup basic resource statistics */ struct cgroup_base_stat last_bstat; struct cgroup_base_stat bstat; struct prev_cputime prev_cputime; /* for printing out cputime */ /* * list of pidlists, up to two for each namespace (one for procs, one * for tasks); created on demand. */ struct list_head pidlists; struct mutex pidlist_mutex; /* used to wait for offlining of csses */ wait_queue_head_t offline_waitq; /* used to schedule release agent */ struct work_struct release_agent_work; /* used to track pressure stalls */ struct psi_group *psi; /* used to store eBPF programs */ struct cgroup_bpf bpf; /* If there is block congestion on this cgroup. */ atomic_t congestion_count; /* Used to store internal freezer state */ struct cgroup_freezer_state freezer; #ifdef CONFIG_BPF_SYSCALL struct bpf_local_storage __rcu *bpf_cgrp_storage; #endif /* All ancestors including self */ struct cgroup *ancestors[]; }; /* * A cgroup_root represents the root of a cgroup hierarchy, and may be * associated with a kernfs_root to form an active hierarchy. This is * internal to cgroup core. Don't access directly from controllers. */ struct cgroup_root { struct kernfs_root *kf_root; /* The bitmask of subsystems attached to this hierarchy */ unsigned int subsys_mask; /* Unique id for this hierarchy. */ int hierarchy_id; /* * The root cgroup. The containing cgroup_root will be destroyed on its * release. cgrp->ancestors[0] will be used overflowing into the * following field. cgrp_ancestor_storage must immediately follow. */ struct cgroup cgrp; /* must follow cgrp for cgrp->ancestors[0], see above */ struct cgroup *cgrp_ancestor_storage; /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ atomic_t nr_cgrps; /* A list running through the active hierarchies */ struct list_head root_list; /* Hierarchy-specific flags */ unsigned int flags; /* The path to use for release notifications. */ char release_agent_path[PATH_MAX]; /* The name for this hierarchy - may be empty */ char name[MAX_CGROUP_ROOT_NAMELEN]; }; /* * struct cftype: handler definitions for cgroup control files * * When reading/writing to a file: * - the cgroup to use is file->f_path.dentry->d_parent->d_fsdata * - the 'cftype' of the file is file->f_path.dentry->d_fsdata */ struct cftype { /* * By convention, the name should begin with the name of the * subsystem, followed by a period. Zero length string indicates * end of cftype array. */ char name[MAX_CFTYPE_NAME]; unsigned long private; /* * The maximum length of string, excluding trailing nul, that can * be passed to write. If < PAGE_SIZE-1, PAGE_SIZE-1 is assumed. */ size_t max_write_len; /* CFTYPE_* flags */ unsigned int flags; /* * If non-zero, should contain the offset from the start of css to * a struct cgroup_file field. cgroup will record the handle of * the created file into it. The recorded handle can be used as * long as the containing css remains accessible. */ unsigned int file_offset; /* * Fields used for internal bookkeeping. Initialized automatically * during registration. */ struct cgroup_subsys *ss; /* NULL for cgroup core files */ struct list_head node; /* anchored at ss->cfts */ struct kernfs_ops *kf_ops; int (*open)(struct kernfs_open_file *of); void (*release)(struct kernfs_open_file *of); /* * read_u64() is a shortcut for the common case of returning a * single integer. Use it in place of read() */ u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft); /* * read_s64() is a signed version of read_u64() */ s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft); /* generic seq_file read interface */ int (*seq_show)(struct seq_file *sf, void *v); /* optional ops, implement all or none */ void *(*seq_start)(struct seq_file *sf, loff_t *ppos); void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos); void (*seq_stop)(struct seq_file *sf, void *v); /* * write_u64() is a shortcut for the common case of accepting * a single integer (as parsed by simple_strtoull) from * userspace. Use in place of write(); return 0 or error. */ int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft, u64 val); /* * write_s64() is a signed version of write_u64() */ int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft, s64 val); /* * write() is the generic write callback which maps directly to * kernfs write operation and overrides all other operations. * Maximum write size is determined by ->max_write_len. Use * of_css/cft() to access the associated css and cft. */ ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); __poll_t (*poll)(struct kernfs_open_file *of, struct poll_table_struct *pt); #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lock_class_key lockdep_key; #endif }; /* * Control Group subsystem type. * See Documentation/admin-guide/cgroup-v1/cgroups.rst for details */ struct cgroup_subsys { struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css); int (*css_online)(struct cgroup_subsys_state *css); void (*css_offline)(struct cgroup_subsys_state *css); void (*css_released)(struct cgroup_subsys_state *css); void (*css_free)(struct cgroup_subsys_state *css); void (*css_reset)(struct cgroup_subsys_state *css); void (*css_rstat_flush)(struct cgroup_subsys_state *css, int cpu); int (*css_extra_stat_show)(struct seq_file *seq, struct cgroup_subsys_state *css); int (*css_local_stat_show)(struct seq_file *seq, struct cgroup_subsys_state *css); int (*can_attach)(struct cgroup_taskset *tset); void (*cancel_attach)(struct cgroup_taskset *tset); void (*attach)(struct cgroup_taskset *tset); void (*post_attach)(void); int (*can_fork)(struct task_struct *task, struct css_set *cset); void (*cancel_fork)(struct task_struct *task, struct css_set *cset); void (*fork)(struct task_struct *task); void (*exit)(struct task_struct *task); void (*release)(struct task_struct *task); void (*bind)(struct cgroup_subsys_state *root_css); bool early_init:1; /* * If %true, the controller, on the default hierarchy, doesn't show * up in "cgroup.controllers" or "cgroup.subtree_control", is * implicitly enabled on all cgroups on the default hierarchy, and * bypasses the "no internal process" constraint. This is for * utility type controllers which is transparent to userland. * * An implicit controller can be stolen from the default hierarchy * anytime and thus must be okay with offline csses from previous * hierarchies coexisting with csses for the current one. */ bool implicit_on_dfl:1; /* * If %true, the controller, supports threaded mode on the default * hierarchy. In a threaded subtree, both process granularity and * no-internal-process constraint are ignored and a threaded * controllers should be able to handle that. * * Note that as an implicit controller is automatically enabled on * all cgroups on the default hierarchy, it should also be * threaded. implicit && !threaded is not supported. */ bool threaded:1; /* the following two fields are initialized automatically during boot */ int id; const char *name; /* optional, initialized automatically during boot if not set */ const char *legacy_name; /* link to parent, protected by cgroup_lock() */ struct cgroup_root *root; /* idr for css->id */ struct idr css_idr; /* * List of cftypes. Each entry is the first entry of an array * terminated by zero length name. */ struct list_head cfts; /* * Base cftypes which are automatically registered. The two can * point to the same array. */ struct cftype *dfl_cftypes; /* for the default hierarchy */ struct cftype *legacy_cftypes; /* for the legacy hierarchies */ /* * A subsystem may depend on other subsystems. When such subsystem * is enabled on a cgroup, the depended-upon subsystems are enabled * together if available. Subsystems enabled due to dependency are * not visible to userland until explicitly enabled. The following * specifies the mask of subsystems that this one depends on. */ unsigned int depends_on; }; extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem; /** * cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups * @tsk: target task * * Allows cgroup operations to synchronize against threadgroup changes * using a percpu_rw_semaphore. */ static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) { percpu_down_read(&cgroup_threadgroup_rwsem); } /** * cgroup_threadgroup_change_end - threadgroup exclusion for cgroups * @tsk: target task * * Counterpart of cgroup_threadcgroup_change_begin(). */ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) { percpu_up_read(&cgroup_threadgroup_rwsem); } #else /* CONFIG_CGROUPS */ #define CGROUP_SUBSYS_COUNT 0 static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) { might_sleep(); } static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {} #endif /* CONFIG_CGROUPS */ #ifdef CONFIG_SOCK_CGROUP_DATA /* * sock_cgroup_data is embedded at sock->sk_cgrp_data and contains * per-socket cgroup information except for memcg association. * * On legacy hierarchies, net_prio and net_cls controllers directly * set attributes on each sock which can then be tested by the network * layer. On the default hierarchy, each sock is associated with the * cgroup it was created in and the networking layer can match the * cgroup directly. */ struct sock_cgroup_data { struct cgroup *cgroup; /* v2 */ #ifdef CONFIG_CGROUP_NET_CLASSID u32 classid; /* v1 */ #endif #ifdef CONFIG_CGROUP_NET_PRIO u16 prioidx; /* v1 */ #endif }; static inline u16 sock_cgroup_prioidx(const struct sock_cgroup_data *skcd) { #ifdef CONFIG_CGROUP_NET_PRIO return READ_ONCE(skcd->prioidx); #else return 1; #endif } static inline u32 sock_cgroup_classid(const struct sock_cgroup_data *skcd) { #ifdef CONFIG_CGROUP_NET_CLASSID return READ_ONCE(skcd->classid); #else return 0; #endif } static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd, u16 prioidx) { #ifdef CONFIG_CGROUP_NET_PRIO WRITE_ONCE(skcd->prioidx, prioidx); #endif } static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd, u32 classid) { #ifdef CONFIG_CGROUP_NET_CLASSID WRITE_ONCE(skcd->classid, classid); #endif } #else /* CONFIG_SOCK_CGROUP_DATA */ struct sock_cgroup_data { }; #endif /* CONFIG_SOCK_CGROUP_DATA */ #endif /* _LINUX_CGROUP_DEFS_H */
13 41 41 41 13 13 13 4 13 13 4 1 4 5 4 3 3 3 1 9 8 8 8 8 8 3 8 9 3 3 1 1 2 1 24 22 28 1 3 6 2 2 3 2 17 31 31 28 12 12 11 11 11 11 12 2 2 2 2 2 2 13 13 13 13 13 13 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 // SPDX-License-Identifier: GPL-2.0-only /* * net/dccp/proto.c * * An implementation of the DCCP protocol * Arnaldo Carvalho de Melo <acme@conectiva.com.br> */ #include <linux/dccp.h> #include <linux/module.h> #include <linux/types.h> #include <linux/sched.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/in.h> #include <linux/if_arp.h> #include <linux/init.h> #include <linux/random.h> #include <linux/slab.h> #include <net/checksum.h> #include <net/inet_sock.h> #include <net/inet_common.h> #include <net/sock.h> #include <net/xfrm.h> #include <asm/ioctls.h> #include <linux/spinlock.h> #include <linux/timer.h> #include <linux/delay.h> #include <linux/poll.h> #include "ccid.h" #include "dccp.h" #include "feat.h" #define CREATE_TRACE_POINTS #include "trace.h" DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly; EXPORT_SYMBOL_GPL(dccp_statistics); DEFINE_PER_CPU(unsigned int, dccp_orphan_count); EXPORT_PER_CPU_SYMBOL_GPL(dccp_orphan_count); struct inet_hashinfo dccp_hashinfo; EXPORT_SYMBOL_GPL(dccp_hashinfo); /* the maximum queue length for tx in packets. 0 is no limit */ int sysctl_dccp_tx_qlen __read_mostly = 5; #ifdef CONFIG_IP_DCCP_DEBUG static const char *dccp_state_name(const int state) { static const char *const dccp_state_names[] = { [DCCP_OPEN] = "OPEN", [DCCP_REQUESTING] = "REQUESTING", [DCCP_PARTOPEN] = "PARTOPEN", [DCCP_LISTEN] = "LISTEN", [DCCP_RESPOND] = "RESPOND", [DCCP_CLOSING] = "CLOSING", [DCCP_ACTIVE_CLOSEREQ] = "CLOSEREQ", [DCCP_PASSIVE_CLOSE] = "PASSIVE_CLOSE", [DCCP_PASSIVE_CLOSEREQ] = "PASSIVE_CLOSEREQ", [DCCP_TIME_WAIT] = "TIME_WAIT", [DCCP_CLOSED] = "CLOSED", }; if (state >= DCCP_MAX_STATES) return "INVALID STATE!"; else return dccp_state_names[state]; } #endif void dccp_set_state(struct sock *sk, const int state) { const int oldstate = sk->sk_state; dccp_pr_debug("%s(%p) %s --> %s\n", dccp_role(sk), sk, dccp_state_name(oldstate), dccp_state_name(state)); WARN_ON(state == oldstate); switch (state) { case DCCP_OPEN: if (oldstate != DCCP_OPEN) DCCP_INC_STATS(DCCP_MIB_CURRESTAB); /* Client retransmits all Confirm options until entering OPEN */ if (oldstate == DCCP_PARTOPEN) dccp_feat_list_purge(&dccp_sk(sk)->dccps_featneg); break; case DCCP_CLOSED: if (oldstate == DCCP_OPEN || oldstate == DCCP_ACTIVE_CLOSEREQ || oldstate == DCCP_CLOSING) DCCP_INC_STATS(DCCP_MIB_ESTABRESETS); sk->sk_prot->unhash(sk); if (inet_csk(sk)->icsk_bind_hash != NULL && !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) inet_put_port(sk); fallthrough; default: if (oldstate == DCCP_OPEN) DCCP_DEC_STATS(DCCP_MIB_CURRESTAB); } /* Change state AFTER socket is unhashed to avoid closed * socket sitting in hash tables. */ inet_sk_set_state(sk, state); } EXPORT_SYMBOL_GPL(dccp_set_state); static void dccp_finish_passive_close(struct sock *sk) { switch (sk->sk_state) { case DCCP_PASSIVE_CLOSE: /* Node (client or server) has received Close packet. */ dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED); dccp_set_state(sk, DCCP_CLOSED); break; case DCCP_PASSIVE_CLOSEREQ: /* * Client received CloseReq. We set the `active' flag so that * dccp_send_close() retransmits the Close as per RFC 4340, 8.3. */ dccp_send_close(sk, 1); dccp_set_state(sk, DCCP_CLOSING); } } void dccp_done(struct sock *sk) { dccp_set_state(sk, DCCP_CLOSED); dccp_clear_xmit_timers(sk); sk->sk_shutdown = SHUTDOWN_MASK; if (!sock_flag(sk, SOCK_DEAD)) sk->sk_state_change(sk); else inet_csk_destroy_sock(sk); } EXPORT_SYMBOL_GPL(dccp_done); const char *dccp_packet_name(const int type) { static const char *const dccp_packet_names[] = { [DCCP_PKT_REQUEST] = "REQUEST", [DCCP_PKT_RESPONSE] = "RESPONSE", [DCCP_PKT_DATA] = "DATA", [DCCP_PKT_ACK] = "ACK", [DCCP_PKT_DATAACK] = "DATAACK", [DCCP_PKT_CLOSEREQ] = "CLOSEREQ", [DCCP_PKT_CLOSE] = "CLOSE", [DCCP_PKT_RESET] = "RESET", [DCCP_PKT_SYNC] = "SYNC", [DCCP_PKT_SYNCACK] = "SYNCACK", }; if (type >= DCCP_NR_PKT_TYPES) return "INVALID"; else return dccp_packet_names[type]; } EXPORT_SYMBOL_GPL(dccp_packet_name); void dccp_destruct_common(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); dp->dccps_hc_tx_ccid = NULL; } EXPORT_SYMBOL_GPL(dccp_destruct_common); static void dccp_sk_destruct(struct sock *sk) { dccp_destruct_common(sk); inet_sock_destruct(sk); } int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) { struct dccp_sock *dp = dccp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); pr_warn_once("DCCP is deprecated and scheduled to be removed in 2025, " "please contact the netdev mailing list\n"); icsk->icsk_rto = DCCP_TIMEOUT_INIT; icsk->icsk_syn_retries = sysctl_dccp_request_retries; sk->sk_state = DCCP_CLOSED; sk->sk_write_space = dccp_write_space; sk->sk_destruct = dccp_sk_destruct; icsk->icsk_sync_mss = dccp_sync_mss; dp->dccps_mss_cache = 536; dp->dccps_rate_last = jiffies; dp->dccps_role = DCCP_ROLE_UNDEFINED; dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT; dp->dccps_tx_qlen = sysctl_dccp_tx_qlen; dccp_init_xmit_timers(sk); INIT_LIST_HEAD(&dp->dccps_featneg); /* control socket doesn't need feat nego */ if (likely(ctl_sock_initialized)) return dccp_feat_init(sk); return 0; } EXPORT_SYMBOL_GPL(dccp_init_sock); void dccp_destroy_sock(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); __skb_queue_purge(&sk->sk_write_queue); if (sk->sk_send_head != NULL) { kfree_skb(sk->sk_send_head); sk->sk_send_head = NULL; } /* Clean up a referenced DCCP bind bucket. */ if (inet_csk(sk)->icsk_bind_hash != NULL) inet_put_port(sk); kfree(dp->dccps_service_list); dp->dccps_service_list = NULL; if (dp->dccps_hc_rx_ackvec != NULL) { dccp_ackvec_free(dp->dccps_hc_rx_ackvec); dp->dccps_hc_rx_ackvec = NULL; } ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); dp->dccps_hc_rx_ccid = NULL; /* clean up feature negotiation state */ dccp_feat_list_purge(&dp->dccps_featneg); } EXPORT_SYMBOL_GPL(dccp_destroy_sock); static inline int dccp_need_reset(int state) { return state != DCCP_CLOSED && state != DCCP_LISTEN && state != DCCP_REQUESTING; } int dccp_disconnect(struct sock *sk, int flags) { struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); struct dccp_sock *dp = dccp_sk(sk); const int old_state = sk->sk_state; if (old_state != DCCP_CLOSED) dccp_set_state(sk, DCCP_CLOSED); /* * This corresponds to the ABORT function of RFC793, sec. 3.8 * TCP uses a RST segment, DCCP a Reset packet with Code 2, "Aborted". */ if (old_state == DCCP_LISTEN) { inet_csk_listen_stop(sk); } else if (dccp_need_reset(old_state)) { dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); sk->sk_err = ECONNRESET; } else if (old_state == DCCP_REQUESTING) sk->sk_err = ECONNRESET; dccp_clear_xmit_timers(sk); ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); dp->dccps_hc_rx_ccid = NULL; __skb_queue_purge(&sk->sk_receive_queue); __skb_queue_purge(&sk->sk_write_queue); if (sk->sk_send_head != NULL) { __kfree_skb(sk->sk_send_head); sk->sk_send_head = NULL; } inet->inet_dport = 0; inet_bhash2_reset_saddr(sk); sk->sk_shutdown = 0; sock_reset_flag(sk, SOCK_DONE); icsk->icsk_backoff = 0; inet_csk_delack_init(sk); __sk_dst_reset(sk); WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); sk_error_report(sk); return 0; } EXPORT_SYMBOL_GPL(dccp_disconnect); /* * Wait for a DCCP event. * * Note that we don't need to lock the socket, as the upper poll layers * take care of normal races (between the test and the event) and we don't * go look at any of the socket buffers directly. */ __poll_t dccp_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; __poll_t mask; u8 shutdown; int state; sock_poll_wait(file, sock, wait); state = inet_sk_state_load(sk); if (state == DCCP_LISTEN) return inet_csk_listen_poll(sk); /* Socket is not locked. We are protected from async events by poll logic and correct handling of state changes made by another threads is impossible in any case. */ mask = 0; if (READ_ONCE(sk->sk_err)) mask = EPOLLERR; shutdown = READ_ONCE(sk->sk_shutdown); if (shutdown == SHUTDOWN_MASK || state == DCCP_CLOSED) mask |= EPOLLHUP; if (shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; /* Connected? */ if ((1 << state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) { if (atomic_read(&sk->sk_rmem_alloc) > 0) mask |= EPOLLIN | EPOLLRDNORM; if (!(shutdown & SEND_SHUTDOWN)) { if (sk_stream_is_writeable(sk)) { mask |= EPOLLOUT | EPOLLWRNORM; } else { /* send SIGIO later */ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); /* Race breaker. If space is freed after * wspace test but before the flags are set, * IO signal will be lost. */ if (sk_stream_is_writeable(sk)) mask |= EPOLLOUT | EPOLLWRNORM; } } } return mask; } EXPORT_SYMBOL_GPL(dccp_poll); int dccp_ioctl(struct sock *sk, int cmd, int *karg) { int rc = -ENOTCONN; lock_sock(sk); if (sk->sk_state == DCCP_LISTEN) goto out; switch (cmd) { case SIOCOUTQ: { *karg = sk_wmem_alloc_get(sk); /* Using sk_wmem_alloc here because sk_wmem_queued is not used by DCCP and * always 0, comparably to UDP. */ rc = 0; } break; case SIOCINQ: { struct sk_buff *skb; *karg = 0; skb = skb_peek(&sk->sk_receive_queue); if (skb != NULL) { /* * We will only return the amount of this packet since * that is all that will be read. */ *karg = skb->len; } rc = 0; } break; default: rc = -ENOIOCTLCMD; break; } out: release_sock(sk); return rc; } EXPORT_SYMBOL_GPL(dccp_ioctl); static int dccp_setsockopt_service(struct sock *sk, const __be32 service, sockptr_t optval, unsigned int optlen) { struct dccp_sock *dp = dccp_sk(sk); struct dccp_service_list *sl = NULL; if (service == DCCP_SERVICE_INVALID_VALUE || optlen > DCCP_SERVICE_LIST_MAX_LEN * sizeof(u32)) return -EINVAL; if (optlen > sizeof(service)) { sl = kmalloc(optlen, GFP_KERNEL); if (sl == NULL) return -ENOMEM; sl->dccpsl_nr = optlen / sizeof(u32) - 1; if (copy_from_sockptr_offset(sl->dccpsl_list, optval, sizeof(service), optlen - sizeof(service)) || dccp_list_has_service(sl, DCCP_SERVICE_INVALID_VALUE)) { kfree(sl); return -EFAULT; } } lock_sock(sk); dp->dccps_service = service; kfree(dp->dccps_service_list); dp->dccps_service_list = sl; release_sock(sk); return 0; } static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx) { u8 *list, len; int i, rc; if (cscov < 0 || cscov > 15) return -EINVAL; /* * Populate a list of permissible values, in the range cscov...15. This * is necessary since feature negotiation of single values only works if * both sides incidentally choose the same value. Since the list starts * lowest-value first, negotiation will pick the smallest shared value. */ if (cscov == 0) return 0; len = 16 - cscov; list = kmalloc(len, GFP_KERNEL); if (list == NULL) return -ENOBUFS; for (i = 0; i < len; i++) list[i] = cscov++; rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len); if (rc == 0) { if (rx) dccp_sk(sk)->dccps_pcrlen = cscov; else dccp_sk(sk)->dccps_pcslen = cscov; } kfree(list); return rc; } static int dccp_setsockopt_ccid(struct sock *sk, int type, sockptr_t optval, unsigned int optlen) { u8 *val; int rc = 0; if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS) return -EINVAL; val = memdup_sockptr(optval, optlen); if (IS_ERR(val)) return PTR_ERR(val); lock_sock(sk); if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID) rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen); if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID)) rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen); release_sock(sk); kfree(val); return rc; } static int do_dccp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct dccp_sock *dp = dccp_sk(sk); int val, err = 0; switch (optname) { case DCCP_SOCKOPT_PACKET_SIZE: DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); return 0; case DCCP_SOCKOPT_CHANGE_L: case DCCP_SOCKOPT_CHANGE_R: DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n"); return 0; case DCCP_SOCKOPT_CCID: case DCCP_SOCKOPT_RX_CCID: case DCCP_SOCKOPT_TX_CCID: return dccp_setsockopt_ccid(sk, optname, optval, optlen); } if (optlen < (int)sizeof(int)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; if (optname == DCCP_SOCKOPT_SERVICE) return dccp_setsockopt_service(sk, val, optval, optlen); lock_sock(sk); switch (optname) { case DCCP_SOCKOPT_SERVER_TIMEWAIT: if (dp->dccps_role != DCCP_ROLE_SERVER) err = -EOPNOTSUPP; else dp->dccps_server_timewait = (val != 0); break; case DCCP_SOCKOPT_SEND_CSCOV: err = dccp_setsockopt_cscov(sk, val, false); break; case DCCP_SOCKOPT_RECV_CSCOV: err = dccp_setsockopt_cscov(sk, val, true); break; case DCCP_SOCKOPT_QPOLICY_ID: if (sk->sk_state != DCCP_CLOSED) err = -EISCONN; else if (val < 0 || val >= DCCPQ_POLICY_MAX) err = -EINVAL; else dp->dccps_qpolicy = val; break; case DCCP_SOCKOPT_QPOLICY_TXQLEN: if (val < 0) err = -EINVAL; else dp->dccps_tx_qlen = val; break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } int dccp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { if (level != SOL_DCCP) return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level, optname, optval, optlen); return do_dccp_setsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL_GPL(dccp_setsockopt); static int dccp_getsockopt_service(struct sock *sk, int len, __be32 __user *optval, int __user *optlen) { const struct dccp_sock *dp = dccp_sk(sk); const struct dccp_service_list *sl; int err = -ENOENT, slen = 0, total_len = sizeof(u32); lock_sock(sk); if ((sl = dp->dccps_service_list) != NULL) { slen = sl->dccpsl_nr * sizeof(u32); total_len += slen; } err = -EINVAL; if (total_len > len) goto out; err = 0; if (put_user(total_len, optlen) || put_user(dp->dccps_service, optval) || (sl != NULL && copy_to_user(optval + 1, sl->dccpsl_list, slen))) err = -EFAULT; out: release_sock(sk); return err; } static int do_dccp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { struct dccp_sock *dp; int val, len; if (get_user(len, optlen)) return -EFAULT; if (len < (int)sizeof(int)) return -EINVAL; dp = dccp_sk(sk); switch (optname) { case DCCP_SOCKOPT_PACKET_SIZE: DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); return 0; case DCCP_SOCKOPT_SERVICE: return dccp_getsockopt_service(sk, len, (__be32 __user *)optval, optlen); case DCCP_SOCKOPT_GET_CUR_MPS: val = READ_ONCE(dp->dccps_mss_cache); break; case DCCP_SOCKOPT_AVAILABLE_CCIDS: return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen); case DCCP_SOCKOPT_TX_CCID: val = ccid_get_current_tx_ccid(dp); if (val < 0) return -ENOPROTOOPT; break; case DCCP_SOCKOPT_RX_CCID: val = ccid_get_current_rx_ccid(dp); if (val < 0) return -ENOPROTOOPT; break; case DCCP_SOCKOPT_SERVER_TIMEWAIT: val = dp->dccps_server_timewait; break; case DCCP_SOCKOPT_SEND_CSCOV: val = dp->dccps_pcslen; break; case DCCP_SOCKOPT_RECV_CSCOV: val = dp->dccps_pcrlen; break; case DCCP_SOCKOPT_QPOLICY_ID: val = dp->dccps_qpolicy; break; case DCCP_SOCKOPT_QPOLICY_TXQLEN: val = dp->dccps_tx_qlen; break; case 128 ... 191: return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname, len, (u32 __user *)optval, optlen); case 192 ... 255: return ccid_hc_tx_getsockopt(dp->dccps_hc_tx_ccid, sk, optname, len, (u32 __user *)optval, optlen); default: return -ENOPROTOOPT; } len = sizeof(val); if (put_user(len, optlen) || copy_to_user(optval, &val, len)) return -EFAULT; return 0; } int dccp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { if (level != SOL_DCCP) return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level, optname, optval, optlen); return do_dccp_getsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL_GPL(dccp_getsockopt); static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb) { struct cmsghdr *cmsg; /* * Assign an (opaque) qpolicy priority value to skb->priority. * * We are overloading this skb field for use with the qpolicy subystem. * The skb->priority is normally used for the SO_PRIORITY option, which * is initialised from sk_priority. Since the assignment of sk_priority * to skb->priority happens later (on layer 3), we overload this field * for use with queueing priorities as long as the skb is on layer 4. * The default priority value (if nothing is set) is 0. */ skb->priority = 0; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; if (cmsg->cmsg_level != SOL_DCCP) continue; if (cmsg->cmsg_type <= DCCP_SCM_QPOLICY_MAX && !dccp_qpolicy_param_ok(skb->sk, cmsg->cmsg_type)) return -EINVAL; switch (cmsg->cmsg_type) { case DCCP_SCM_PRIORITY: if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32))) return -EINVAL; skb->priority = *(__u32 *)CMSG_DATA(cmsg); break; default: return -EINVAL; } } return 0; } int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { const struct dccp_sock *dp = dccp_sk(sk); const int flags = msg->msg_flags; const int noblock = flags & MSG_DONTWAIT; struct sk_buff *skb; int rc, size; long timeo; trace_dccp_probe(sk, len); if (len > READ_ONCE(dp->dccps_mss_cache)) return -EMSGSIZE; lock_sock(sk); timeo = sock_sndtimeo(sk, noblock); /* * We have to use sk_stream_wait_connect here to set sk_write_pending, * so that the trick in dccp_rcv_request_sent_state_process. */ /* Wait for a connection to finish. */ if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN)) if ((rc = sk_stream_wait_connect(sk, &timeo)) != 0) goto out_release; size = sk->sk_prot->max_header + len; release_sock(sk); skb = sock_alloc_send_skb(sk, size, noblock, &rc); lock_sock(sk); if (skb == NULL) goto out_release; if (dccp_qpolicy_full(sk)) { rc = -EAGAIN; goto out_discard; } if (sk->sk_state == DCCP_CLOSED) { rc = -ENOTCONN; goto out_discard; } /* We need to check dccps_mss_cache after socket is locked. */ if (len > dp->dccps_mss_cache) { rc = -EMSGSIZE; goto out_discard; } skb_reserve(skb, sk->sk_prot->max_header); rc = memcpy_from_msg(skb_put(skb, len), msg, len); if (rc != 0) goto out_discard; rc = dccp_msghdr_parse(msg, skb); if (rc != 0) goto out_discard; dccp_qpolicy_push(sk, skb); /* * The xmit_timer is set if the TX CCID is rate-based and will expire * when congestion control permits to release further packets into the * network. Window-based CCIDs do not use this timer. */ if (!timer_pending(&dp->dccps_xmit_timer)) dccp_write_xmit(sk); out_release: release_sock(sk); return rc ? : len; out_discard: kfree_skb(skb); goto out_release; } EXPORT_SYMBOL_GPL(dccp_sendmsg); int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { const struct dccp_hdr *dh; long timeo; lock_sock(sk); if (sk->sk_state == DCCP_LISTEN) { len = -ENOTCONN; goto out; } timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); if (skb == NULL) goto verify_sock_status; dh = dccp_hdr(skb); switch (dh->dccph_type) { case DCCP_PKT_DATA: case DCCP_PKT_DATAACK: goto found_ok_skb; case DCCP_PKT_CLOSE: case DCCP_PKT_CLOSEREQ: if (!(flags & MSG_PEEK)) dccp_finish_passive_close(sk); fallthrough; case DCCP_PKT_RESET: dccp_pr_debug("found fin (%s) ok!\n", dccp_packet_name(dh->dccph_type)); len = 0; goto found_fin_ok; default: dccp_pr_debug("packet_type=%s\n", dccp_packet_name(dh->dccph_type)); sk_eat_skb(sk, skb); } verify_sock_status: if (sock_flag(sk, SOCK_DONE)) { len = 0; break; } if (sk->sk_err) { len = sock_error(sk); break; } if (sk->sk_shutdown & RCV_SHUTDOWN) { len = 0; break; } if (sk->sk_state == DCCP_CLOSED) { if (!sock_flag(sk, SOCK_DONE)) { /* This occurs when user tries to read * from never connected socket. */ len = -ENOTCONN; break; } len = 0; break; } if (!timeo) { len = -EAGAIN; break; } if (signal_pending(current)) { len = sock_intr_errno(timeo); break; } sk_wait_data(sk, &timeo, NULL); continue; found_ok_skb: if (len > skb->len) len = skb->len; else if (len < skb->len) msg->msg_flags |= MSG_TRUNC; if (skb_copy_datagram_msg(skb, 0, msg, len)) { /* Exception. Bailout! */ len = -EFAULT; break; } if (flags & MSG_TRUNC) len = skb->len; found_fin_ok: if (!(flags & MSG_PEEK)) sk_eat_skb(sk, skb); break; } while (1); out: release_sock(sk); return len; } EXPORT_SYMBOL_GPL(dccp_recvmsg); int inet_dccp_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; unsigned char old_state; int err; lock_sock(sk); err = -EINVAL; if (sock->state != SS_UNCONNECTED || sock->type != SOCK_DCCP) goto out; old_state = sk->sk_state; if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN))) goto out; WRITE_ONCE(sk->sk_max_ack_backlog, backlog); /* Really, if the socket is already in listen state * we can only allow the backlog to be adjusted. */ if (old_state != DCCP_LISTEN) { struct dccp_sock *dp = dccp_sk(sk); dp->dccps_role = DCCP_ROLE_LISTEN; /* do not start to listen if feature negotiation setup fails */ if (dccp_feat_finalise_settings(dp)) { err = -EPROTO; goto out; } err = inet_csk_listen_start(sk); if (err) goto out; } err = 0; out: release_sock(sk); return err; } EXPORT_SYMBOL_GPL(inet_dccp_listen); static void dccp_terminate_connection(struct sock *sk) { u8 next_state = DCCP_CLOSED; switch (sk->sk_state) { case DCCP_PASSIVE_CLOSE: case DCCP_PASSIVE_CLOSEREQ: dccp_finish_passive_close(sk); break; case DCCP_PARTOPEN: dccp_pr_debug("Stop PARTOPEN timer (%p)\n", sk); inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); fallthrough; case DCCP_OPEN: dccp_send_close(sk, 1); if (dccp_sk(sk)->dccps_role == DCCP_ROLE_SERVER && !dccp_sk(sk)->dccps_server_timewait) next_state = DCCP_ACTIVE_CLOSEREQ; else next_state = DCCP_CLOSING; fallthrough; default: dccp_set_state(sk, next_state); } } void dccp_close(struct sock *sk, long timeout) { struct dccp_sock *dp = dccp_sk(sk); struct sk_buff *skb; u32 data_was_unread = 0; int state; lock_sock(sk); sk->sk_shutdown = SHUTDOWN_MASK; if (sk->sk_state == DCCP_LISTEN) { dccp_set_state(sk, DCCP_CLOSED); /* Special case. */ inet_csk_listen_stop(sk); goto adjudge_to_death; } sk_stop_timer(sk, &dp->dccps_xmit_timer); /* * We need to flush the recv. buffs. We do this only on the * descriptor close, not protocol-sourced closes, because the *reader process may not have drained the data yet! */ while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { data_was_unread += skb->len; __kfree_skb(skb); } /* If socket has been already reset kill it. */ if (sk->sk_state == DCCP_CLOSED) goto adjudge_to_death; if (data_was_unread) { /* Unread data was tossed, send an appropriate Reset Code */ DCCP_WARN("ABORT with %u bytes unread\n", data_was_unread); dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); dccp_set_state(sk, DCCP_CLOSED); } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { /* Check zero linger _after_ checking for unread data. */ sk->sk_prot->disconnect(sk, 0); } else if (sk->sk_state != DCCP_CLOSED) { /* * Normal connection termination. May need to wait if there are * still packets in the TX queue that are delayed by the CCID. */ dccp_flush_write_queue(sk, &timeout); dccp_terminate_connection(sk); } /* * Flush write queue. This may be necessary in several cases: * - we have been closed by the peer but still have application data; * - abortive termination (unread data or zero linger time), * - normal termination but queue could not be flushed within time limit */ __skb_queue_purge(&sk->sk_write_queue); sk_stream_wait_close(sk, timeout); adjudge_to_death: state = sk->sk_state; sock_hold(sk); sock_orphan(sk); /* * It is the last release_sock in its life. It will remove backlog. */ release_sock(sk); /* * Now socket is owned by kernel and we acquire BH lock * to finish close. No need to check for user refs. */ local_bh_disable(); bh_lock_sock(sk); WARN_ON(sock_owned_by_user(sk)); this_cpu_inc(dccp_orphan_count); /* Have we already been destroyed by a softirq or backlog? */ if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED) goto out; if (sk->sk_state == DCCP_CLOSED) inet_csk_destroy_sock(sk); /* Otherwise, socket is reprieved until protocol close. */ out: bh_unlock_sock(sk); local_bh_enable(); sock_put(sk); } EXPORT_SYMBOL_GPL(dccp_close); void dccp_shutdown(struct sock *sk, int how) { dccp_pr_debug("called shutdown(%x)\n", how); } EXPORT_SYMBOL_GPL(dccp_shutdown); static inline int __init dccp_mib_init(void) { dccp_statistics = alloc_percpu(struct dccp_mib); if (!dccp_statistics) return -ENOMEM; return 0; } static inline void dccp_mib_exit(void) { free_percpu(dccp_statistics); } static int thash_entries; module_param(thash_entries, int, 0444); MODULE_PARM_DESC(thash_entries, "Number of ehash buckets"); #ifdef CONFIG_IP_DCCP_DEBUG bool dccp_debug; module_param(dccp_debug, bool, 0644); MODULE_PARM_DESC(dccp_debug, "Enable debug messages"); EXPORT_SYMBOL_GPL(dccp_debug); #endif static int __init dccp_init(void) { unsigned long goal; unsigned long nr_pages = totalram_pages(); int ehash_order, bhash_order, i; int rc; BUILD_BUG_ON(sizeof(struct dccp_skb_cb) > sizeof_field(struct sk_buff, cb)); rc = inet_hashinfo2_init_mod(&dccp_hashinfo); if (rc) goto out_fail; rc = -ENOBUFS; dccp_hashinfo.bind_bucket_cachep = kmem_cache_create("dccp_bind_bucket", sizeof(struct inet_bind_bucket), 0, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); if (!dccp_hashinfo.bind_bucket_cachep) goto out_free_hashinfo2; dccp_hashinfo.bind2_bucket_cachep = kmem_cache_create("dccp_bind2_bucket", sizeof(struct inet_bind2_bucket), 0, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); if (!dccp_hashinfo.bind2_bucket_cachep) goto out_free_bind_bucket_cachep; /* * Size and allocate the main established and bind bucket * hash tables. * * The methodology is similar to that of the buffer cache. */ if (nr_pages >= (128 * 1024)) goal = nr_pages >> (21 - PAGE_SHIFT); else goal = nr_pages >> (23 - PAGE_SHIFT); if (thash_entries) goal = (thash_entries * sizeof(struct inet_ehash_bucket)) >> PAGE_SHIFT; for (ehash_order = 0; (1UL << ehash_order) < goal; ehash_order++) ; do { unsigned long hash_size = (1UL << ehash_order) * PAGE_SIZE / sizeof(struct inet_ehash_bucket); while (hash_size & (hash_size - 1)) hash_size--; dccp_hashinfo.ehash_mask = hash_size - 1; dccp_hashinfo.ehash = (struct inet_ehash_bucket *) __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, ehash_order); } while (!dccp_hashinfo.ehash && --ehash_order > 0); if (!dccp_hashinfo.ehash) { DCCP_CRIT("Failed to allocate DCCP established hash table"); goto out_free_bind2_bucket_cachep; } for (i = 0; i <= dccp_hashinfo.ehash_mask; i++) INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].chain, i); if (inet_ehash_locks_alloc(&dccp_hashinfo)) goto out_free_dccp_ehash; bhash_order = ehash_order; do { dccp_hashinfo.bhash_size = (1UL << bhash_order) * PAGE_SIZE / sizeof(struct inet_bind_hashbucket); if ((dccp_hashinfo.bhash_size > (64 * 1024)) && bhash_order > 0) continue; dccp_hashinfo.bhash = (struct inet_bind_hashbucket *) __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, bhash_order); } while (!dccp_hashinfo.bhash && --bhash_order >= 0); if (!dccp_hashinfo.bhash) { DCCP_CRIT("Failed to allocate DCCP bind hash table"); goto out_free_dccp_locks; } dccp_hashinfo.bhash2 = (struct inet_bind_hashbucket *) __get_free_pages(GFP_ATOMIC | __GFP_NOWARN, bhash_order); if (!dccp_hashinfo.bhash2) { DCCP_CRIT("Failed to allocate DCCP bind2 hash table"); goto out_free_dccp_bhash; } for (i = 0; i < dccp_hashinfo.bhash_size; i++) { spin_lock_init(&dccp_hashinfo.bhash[i].lock); INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain); spin_lock_init(&dccp_hashinfo.bhash2[i].lock); INIT_HLIST_HEAD(&dccp_hashinfo.bhash2[i].chain); } dccp_hashinfo.pernet = false; rc = dccp_mib_init(); if (rc) goto out_free_dccp_bhash2; rc = dccp_ackvec_init(); if (rc) goto out_free_dccp_mib; rc = dccp_sysctl_init(); if (rc) goto out_ackvec_exit; rc = ccid_initialize_builtins(); if (rc) goto out_sysctl_exit; dccp_timestamping_init(); return 0; out_sysctl_exit: dccp_sysctl_exit(); out_ackvec_exit: dccp_ackvec_exit(); out_free_dccp_mib: dccp_mib_exit(); out_free_dccp_bhash2: free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order); out_free_dccp_bhash: free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order); out_free_dccp_locks: inet_ehash_locks_free(&dccp_hashinfo); out_free_dccp_ehash: free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order); out_free_bind2_bucket_cachep: kmem_cache_destroy(dccp_hashinfo.bind2_bucket_cachep); out_free_bind_bucket_cachep: kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); out_free_hashinfo2: inet_hashinfo2_free_mod(&dccp_hashinfo); out_fail: dccp_hashinfo.bhash = NULL; dccp_hashinfo.bhash2 = NULL; dccp_hashinfo.ehash = NULL; dccp_hashinfo.bind_bucket_cachep = NULL; dccp_hashinfo.bind2_bucket_cachep = NULL; return rc; } static void __exit dccp_fini(void) { int bhash_order = get_order(dccp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket)); ccid_cleanup_builtins(); dccp_mib_exit(); free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order); free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order); free_pages((unsigned long)dccp_hashinfo.ehash, get_order((dccp_hashinfo.ehash_mask + 1) * sizeof(struct inet_ehash_bucket))); inet_ehash_locks_free(&dccp_hashinfo); kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); dccp_ackvec_exit(); dccp_sysctl_exit(); inet_hashinfo2_free_mod(&dccp_hashinfo); } module_init(dccp_init); module_exit(dccp_fini); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@conectiva.com.br>"); MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol");
1066 1067 1023 1023 1015 1061 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 // SPDX-License-Identifier: GPL-2.0 /* * bus.c - bus driver management * * Copyright (c) 2002-3 Patrick Mochel * Copyright (c) 2002-3 Open Source Development Labs * Copyright (c) 2007 Greg Kroah-Hartman <gregkh@suse.de> * Copyright (c) 2007 Novell Inc. * Copyright (c) 2023 Greg Kroah-Hartman <gregkh@linuxfoundation.org> */ #include <linux/async.h> #include <linux/device/bus.h> #include <linux/device.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/string.h> #include <linux/mutex.h> #include <linux/sysfs.h> #include "base.h" #include "power/power.h" /* /sys/devices/system */ static struct kset *system_kset; /* /sys/bus */ static struct kset *bus_kset; #define to_bus_attr(_attr) container_of(_attr, struct bus_attribute, attr) /* * sysfs bindings for drivers */ #define to_drv_attr(_attr) container_of(_attr, struct driver_attribute, attr) #define DRIVER_ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) \ struct driver_attribute driver_attr_##_name = \ __ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) static int __must_check bus_rescan_devices_helper(struct device *dev, void *data); /** * bus_to_subsys - Turn a struct bus_type into a struct subsys_private * * @bus: pointer to the struct bus_type to look up * * The driver core internals needs to work on the subsys_private structure, not * the external struct bus_type pointer. This function walks the list of * registered busses in the system and finds the matching one and returns the * internal struct subsys_private that relates to that bus. * * Note, the reference count of the return value is INCREMENTED if it is not * NULL. A call to subsys_put() must be done when finished with the pointer in * order for it to be properly freed. */ static struct subsys_private *bus_to_subsys(const struct bus_type *bus) { struct subsys_private *sp = NULL; struct kobject *kobj; if (!bus || !bus_kset) return NULL; spin_lock(&bus_kset->list_lock); if (list_empty(&bus_kset->list)) goto done; list_for_each_entry(kobj, &bus_kset->list, entry) { struct kset *kset = container_of(kobj, struct kset, kobj); sp = container_of_const(kset, struct subsys_private, subsys); if (sp->bus == bus) goto done; } sp = NULL; done: sp = subsys_get(sp); spin_unlock(&bus_kset->list_lock); return sp; } static const struct bus_type *bus_get(const struct bus_type *bus) { struct subsys_private *sp = bus_to_subsys(bus); if (sp) return bus; return NULL; } static void bus_put(const struct bus_type *bus) { struct subsys_private *sp = bus_to_subsys(bus); /* two puts are required as the call to bus_to_subsys incremented it again */ subsys_put(sp); subsys_put(sp); } static ssize_t drv_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct driver_attribute *drv_attr = to_drv_attr(attr); struct driver_private *drv_priv = to_driver(kobj); ssize_t ret = -EIO; if (drv_attr->show) ret = drv_attr->show(drv_priv->driver, buf); return ret; } static ssize_t drv_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct driver_attribute *drv_attr = to_drv_attr(attr); struct driver_private *drv_priv = to_driver(kobj); ssize_t ret = -EIO; if (drv_attr->store) ret = drv_attr->store(drv_priv->driver, buf, count); return ret; } static const struct sysfs_ops driver_sysfs_ops = { .show = drv_attr_show, .store = drv_attr_store, }; static void driver_release(struct kobject *kobj) { struct driver_private *drv_priv = to_driver(kobj); pr_debug("driver: '%s': %s\n", kobject_name(kobj), __func__); kfree(drv_priv); } static const struct kobj_type driver_ktype = { .sysfs_ops = &driver_sysfs_ops, .release = driver_release, }; /* * sysfs bindings for buses */ static ssize_t bus_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct bus_attribute *bus_attr = to_bus_attr(attr); struct subsys_private *subsys_priv = to_subsys_private(kobj); ssize_t ret = 0; if (bus_attr->show) ret = bus_attr->show(subsys_priv->bus, buf); return ret; } static ssize_t bus_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct bus_attribute *bus_attr = to_bus_attr(attr); struct subsys_private *subsys_priv = to_subsys_private(kobj); ssize_t ret = 0; if (bus_attr->store) ret = bus_attr->store(subsys_priv->bus, buf, count); return ret; } static const struct sysfs_ops bus_sysfs_ops = { .show = bus_attr_show, .store = bus_attr_store, }; int bus_create_file(const struct bus_type *bus, struct bus_attribute *attr) { struct subsys_private *sp = bus_to_subsys(bus); int error; if (!sp) return -EINVAL; error = sysfs_create_file(&sp->subsys.kobj, &attr->attr); subsys_put(sp); return error; } EXPORT_SYMBOL_GPL(bus_create_file); void bus_remove_file(const struct bus_type *bus, struct bus_attribute *attr) { struct subsys_private *sp = bus_to_subsys(bus); if (!sp) return; sysfs_remove_file(&sp->subsys.kobj, &attr->attr); subsys_put(sp); } EXPORT_SYMBOL_GPL(bus_remove_file); static void bus_release(struct kobject *kobj) { struct subsys_private *priv = to_subsys_private(kobj); lockdep_unregister_key(&priv->lock_key); kfree(priv); } static const struct kobj_type bus_ktype = { .sysfs_ops = &bus_sysfs_ops, .release = bus_release, }; static int bus_uevent_filter(const struct kobject *kobj) { const struct kobj_type *ktype = get_ktype(kobj); if (ktype == &bus_ktype) return 1; return 0; } static const struct kset_uevent_ops bus_uevent_ops = { .filter = bus_uevent_filter, }; /* Manually detach a device from its associated driver. */ static ssize_t unbind_store(struct device_driver *drv, const char *buf, size_t count) { const struct bus_type *bus = bus_get(drv->bus); struct device *dev; int err = -ENODEV; dev = bus_find_device_by_name(bus, NULL, buf); if (dev && dev->driver == drv) { device_driver_detach(dev); err = count; } put_device(dev); bus_put(bus); return err; } static DRIVER_ATTR_IGNORE_LOCKDEP(unbind, 0200, NULL, unbind_store); /* * Manually attach a device to a driver. * Note: the driver must want to bind to the device, * it is not possible to override the driver's id table. */ static ssize_t bind_store(struct device_driver *drv, const char *buf, size_t count) { const struct bus_type *bus = bus_get(drv->bus); struct device *dev; int err = -ENODEV; dev = bus_find_device_by_name(bus, NULL, buf); if (dev && driver_match_device(drv, dev)) { err = device_driver_attach(drv, dev); if (!err) { /* success */ err = count; } } put_device(dev); bus_put(bus); return err; } static DRIVER_ATTR_IGNORE_LOCKDEP(bind, 0200, NULL, bind_store); static ssize_t drivers_autoprobe_show(const struct bus_type *bus, char *buf) { struct subsys_private *sp = bus_to_subsys(bus); int ret; if (!sp) return -EINVAL; ret = sysfs_emit(buf, "%d\n", sp->drivers_autoprobe); subsys_put(sp); return ret; } static ssize_t drivers_autoprobe_store(const struct bus_type *bus, const char *buf, size_t count) { struct subsys_private *sp = bus_to_subsys(bus); if (!sp) return -EINVAL; if (buf[0] == '0') sp->drivers_autoprobe = 0; else sp->drivers_autoprobe = 1; subsys_put(sp); return count; } static ssize_t drivers_probe_store(const struct bus_type *bus, const char *buf, size_t count) { struct device *dev; int err = -EINVAL; dev = bus_find_device_by_name(bus, NULL, buf); if (!dev) return -ENODEV; if (bus_rescan_devices_helper(dev, NULL) == 0) err = count; put_device(dev); return err; } static struct device *next_device(struct klist_iter *i) { struct klist_node *n = klist_next(i); struct device *dev = NULL; struct device_private *dev_prv; if (n) { dev_prv = to_device_private_bus(n); dev = dev_prv->device; } return dev; } /** * bus_for_each_dev - device iterator. * @bus: bus type. * @start: device to start iterating from. * @data: data for the callback. * @fn: function to be called for each device. * * Iterate over @bus's list of devices, and call @fn for each, * passing it @data. If @start is not NULL, we use that device to * begin iterating from. * * We check the return of @fn each time. If it returns anything * other than 0, we break out and return that value. * * NOTE: The device that returns a non-zero value is not retained * in any way, nor is its refcount incremented. If the caller needs * to retain this data, it should do so, and increment the reference * count in the supplied callback. */ int bus_for_each_dev(const struct bus_type *bus, struct device *start, void *data, int (*fn)(struct device *, void *)) { struct subsys_private *sp = bus_to_subsys(bus); struct klist_iter i; struct device *dev; int error = 0; if (!sp) return -EINVAL; klist_iter_init_node(&sp->klist_devices, &i, (start ? &start->p->knode_bus : NULL)); while (!error && (dev = next_device(&i))) error = fn(dev, data); klist_iter_exit(&i); subsys_put(sp); return error; } EXPORT_SYMBOL_GPL(bus_for_each_dev); /** * bus_find_device - device iterator for locating a particular device. * @bus: bus type * @start: Device to begin with * @data: Data to pass to match function * @match: Callback function to check device * * This is similar to the bus_for_each_dev() function above, but it * returns a reference to a device that is 'found' for later use, as * determined by the @match callback. * * The callback should return 0 if the device doesn't match and non-zero * if it does. If the callback returns non-zero, this function will * return to the caller and not iterate over any more devices. */ struct device *bus_find_device(const struct bus_type *bus, struct device *start, const void *data, int (*match)(struct device *dev, const void *data)) { struct subsys_private *sp = bus_to_subsys(bus); struct klist_iter i; struct device *dev; if (!sp) return NULL; klist_iter_init_node(&sp->klist_devices, &i, (start ? &start->p->knode_bus : NULL)); while ((dev = next_device(&i))) if (match(dev, data) && get_device(dev)) break; klist_iter_exit(&i); subsys_put(sp); return dev; } EXPORT_SYMBOL_GPL(bus_find_device); static struct device_driver *next_driver(struct klist_iter *i) { struct klist_node *n = klist_next(i); struct driver_private *drv_priv; if (n) { drv_priv = container_of(n, struct driver_private, knode_bus); return drv_priv->driver; } return NULL; } /** * bus_for_each_drv - driver iterator * @bus: bus we're dealing with. * @start: driver to start iterating on. * @data: data to pass to the callback. * @fn: function to call for each driver. * * This is nearly identical to the device iterator above. * We iterate over each driver that belongs to @bus, and call * @fn for each. If @fn returns anything but 0, we break out * and return it. If @start is not NULL, we use it as the head * of the list. * * NOTE: we don't return the driver that returns a non-zero * value, nor do we leave the reference count incremented for that * driver. If the caller needs to know that info, it must set it * in the callback. It must also be sure to increment the refcount * so it doesn't disappear before returning to the caller. */ int bus_for_each_drv(const struct bus_type *bus, struct device_driver *start, void *data, int (*fn)(struct device_driver *, void *)) { struct subsys_private *sp = bus_to_subsys(bus); struct klist_iter i; struct device_driver *drv; int error = 0; if (!sp) return -EINVAL; klist_iter_init_node(&sp->klist_drivers, &i, start ? &start->p->knode_bus : NULL); while ((drv = next_driver(&i)) && !error) error = fn(drv, data); klist_iter_exit(&i); subsys_put(sp); return error; } EXPORT_SYMBOL_GPL(bus_for_each_drv); /** * bus_add_device - add device to bus * @dev: device being added * * - Add device's bus attributes. * - Create links to device's bus. * - Add the device to its bus's list of devices. */ int bus_add_device(struct device *dev) { struct subsys_private *sp = bus_to_subsys(dev->bus); int error; if (!sp) { /* * This is a normal operation for many devices that do not * have a bus assigned to them, just say that all went * well. */ return 0; } /* * Reference in sp is now incremented and will be dropped when * the device is removed from the bus */ pr_debug("bus: '%s': add device %s\n", sp->bus->name, dev_name(dev)); error = device_add_groups(dev, sp->bus->dev_groups); if (error) goto out_put; error = sysfs_create_link(&sp->devices_kset->kobj, &dev->kobj, dev_name(dev)); if (error) goto out_groups; error = sysfs_create_link(&dev->kobj, &sp->subsys.kobj, "subsystem"); if (error) goto out_subsys; klist_add_tail(&dev->p->knode_bus, &sp->klist_devices); return 0; out_subsys: sysfs_remove_link(&sp->devices_kset->kobj, dev_name(dev)); out_groups: device_remove_groups(dev, sp->bus->dev_groups); out_put: subsys_put(sp); return error; } /** * bus_probe_device - probe drivers for a new device * @dev: device to probe * * - Automatically probe for a driver if the bus allows it. */ void bus_probe_device(struct device *dev) { struct subsys_private *sp = bus_to_subsys(dev->bus); struct subsys_interface *sif; if (!sp) return; if (sp->drivers_autoprobe) device_initial_probe(dev); mutex_lock(&sp->mutex); list_for_each_entry(sif, &sp->interfaces, node) if (sif->add_dev) sif->add_dev(dev, sif); mutex_unlock(&sp->mutex); subsys_put(sp); } /** * bus_remove_device - remove device from bus * @dev: device to be removed * * - Remove device from all interfaces. * - Remove symlink from bus' directory. * - Delete device from bus's list. * - Detach from its driver. * - Drop reference taken in bus_add_device(). */ void bus_remove_device(struct device *dev) { struct subsys_private *sp = bus_to_subsys(dev->bus); struct subsys_interface *sif; if (!sp) return; mutex_lock(&sp->mutex); list_for_each_entry(sif, &sp->interfaces, node) if (sif->remove_dev) sif->remove_dev(dev, sif); mutex_unlock(&sp->mutex); sysfs_remove_link(&dev->kobj, "subsystem"); sysfs_remove_link(&sp->devices_kset->kobj, dev_name(dev)); device_remove_groups(dev, dev->bus->dev_groups); if (klist_node_attached(&dev->p->knode_bus)) klist_del(&dev->p->knode_bus); pr_debug("bus: '%s': remove device %s\n", dev->bus->name, dev_name(dev)); device_release_driver(dev); /* * Decrement the reference count twice, once for the bus_to_subsys() * call in the start of this function, and the second one from the * reference increment in bus_add_device() */ subsys_put(sp); subsys_put(sp); } static int __must_check add_bind_files(struct device_driver *drv) { int ret; ret = driver_create_file(drv, &driver_attr_unbind); if (ret == 0) { ret = driver_create_file(drv, &driver_attr_bind); if (ret) driver_remove_file(drv, &driver_attr_unbind); } return ret; } static void remove_bind_files(struct device_driver *drv) { driver_remove_file(drv, &driver_attr_bind); driver_remove_file(drv, &driver_attr_unbind); } static BUS_ATTR_WO(drivers_probe); static BUS_ATTR_RW(drivers_autoprobe); static int add_probe_files(const struct bus_type *bus) { int retval; retval = bus_create_file(bus, &bus_attr_drivers_probe); if (retval) goto out; retval = bus_create_file(bus, &bus_attr_drivers_autoprobe); if (retval) bus_remove_file(bus, &bus_attr_drivers_probe); out: return retval; } static void remove_probe_files(const struct bus_type *bus) { bus_remove_file(bus, &bus_attr_drivers_autoprobe); bus_remove_file(bus, &bus_attr_drivers_probe); } static ssize_t uevent_store(struct device_driver *drv, const char *buf, size_t count) { int rc; rc = kobject_synth_uevent(&drv->p->kobj, buf, count); return rc ? rc : count; } static DRIVER_ATTR_WO(uevent); /** * bus_add_driver - Add a driver to the bus. * @drv: driver. */ int bus_add_driver(struct device_driver *drv) { struct subsys_private *sp = bus_to_subsys(drv->bus); struct driver_private *priv; int error = 0; if (!sp) return -EINVAL; /* * Reference in sp is now incremented and will be dropped when * the driver is removed from the bus */ pr_debug("bus: '%s': add driver %s\n", sp->bus->name, drv->name); priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) { error = -ENOMEM; goto out_put_bus; } klist_init(&priv->klist_devices, NULL, NULL); priv->driver = drv; drv->p = priv; priv->kobj.kset = sp->drivers_kset; error = kobject_init_and_add(&priv->kobj, &driver_ktype, NULL, "%s", drv->name); if (error) goto out_unregister; klist_add_tail(&priv->knode_bus, &sp->klist_drivers); if (sp->drivers_autoprobe) { error = driver_attach(drv); if (error) goto out_del_list; } module_add_driver(drv->owner, drv); error = driver_create_file(drv, &driver_attr_uevent); if (error) { printk(KERN_ERR "%s: uevent attr (%s) failed\n", __func__, drv->name); } error = driver_add_groups(drv, sp->bus->drv_groups); if (error) { /* How the hell do we get out of this pickle? Give up */ printk(KERN_ERR "%s: driver_add_groups(%s) failed\n", __func__, drv->name); } if (!drv->suppress_bind_attrs) { error = add_bind_files(drv); if (error) { /* Ditto */ printk(KERN_ERR "%s: add_bind_files(%s) failed\n", __func__, drv->name); } } return 0; out_del_list: klist_del(&priv->knode_bus); out_unregister: kobject_put(&priv->kobj); /* drv->p is freed in driver_release() */ drv->p = NULL; out_put_bus: subsys_put(sp); return error; } /** * bus_remove_driver - delete driver from bus's knowledge. * @drv: driver. * * Detach the driver from the devices it controls, and remove * it from its bus's list of drivers. Finally, we drop the reference * to the bus we took in bus_add_driver(). */ void bus_remove_driver(struct device_driver *drv) { struct subsys_private *sp = bus_to_subsys(drv->bus); if (!sp) return; pr_debug("bus: '%s': remove driver %s\n", sp->bus->name, drv->name); if (!drv->suppress_bind_attrs) remove_bind_files(drv); driver_remove_groups(drv, sp->bus->drv_groups); driver_remove_file(drv, &driver_attr_uevent); klist_remove(&drv->p->knode_bus); driver_detach(drv); module_remove_driver(drv); kobject_put(&drv->p->kobj); /* * Decrement the reference count twice, once for the bus_to_subsys() * call in the start of this function, and the second one from the * reference increment in bus_add_driver() */ subsys_put(sp); subsys_put(sp); } /* Helper for bus_rescan_devices's iter */ static int __must_check bus_rescan_devices_helper(struct device *dev, void *data) { int ret = 0; if (!dev->driver) { if (dev->parent && dev->bus->need_parent_lock) device_lock(dev->parent); ret = device_attach(dev); if (dev->parent && dev->bus->need_parent_lock) device_unlock(dev->parent); } return ret < 0 ? ret : 0; } /** * bus_rescan_devices - rescan devices on the bus for possible drivers * @bus: the bus to scan. * * This function will look for devices on the bus with no driver * attached and rescan it against existing drivers to see if it matches * any by calling device_attach() for the unbound devices. */ int bus_rescan_devices(const struct bus_type *bus) { return bus_for_each_dev(bus, NULL, NULL, bus_rescan_devices_helper); } EXPORT_SYMBOL_GPL(bus_rescan_devices); /** * device_reprobe - remove driver for a device and probe for a new driver * @dev: the device to reprobe * * This function detaches the attached driver (if any) for the given * device and restarts the driver probing process. It is intended * to use if probing criteria changed during a devices lifetime and * driver attachment should change accordingly. */ int device_reprobe(struct device *dev) { if (dev->driver) device_driver_detach(dev); return bus_rescan_devices_helper(dev, NULL); } EXPORT_SYMBOL_GPL(device_reprobe); static void klist_devices_get(struct klist_node *n) { struct device_private *dev_prv = to_device_private_bus(n); struct device *dev = dev_prv->device; get_device(dev); } static void klist_devices_put(struct klist_node *n) { struct device_private *dev_prv = to_device_private_bus(n); struct device *dev = dev_prv->device; put_device(dev); } static ssize_t bus_uevent_store(const struct bus_type *bus, const char *buf, size_t count) { struct subsys_private *sp = bus_to_subsys(bus); int ret; if (!sp) return -EINVAL; ret = kobject_synth_uevent(&sp->subsys.kobj, buf, count); subsys_put(sp); if (ret) return ret; return count; } /* * "open code" the old BUS_ATTR() macro here. We want to use BUS_ATTR_WO() * here, but can not use it as earlier in the file we have * DEVICE_ATTR_WO(uevent), which would cause a clash with the with the store * function name. */ static struct bus_attribute bus_attr_uevent = __ATTR(uevent, 0200, NULL, bus_uevent_store); /** * bus_register - register a driver-core subsystem * @bus: bus to register * * Once we have that, we register the bus with the kobject * infrastructure, then register the children subsystems it has: * the devices and drivers that belong to the subsystem. */ int bus_register(const struct bus_type *bus) { int retval; struct subsys_private *priv; struct kobject *bus_kobj; struct lock_class_key *key; priv = kzalloc(sizeof(struct subsys_private), GFP_KERNEL); if (!priv) return -ENOMEM; priv->bus = bus; BLOCKING_INIT_NOTIFIER_HEAD(&priv->bus_notifier); bus_kobj = &priv->subsys.kobj; retval = kobject_set_name(bus_kobj, "%s", bus->name); if (retval) goto out; bus_kobj->kset = bus_kset; bus_kobj->ktype = &bus_ktype; priv->drivers_autoprobe = 1; retval = kset_register(&priv->subsys); if (retval) goto out; retval = bus_create_file(bus, &bus_attr_uevent); if (retval) goto bus_uevent_fail; priv->devices_kset = kset_create_and_add("devices", NULL, bus_kobj); if (!priv->devices_kset) { retval = -ENOMEM; goto bus_devices_fail; } priv->drivers_kset = kset_create_and_add("drivers", NULL, bus_kobj); if (!priv->drivers_kset) { retval = -ENOMEM; goto bus_drivers_fail; } INIT_LIST_HEAD(&priv->interfaces); key = &priv->lock_key; lockdep_register_key(key); __mutex_init(&priv->mutex, "subsys mutex", key); klist_init(&priv->klist_devices, klist_devices_get, klist_devices_put); klist_init(&priv->klist_drivers, NULL, NULL); retval = add_probe_files(bus); if (retval) goto bus_probe_files_fail; retval = sysfs_create_groups(bus_kobj, bus->bus_groups); if (retval) goto bus_groups_fail; pr_debug("bus: '%s': registered\n", bus->name); return 0; bus_groups_fail: remove_probe_files(bus); bus_probe_files_fail: kset_unregister(priv->drivers_kset); bus_drivers_fail: kset_unregister(priv->devices_kset); bus_devices_fail: bus_remove_file(bus, &bus_attr_uevent); bus_uevent_fail: kset_unregister(&priv->subsys); out: kfree(priv); return retval; } EXPORT_SYMBOL_GPL(bus_register); /** * bus_unregister - remove a bus from the system * @bus: bus. * * Unregister the child subsystems and the bus itself. * Finally, we call bus_put() to release the refcount */ void bus_unregister(const struct bus_type *bus) { struct subsys_private *sp = bus_to_subsys(bus); struct kobject *bus_kobj; if (!sp) return; pr_debug("bus: '%s': unregistering\n", bus->name); if (sp->dev_root) device_unregister(sp->dev_root); bus_kobj = &sp->subsys.kobj; sysfs_remove_groups(bus_kobj, bus->bus_groups); remove_probe_files(bus); bus_remove_file(bus, &bus_attr_uevent); kset_unregister(sp->drivers_kset); kset_unregister(sp->devices_kset); kset_unregister(&sp->subsys); subsys_put(sp); } EXPORT_SYMBOL_GPL(bus_unregister); int bus_register_notifier(const struct bus_type *bus, struct notifier_block *nb) { struct subsys_private *sp = bus_to_subsys(bus); int retval; if (!sp) return -EINVAL; retval = blocking_notifier_chain_register(&sp->bus_notifier, nb); subsys_put(sp); return retval; } EXPORT_SYMBOL_GPL(bus_register_notifier); int bus_unregister_notifier(const struct bus_type *bus, struct notifier_block *nb) { struct subsys_private *sp = bus_to_subsys(bus); int retval; if (!sp) return -EINVAL; retval = blocking_notifier_chain_unregister(&sp->bus_notifier, nb); subsys_put(sp); return retval; } EXPORT_SYMBOL_GPL(bus_unregister_notifier); void bus_notify(struct device *dev, enum bus_notifier_event value) { struct subsys_private *sp = bus_to_subsys(dev->bus); if (!sp) return; blocking_notifier_call_chain(&sp->bus_notifier, value, dev); subsys_put(sp); } struct kset *bus_get_kset(const struct bus_type *bus) { struct subsys_private *sp = bus_to_subsys(bus); struct kset *kset; if (!sp) return NULL; kset = &sp->subsys; subsys_put(sp); return kset; } EXPORT_SYMBOL_GPL(bus_get_kset); /* * Yes, this forcibly breaks the klist abstraction temporarily. It * just wants to sort the klist, not change reference counts and * take/drop locks rapidly in the process. It does all this while * holding the lock for the list, so objects can't otherwise be * added/removed while we're swizzling. */ static void device_insertion_sort_klist(struct device *a, struct list_head *list, int (*compare)(const struct device *a, const struct device *b)) { struct klist_node *n; struct device_private *dev_prv; struct device *b; list_for_each_entry(n, list, n_node) { dev_prv = to_device_private_bus(n); b = dev_prv->device; if (compare(a, b) <= 0) { list_move_tail(&a->p->knode_bus.n_node, &b->p->knode_bus.n_node); return; } } list_move_tail(&a->p->knode_bus.n_node, list); } void bus_sort_breadthfirst(struct bus_type *bus, int (*compare)(const struct device *a, const struct device *b)) { struct subsys_private *sp = bus_to_subsys(bus); LIST_HEAD(sorted_devices); struct klist_node *n, *tmp; struct device_private *dev_prv; struct device *dev; struct klist *device_klist; if (!sp) return; device_klist = &sp->klist_devices; spin_lock(&device_klist->k_lock); list_for_each_entry_safe(n, tmp, &device_klist->k_list, n_node) { dev_prv = to_device_private_bus(n); dev = dev_prv->device; device_insertion_sort_klist(dev, &sorted_devices, compare); } list_splice(&sorted_devices, &device_klist->k_list); spin_unlock(&device_klist->k_lock); subsys_put(sp); } EXPORT_SYMBOL_GPL(bus_sort_breadthfirst); struct subsys_dev_iter { struct klist_iter ki; const struct device_type *type; }; /** * subsys_dev_iter_init - initialize subsys device iterator * @iter: subsys iterator to initialize * @sp: the subsys private (i.e. bus) we wanna iterate over * @start: the device to start iterating from, if any * @type: device_type of the devices to iterate over, NULL for all * * Initialize subsys iterator @iter such that it iterates over devices * of @subsys. If @start is set, the list iteration will start there, * otherwise if it is NULL, the iteration starts at the beginning of * the list. */ static void subsys_dev_iter_init(struct subsys_dev_iter *iter, struct subsys_private *sp, struct device *start, const struct device_type *type) { struct klist_node *start_knode = NULL; if (start) start_knode = &start->p->knode_bus; klist_iter_init_node(&sp->klist_devices, &iter->ki, start_knode); iter->type = type; } /** * subsys_dev_iter_next - iterate to the next device * @iter: subsys iterator to proceed * * Proceed @iter to the next device and return it. Returns NULL if * iteration is complete. * * The returned device is referenced and won't be released till * iterator is proceed to the next device or exited. The caller is * free to do whatever it wants to do with the device including * calling back into subsys code. */ static struct device *subsys_dev_iter_next(struct subsys_dev_iter *iter) { struct klist_node *knode; struct device *dev; for (;;) { knode = klist_next(&iter->ki); if (!knode) return NULL; dev = to_device_private_bus(knode)->device; if (!iter->type || iter->type == dev->type) return dev; } } /** * subsys_dev_iter_exit - finish iteration * @iter: subsys iterator to finish * * Finish an iteration. Always call this function after iteration is * complete whether the iteration ran till the end or not. */ static void subsys_dev_iter_exit(struct subsys_dev_iter *iter) { klist_iter_exit(&iter->ki); } int subsys_interface_register(struct subsys_interface *sif) { struct subsys_private *sp; struct subsys_dev_iter iter; struct device *dev; if (!sif || !sif->subsys) return -ENODEV; sp = bus_to_subsys(sif->subsys); if (!sp) return -EINVAL; /* * Reference in sp is now incremented and will be dropped when * the interface is removed from the bus */ mutex_lock(&sp->mutex); list_add_tail(&sif->node, &sp->interfaces); if (sif->add_dev) { subsys_dev_iter_init(&iter, sp, NULL, NULL); while ((dev = subsys_dev_iter_next(&iter))) sif->add_dev(dev, sif); subsys_dev_iter_exit(&iter); } mutex_unlock(&sp->mutex); return 0; } EXPORT_SYMBOL_GPL(subsys_interface_register); void subsys_interface_unregister(struct subsys_interface *sif) { struct subsys_private *sp; struct subsys_dev_iter iter; struct device *dev; if (!sif || !sif->subsys) return; sp = bus_to_subsys(sif->subsys); if (!sp) return; mutex_lock(&sp->mutex); list_del_init(&sif->node); if (sif->remove_dev) { subsys_dev_iter_init(&iter, sp, NULL, NULL); while ((dev = subsys_dev_iter_next(&iter))) sif->remove_dev(dev, sif); subsys_dev_iter_exit(&iter); } mutex_unlock(&sp->mutex); /* * Decrement the reference count twice, once for the bus_to_subsys() * call in the start of this function, and the second one from the * reference increment in subsys_interface_register() */ subsys_put(sp); subsys_put(sp); } EXPORT_SYMBOL_GPL(subsys_interface_unregister); static void system_root_device_release(struct device *dev) { kfree(dev); } static int subsys_register(struct bus_type *subsys, const struct attribute_group **groups, struct kobject *parent_of_root) { struct subsys_private *sp; struct device *dev; int err; err = bus_register(subsys); if (err < 0) return err; sp = bus_to_subsys(subsys); if (!sp) { err = -EINVAL; goto err_sp; } dev = kzalloc(sizeof(struct device), GFP_KERNEL); if (!dev) { err = -ENOMEM; goto err_dev; } err = dev_set_name(dev, "%s", subsys->name); if (err < 0) goto err_name; dev->kobj.parent = parent_of_root; dev->groups = groups; dev->release = system_root_device_release; err = device_register(dev); if (err < 0) goto err_dev_reg; sp->dev_root = dev; subsys_put(sp); return 0; err_dev_reg: put_device(dev); dev = NULL; err_name: kfree(dev); err_dev: subsys_put(sp); err_sp: bus_unregister(subsys); return err; } /** * subsys_system_register - register a subsystem at /sys/devices/system/ * @subsys: system subsystem * @groups: default attributes for the root device * * All 'system' subsystems have a /sys/devices/system/<name> root device * with the name of the subsystem. The root device can carry subsystem- * wide attributes. All registered devices are below this single root * device and are named after the subsystem with a simple enumeration * number appended. The registered devices are not explicitly named; * only 'id' in the device needs to be set. * * Do not use this interface for anything new, it exists for compatibility * with bad ideas only. New subsystems should use plain subsystems; and * add the subsystem-wide attributes should be added to the subsystem * directory itself and not some create fake root-device placed in * /sys/devices/system/<name>. */ int subsys_system_register(struct bus_type *subsys, const struct attribute_group **groups) { return subsys_register(subsys, groups, &system_kset->kobj); } EXPORT_SYMBOL_GPL(subsys_system_register); /** * subsys_virtual_register - register a subsystem at /sys/devices/virtual/ * @subsys: virtual subsystem * @groups: default attributes for the root device * * All 'virtual' subsystems have a /sys/devices/system/<name> root device * with the name of the subystem. The root device can carry subsystem-wide * attributes. All registered devices are below this single root device. * There's no restriction on device naming. This is for kernel software * constructs which need sysfs interface. */ int subsys_virtual_register(struct bus_type *subsys, const struct attribute_group **groups) { struct kobject *virtual_dir; virtual_dir = virtual_device_parent(NULL); if (!virtual_dir) return -ENOMEM; return subsys_register(subsys, groups, virtual_dir); } EXPORT_SYMBOL_GPL(subsys_virtual_register); /** * driver_find - locate driver on a bus by its name. * @name: name of the driver. * @bus: bus to scan for the driver. * * Call kset_find_obj() to iterate over list of drivers on * a bus to find driver by name. Return driver if found. * * This routine provides no locking to prevent the driver it returns * from being unregistered or unloaded while the caller is using it. * The caller is responsible for preventing this. */ struct device_driver *driver_find(const char *name, const struct bus_type *bus) { struct subsys_private *sp = bus_to_subsys(bus); struct kobject *k; struct driver_private *priv; if (!sp) return NULL; k = kset_find_obj(sp->drivers_kset, name); subsys_put(sp); if (!k) return NULL; priv = to_driver(k); /* Drop reference added by kset_find_obj() */ kobject_put(k); return priv->driver; } EXPORT_SYMBOL_GPL(driver_find); /* * Warning, the value could go to "removed" instantly after calling this function, so be very * careful when calling it... */ bool bus_is_registered(const struct bus_type *bus) { struct subsys_private *sp = bus_to_subsys(bus); bool is_initialized = false; if (sp) { is_initialized = true; subsys_put(sp); } return is_initialized; } /** * bus_get_dev_root - return a pointer to the "device root" of a bus * @bus: bus to return the device root of. * * If a bus has a "device root" structure, return it, WITH THE REFERENCE * COUNT INCREMENTED. * * Note, when finished with the device, a call to put_device() is required. * * If the device root is not present (or bus is not a valid pointer), NULL * will be returned. */ struct device *bus_get_dev_root(const struct bus_type *bus) { struct subsys_private *sp = bus_to_subsys(bus); struct device *dev_root; if (!sp) return NULL; dev_root = get_device(sp->dev_root); subsys_put(sp); return dev_root; } EXPORT_SYMBOL_GPL(bus_get_dev_root); int __init buses_init(void) { bus_kset = kset_create_and_add("bus", &bus_uevent_ops, NULL); if (!bus_kset) return -ENOMEM; system_kset = kset_create_and_add("system", NULL, &devices_kset->kobj); if (!system_kset) return -ENOMEM; return 0; }
1541 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 /* Copyright 2011, Siemens AG * written by Alexander Smirnov <alex.bluesman.smirnov@gmail.com> */ /* Based on patches from Jon Smirl <jonsmirl@gmail.com> * Copyright (c) 2011 Jon Smirl <jonsmirl@gmail.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 * as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ /* Jon's code is based on 6lowpan implementation for Contiki which is: * Copyright (c) 2008, Swedish Institute of Computer Science. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the Institute nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include <linux/module.h> #include <linux/netdevice.h> #include <linux/ieee802154.h> #include <linux/if_arp.h> #include <net/ipv6.h> #include "6lowpan_i.h" static int open_count; static const struct header_ops lowpan_header_ops = { .create = lowpan_header_create, }; static int lowpan_dev_init(struct net_device *ldev) { netdev_lockdep_set_classes(ldev); return 0; } static int lowpan_open(struct net_device *dev) { if (!open_count) lowpan_rx_init(); open_count++; return 0; } static int lowpan_stop(struct net_device *dev) { open_count--; if (!open_count) lowpan_rx_exit(); return 0; } static int lowpan_neigh_construct(struct net_device *dev, struct neighbour *n) { struct lowpan_802154_neigh *neigh = lowpan_802154_neigh(neighbour_priv(n)); /* default no short_addr is available for a neighbour */ neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC); return 0; } static int lowpan_get_iflink(const struct net_device *dev) { return lowpan_802154_dev(dev)->wdev->ifindex; } static const struct net_device_ops lowpan_netdev_ops = { .ndo_init = lowpan_dev_init, .ndo_start_xmit = lowpan_xmit, .ndo_open = lowpan_open, .ndo_stop = lowpan_stop, .ndo_neigh_construct = lowpan_neigh_construct, .ndo_get_iflink = lowpan_get_iflink, }; static void lowpan_setup(struct net_device *ldev) { memset(ldev->broadcast, 0xff, IEEE802154_ADDR_LEN); /* We need an ipv6hdr as minimum len when calling xmit */ ldev->hard_header_len = sizeof(struct ipv6hdr); ldev->flags = IFF_BROADCAST | IFF_MULTICAST; ldev->priv_flags |= IFF_NO_QUEUE; ldev->netdev_ops = &lowpan_netdev_ops; ldev->header_ops = &lowpan_header_ops; ldev->needs_free_netdev = true; ldev->features |= NETIF_F_NETNS_LOCAL; } static int lowpan_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != IEEE802154_ADDR_LEN) return -EINVAL; } return 0; } static int lowpan_newlink(struct net *src_net, struct net_device *ldev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct net_device *wdev; int ret; ASSERT_RTNL(); pr_debug("adding new link\n"); if (!tb[IFLA_LINK]) return -EINVAL; /* find and hold wpan device */ wdev = dev_get_by_index(dev_net(ldev), nla_get_u32(tb[IFLA_LINK])); if (!wdev) return -ENODEV; if (wdev->type != ARPHRD_IEEE802154) { dev_put(wdev); return -EINVAL; } if (wdev->ieee802154_ptr->lowpan_dev) { dev_put(wdev); return -EBUSY; } lowpan_802154_dev(ldev)->wdev = wdev; /* Set the lowpan hardware address to the wpan hardware address. */ __dev_addr_set(ldev, wdev->dev_addr, IEEE802154_ADDR_LEN); /* We need headroom for possible wpan_dev_hard_header call and tailroom * for encryption/fcs handling. The lowpan interface will replace * the IPv6 header with 6LoWPAN header. At worst case the 6LoWPAN * header has LOWPAN_IPHC_MAX_HEADER_LEN more bytes than the IPv6 * header. */ ldev->needed_headroom = LOWPAN_IPHC_MAX_HEADER_LEN + wdev->needed_headroom; ldev->needed_tailroom = wdev->needed_tailroom; ldev->neigh_priv_len = sizeof(struct lowpan_802154_neigh); ret = lowpan_register_netdevice(ldev, LOWPAN_LLTYPE_IEEE802154); if (ret < 0) { dev_put(wdev); return ret; } wdev->ieee802154_ptr->lowpan_dev = ldev; return 0; } static void lowpan_dellink(struct net_device *ldev, struct list_head *head) { struct net_device *wdev = lowpan_802154_dev(ldev)->wdev; ASSERT_RTNL(); wdev->ieee802154_ptr->lowpan_dev = NULL; lowpan_unregister_netdevice(ldev); dev_put(wdev); } static struct rtnl_link_ops lowpan_link_ops __read_mostly = { .kind = "lowpan", .priv_size = LOWPAN_PRIV_SIZE(sizeof(struct lowpan_802154_dev)), .setup = lowpan_setup, .newlink = lowpan_newlink, .dellink = lowpan_dellink, .validate = lowpan_validate, }; static inline int __init lowpan_netlink_init(void) { return rtnl_link_register(&lowpan_link_ops); } static inline void lowpan_netlink_fini(void) { rtnl_link_unregister(&lowpan_link_ops); } static int lowpan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *ndev = netdev_notifier_info_to_dev(ptr); struct wpan_dev *wpan_dev; if (ndev->type != ARPHRD_IEEE802154) return NOTIFY_DONE; wpan_dev = ndev->ieee802154_ptr; if (!wpan_dev) return NOTIFY_DONE; switch (event) { case NETDEV_UNREGISTER: /* Check if wpan interface is unregistered that we * also delete possible lowpan interfaces which belongs * to the wpan interface. */ if (wpan_dev->lowpan_dev) lowpan_dellink(wpan_dev->lowpan_dev, NULL); break; default: return NOTIFY_DONE; } return NOTIFY_OK; } static struct notifier_block lowpan_dev_notifier = { .notifier_call = lowpan_device_event, }; static int __init lowpan_init_module(void) { int err = 0; err = lowpan_net_frag_init(); if (err < 0) goto out; err = lowpan_netlink_init(); if (err < 0) goto out_frag; err = register_netdevice_notifier(&lowpan_dev_notifier); if (err < 0) goto out_pack; return 0; out_pack: lowpan_netlink_fini(); out_frag: lowpan_net_frag_exit(); out: return err; } static void __exit lowpan_cleanup_module(void) { lowpan_netlink_fini(); lowpan_net_frag_exit(); unregister_netdevice_notifier(&lowpan_dev_notifier); } module_init(lowpan_init_module); module_exit(lowpan_cleanup_module); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("lowpan");
4555 1352 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_BH_H #define _LINUX_BH_H #include <linux/instruction_pointer.h> #include <linux/preempt.h> #if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_TRACE_IRQFLAGS) extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt); #else static __always_inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) { preempt_count_add(cnt); barrier(); } #endif static inline void local_bh_disable(void) { __local_bh_disable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET); } extern void _local_bh_enable(void); extern void __local_bh_enable_ip(unsigned long ip, unsigned int cnt); static inline void local_bh_enable_ip(unsigned long ip) { __local_bh_enable_ip(ip, SOFTIRQ_DISABLE_OFFSET); } static inline void local_bh_enable(void) { __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET); } #ifdef CONFIG_PREEMPT_RT extern bool local_bh_blocked(void); #else static inline bool local_bh_blocked(void) { return false; } #endif #endif /* _LINUX_BH_H */
708 708 1 5074 119 119 74 74 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TRACE_EVENT_H #define _LINUX_TRACE_EVENT_H #include <linux/ring_buffer.h> #include <linux/trace_seq.h> #include <linux/percpu.h> #include <linux/hardirq.h> #include <linux/perf_event.h> #include <linux/tracepoint.h> struct trace_array; struct array_buffer; struct tracer; struct dentry; struct bpf_prog; union bpf_attr; const char *trace_print_flags_seq(struct trace_seq *p, const char *delim, unsigned long flags, const struct trace_print_flags *flag_array); const char *trace_print_symbols_seq(struct trace_seq *p, unsigned long val, const struct trace_print_flags *symbol_array); #if BITS_PER_LONG == 32 const char *trace_print_flags_seq_u64(struct trace_seq *p, const char *delim, unsigned long long flags, const struct trace_print_flags_u64 *flag_array); const char *trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, const struct trace_print_flags_u64 *symbol_array); #endif const char *trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, unsigned int bitmask_size); const char *trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int len, bool concatenate); const char *trace_print_array_seq(struct trace_seq *p, const void *buf, int count, size_t el_size); const char * trace_print_hex_dump_seq(struct trace_seq *p, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii); struct trace_iterator; struct trace_event; int trace_raw_output_prep(struct trace_iterator *iter, struct trace_event *event); extern __printf(2, 3) void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...); /* Used to find the offset and length of dynamic fields in trace events */ struct trace_dynamic_info { #ifdef CONFIG_CPU_BIG_ENDIAN u16 len; u16 offset; #else u16 offset; u16 len; #endif } __packed; /* * The trace entry - the most basic unit of tracing. This is what * is printed in the end as a single line in the trace output, such as: * * bash-15816 [01] 235.197585: idle_cpu <- irq_enter */ struct trace_entry { unsigned short type; unsigned char flags; unsigned char preempt_count; int pid; }; #define TRACE_EVENT_TYPE_MAX \ ((1 << (sizeof(((struct trace_entry *)0)->type) * 8)) - 1) /* * Trace iterator - used by printout routines who present trace * results to users and which routines might sleep, etc: */ struct trace_iterator { struct trace_array *tr; struct tracer *trace; struct array_buffer *array_buffer; void *private; int cpu_file; struct mutex mutex; struct ring_buffer_iter **buffer_iter; unsigned long iter_flags; void *temp; /* temp holder */ unsigned int temp_size; char *fmt; /* modified format holder */ unsigned int fmt_size; long wait_index; /* trace_seq for __print_flags() and __print_symbolic() etc. */ struct trace_seq tmp_seq; cpumask_var_t started; /* it's true when current open file is snapshot */ bool snapshot; /* The below is zeroed out in pipe_read */ struct trace_seq seq; struct trace_entry *ent; unsigned long lost_events; int leftover; int ent_size; int cpu; u64 ts; loff_t pos; long idx; /* All new field here will be zeroed out in pipe_read */ }; enum trace_iter_flags { TRACE_FILE_LAT_FMT = 1, TRACE_FILE_ANNOTATE = 2, TRACE_FILE_TIME_IN_NS = 4, }; typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter, int flags, struct trace_event *event); struct trace_event_functions { trace_print_func trace; trace_print_func raw; trace_print_func hex; trace_print_func binary; }; struct trace_event { struct hlist_node node; int type; struct trace_event_functions *funcs; }; extern int register_trace_event(struct trace_event *event); extern int unregister_trace_event(struct trace_event *event); /* Return values for print_line callback */ enum print_line_t { TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */ TRACE_TYPE_HANDLED = 1, TRACE_TYPE_UNHANDLED = 2, /* Relay to other output functions */ TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */ }; enum print_line_t trace_handle_return(struct trace_seq *s); static inline void tracing_generic_entry_update(struct trace_entry *entry, unsigned short type, unsigned int trace_ctx) { entry->preempt_count = trace_ctx & 0xff; entry->pid = current->pid; entry->type = type; entry->flags = trace_ctx >> 16; } unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status); enum trace_flag_type { TRACE_FLAG_IRQS_OFF = 0x01, TRACE_FLAG_IRQS_NOSUPPORT = 0x02, TRACE_FLAG_NEED_RESCHED = 0x04, TRACE_FLAG_HARDIRQ = 0x08, TRACE_FLAG_SOFTIRQ = 0x10, TRACE_FLAG_PREEMPT_RESCHED = 0x20, TRACE_FLAG_NMI = 0x40, TRACE_FLAG_BH_OFF = 0x80, }; #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags) { unsigned int irq_status = irqs_disabled_flags(irqflags) ? TRACE_FLAG_IRQS_OFF : 0; return tracing_gen_ctx_irq_test(irq_status); } static inline unsigned int tracing_gen_ctx(void) { unsigned long irqflags; local_save_flags(irqflags); return tracing_gen_ctx_flags(irqflags); } #else static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags) { return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT); } static inline unsigned int tracing_gen_ctx(void) { return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT); } #endif static inline unsigned int tracing_gen_ctx_dec(void) { unsigned int trace_ctx; trace_ctx = tracing_gen_ctx(); /* * Subtract one from the preemption counter if preemption is enabled, * see trace_event_buffer_reserve()for details. */ if (IS_ENABLED(CONFIG_PREEMPTION)) trace_ctx--; return trace_ctx; } struct trace_event_file; struct ring_buffer_event * trace_event_buffer_lock_reserve(struct trace_buffer **current_buffer, struct trace_event_file *trace_file, int type, unsigned long len, unsigned int trace_ctx); #define TRACE_RECORD_CMDLINE BIT(0) #define TRACE_RECORD_TGID BIT(1) void tracing_record_taskinfo(struct task_struct *task, int flags); void tracing_record_taskinfo_sched_switch(struct task_struct *prev, struct task_struct *next, int flags); void tracing_record_cmdline(struct task_struct *task); void tracing_record_tgid(struct task_struct *task); int trace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...) __printf(3, 4); struct event_filter; enum trace_reg { TRACE_REG_REGISTER, TRACE_REG_UNREGISTER, #ifdef CONFIG_PERF_EVENTS TRACE_REG_PERF_REGISTER, TRACE_REG_PERF_UNREGISTER, TRACE_REG_PERF_OPEN, TRACE_REG_PERF_CLOSE, /* * These (ADD/DEL) use a 'boolean' return value, where 1 (true) means a * custom action was taken and the default action is not to be * performed. */ TRACE_REG_PERF_ADD, TRACE_REG_PERF_DEL, #endif }; struct trace_event_call; #define TRACE_FUNCTION_TYPE ((const char *)~0UL) struct trace_event_fields { const char *type; union { struct { const char *name; const int size; const int align; const int is_signed; const int filter_type; const int len; }; int (*define_fields)(struct trace_event_call *); }; }; struct trace_event_class { const char *system; void *probe; #ifdef CONFIG_PERF_EVENTS void *perf_probe; #endif int (*reg)(struct trace_event_call *event, enum trace_reg type, void *data); struct trace_event_fields *fields_array; struct list_head *(*get_fields)(struct trace_event_call *); struct list_head fields; int (*raw_init)(struct trace_event_call *); }; extern int trace_event_reg(struct trace_event_call *event, enum trace_reg type, void *data); struct trace_event_buffer { struct trace_buffer *buffer; struct ring_buffer_event *event; struct trace_event_file *trace_file; void *entry; unsigned int trace_ctx; struct pt_regs *regs; }; void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, struct trace_event_file *trace_file, unsigned long len); void trace_event_buffer_commit(struct trace_event_buffer *fbuffer); enum { TRACE_EVENT_FL_FILTERED_BIT, TRACE_EVENT_FL_CAP_ANY_BIT, TRACE_EVENT_FL_NO_SET_FILTER_BIT, TRACE_EVENT_FL_IGNORE_ENABLE_BIT, TRACE_EVENT_FL_TRACEPOINT_BIT, TRACE_EVENT_FL_DYNAMIC_BIT, TRACE_EVENT_FL_KPROBE_BIT, TRACE_EVENT_FL_UPROBE_BIT, TRACE_EVENT_FL_EPROBE_BIT, TRACE_EVENT_FL_FPROBE_BIT, TRACE_EVENT_FL_CUSTOM_BIT, }; /* * Event flags: * FILTERED - The event has a filter attached * CAP_ANY - Any user can enable for perf * NO_SET_FILTER - Set when filter has error and is to be ignored * IGNORE_ENABLE - For trace internal events, do not enable with debugfs file * TRACEPOINT - Event is a tracepoint * DYNAMIC - Event is a dynamic event (created at run time) * KPROBE - Event is a kprobe * UPROBE - Event is a uprobe * EPROBE - Event is an event probe * FPROBE - Event is an function probe * CUSTOM - Event is a custom event (to be attached to an exsiting tracepoint) * This is set when the custom event has not been attached * to a tracepoint yet, then it is cleared when it is. */ enum { TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT), TRACE_EVENT_FL_CAP_ANY = (1 << TRACE_EVENT_FL_CAP_ANY_BIT), TRACE_EVENT_FL_NO_SET_FILTER = (1 << TRACE_EVENT_FL_NO_SET_FILTER_BIT), TRACE_EVENT_FL_IGNORE_ENABLE = (1 << TRACE_EVENT_FL_IGNORE_ENABLE_BIT), TRACE_EVENT_FL_TRACEPOINT = (1 << TRACE_EVENT_FL_TRACEPOINT_BIT), TRACE_EVENT_FL_DYNAMIC = (1 << TRACE_EVENT_FL_DYNAMIC_BIT), TRACE_EVENT_FL_KPROBE = (1 << TRACE_EVENT_FL_KPROBE_BIT), TRACE_EVENT_FL_UPROBE = (1 << TRACE_EVENT_FL_UPROBE_BIT), TRACE_EVENT_FL_EPROBE = (1 << TRACE_EVENT_FL_EPROBE_BIT), TRACE_EVENT_FL_FPROBE = (1 << TRACE_EVENT_FL_FPROBE_BIT), TRACE_EVENT_FL_CUSTOM = (1 << TRACE_EVENT_FL_CUSTOM_BIT), }; #define TRACE_EVENT_FL_UKPROBE (TRACE_EVENT_FL_KPROBE | TRACE_EVENT_FL_UPROBE) struct trace_event_call { struct list_head list; struct trace_event_class *class; union { char *name; /* Set TRACE_EVENT_FL_TRACEPOINT flag when using "tp" */ struct tracepoint *tp; }; struct trace_event event; char *print_fmt; struct event_filter *filter; /* * Static events can disappear with modules, * where as dynamic ones need their own ref count. */ union { void *module; atomic_t refcnt; }; void *data; /* See the TRACE_EVENT_FL_* flags above */ int flags; /* static flags of different events */ #ifdef CONFIG_PERF_EVENTS int perf_refcount; struct hlist_head __percpu *perf_events; struct bpf_prog_array __rcu *prog_array; int (*perf_perm)(struct trace_event_call *, struct perf_event *); #endif }; #ifdef CONFIG_DYNAMIC_EVENTS bool trace_event_dyn_try_get_ref(struct trace_event_call *call); void trace_event_dyn_put_ref(struct trace_event_call *call); bool trace_event_dyn_busy(struct trace_event_call *call); #else static inline bool trace_event_dyn_try_get_ref(struct trace_event_call *call) { /* Without DYNAMIC_EVENTS configured, nothing should be calling this */ return false; } static inline void trace_event_dyn_put_ref(struct trace_event_call *call) { } static inline bool trace_event_dyn_busy(struct trace_event_call *call) { /* Nothing should call this without DYNAIMIC_EVENTS configured. */ return true; } #endif static inline bool trace_event_try_get_ref(struct trace_event_call *call) { if (call->flags & TRACE_EVENT_FL_DYNAMIC) return trace_event_dyn_try_get_ref(call); else return try_module_get(call->module); } static inline void trace_event_put_ref(struct trace_event_call *call) { if (call->flags & TRACE_EVENT_FL_DYNAMIC) trace_event_dyn_put_ref(call); else module_put(call->module); } #ifdef CONFIG_PERF_EVENTS static inline bool bpf_prog_array_valid(struct trace_event_call *call) { /* * This inline function checks whether call->prog_array * is valid or not. The function is called in various places, * outside rcu_read_lock/unlock, as a heuristic to speed up execution. * * If this function returns true, and later call->prog_array * becomes false inside rcu_read_lock/unlock region, * we bail out then. If this function return false, * there is a risk that we might miss a few events if the checking * were delayed until inside rcu_read_lock/unlock region and * call->prog_array happened to become non-NULL then. * * Here, READ_ONCE() is used instead of rcu_access_pointer(). * rcu_access_pointer() requires the actual definition of * "struct bpf_prog_array" while READ_ONCE() only needs * a declaration of the same type. */ return !!READ_ONCE(call->prog_array); } #endif static inline const char * trace_event_name(struct trace_event_call *call) { if (call->flags & TRACE_EVENT_FL_CUSTOM) return call->name; else if (call->flags & TRACE_EVENT_FL_TRACEPOINT) return call->tp ? call->tp->name : NULL; else return call->name; } static inline struct list_head * trace_get_fields(struct trace_event_call *event_call) { if (!event_call->class->get_fields) return &event_call->class->fields; return event_call->class->get_fields(event_call); } struct trace_subsystem_dir; enum { EVENT_FILE_FL_ENABLED_BIT, EVENT_FILE_FL_RECORDED_CMD_BIT, EVENT_FILE_FL_RECORDED_TGID_BIT, EVENT_FILE_FL_FILTERED_BIT, EVENT_FILE_FL_NO_SET_FILTER_BIT, EVENT_FILE_FL_SOFT_MODE_BIT, EVENT_FILE_FL_SOFT_DISABLED_BIT, EVENT_FILE_FL_TRIGGER_MODE_BIT, EVENT_FILE_FL_TRIGGER_COND_BIT, EVENT_FILE_FL_PID_FILTER_BIT, EVENT_FILE_FL_WAS_ENABLED_BIT, }; extern struct trace_event_file *trace_get_event_file(const char *instance, const char *system, const char *event); extern void trace_put_event_file(struct trace_event_file *file); #define MAX_DYNEVENT_CMD_LEN (2048) enum dynevent_type { DYNEVENT_TYPE_SYNTH = 1, DYNEVENT_TYPE_KPROBE, DYNEVENT_TYPE_NONE, }; struct dynevent_cmd; typedef int (*dynevent_create_fn_t)(struct dynevent_cmd *cmd); struct dynevent_cmd { struct seq_buf seq; const char *event_name; unsigned int n_fields; enum dynevent_type type; dynevent_create_fn_t run_command; void *private_data; }; extern int dynevent_create(struct dynevent_cmd *cmd); extern int synth_event_delete(const char *name); extern void synth_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen); extern int __synth_event_gen_cmd_start(struct dynevent_cmd *cmd, const char *name, struct module *mod, ...); #define synth_event_gen_cmd_start(cmd, name, mod, ...) \ __synth_event_gen_cmd_start(cmd, name, mod, ## __VA_ARGS__, NULL) struct synth_field_desc { const char *type; const char *name; }; extern int synth_event_gen_cmd_array_start(struct dynevent_cmd *cmd, const char *name, struct module *mod, struct synth_field_desc *fields, unsigned int n_fields); extern int synth_event_create(const char *name, struct synth_field_desc *fields, unsigned int n_fields, struct module *mod); extern int synth_event_add_field(struct dynevent_cmd *cmd, const char *type, const char *name); extern int synth_event_add_field_str(struct dynevent_cmd *cmd, const char *type_name); extern int synth_event_add_fields(struct dynevent_cmd *cmd, struct synth_field_desc *fields, unsigned int n_fields); #define synth_event_gen_cmd_end(cmd) \ dynevent_create(cmd) struct synth_event; struct synth_event_trace_state { struct trace_event_buffer fbuffer; struct synth_trace_event *entry; struct trace_buffer *buffer; struct synth_event *event; unsigned int cur_field; unsigned int n_u64; bool disabled; bool add_next; bool add_name; }; extern int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...); extern int synth_event_trace_array(struct trace_event_file *file, u64 *vals, unsigned int n_vals); extern int synth_event_trace_start(struct trace_event_file *file, struct synth_event_trace_state *trace_state); extern int synth_event_add_next_val(u64 val, struct synth_event_trace_state *trace_state); extern int synth_event_add_val(const char *field_name, u64 val, struct synth_event_trace_state *trace_state); extern int synth_event_trace_end(struct synth_event_trace_state *trace_state); extern int kprobe_event_delete(const char *name); extern void kprobe_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen); #define kprobe_event_gen_cmd_start(cmd, name, loc, ...) \ __kprobe_event_gen_cmd_start(cmd, false, name, loc, ## __VA_ARGS__, NULL) #define kretprobe_event_gen_cmd_start(cmd, name, loc, ...) \ __kprobe_event_gen_cmd_start(cmd, true, name, loc, ## __VA_ARGS__, NULL) extern int __kprobe_event_gen_cmd_start(struct dynevent_cmd *cmd, bool kretprobe, const char *name, const char *loc, ...); #define kprobe_event_add_fields(cmd, ...) \ __kprobe_event_add_fields(cmd, ## __VA_ARGS__, NULL) #define kprobe_event_add_field(cmd, field) \ __kprobe_event_add_fields(cmd, field, NULL) extern int __kprobe_event_add_fields(struct dynevent_cmd *cmd, ...); #define kprobe_event_gen_cmd_end(cmd) \ dynevent_create(cmd) #define kretprobe_event_gen_cmd_end(cmd) \ dynevent_create(cmd) /* * Event file flags: * ENABLED - The event is enabled * RECORDED_CMD - The comms should be recorded at sched_switch * RECORDED_TGID - The tgids should be recorded at sched_switch * FILTERED - The event has a filter attached * NO_SET_FILTER - Set when filter has error and is to be ignored * SOFT_MODE - The event is enabled/disabled by SOFT_DISABLED * SOFT_DISABLED - When set, do not trace the event (even though its * tracepoint may be enabled) * TRIGGER_MODE - When set, invoke the triggers associated with the event * TRIGGER_COND - When set, one or more triggers has an associated filter * PID_FILTER - When set, the event is filtered based on pid * WAS_ENABLED - Set when enabled to know to clear trace on module removal */ enum { EVENT_FILE_FL_ENABLED = (1 << EVENT_FILE_FL_ENABLED_BIT), EVENT_FILE_FL_RECORDED_CMD = (1 << EVENT_FILE_FL_RECORDED_CMD_BIT), EVENT_FILE_FL_RECORDED_TGID = (1 << EVENT_FILE_FL_RECORDED_TGID_BIT), EVENT_FILE_FL_FILTERED = (1 << EVENT_FILE_FL_FILTERED_BIT), EVENT_FILE_FL_NO_SET_FILTER = (1 << EVENT_FILE_FL_NO_SET_FILTER_BIT), EVENT_FILE_FL_SOFT_MODE = (1 << EVENT_FILE_FL_SOFT_MODE_BIT), EVENT_FILE_FL_SOFT_DISABLED = (1 << EVENT_FILE_FL_SOFT_DISABLED_BIT), EVENT_FILE_FL_TRIGGER_MODE = (1 << EVENT_FILE_FL_TRIGGER_MODE_BIT), EVENT_FILE_FL_TRIGGER_COND = (1 << EVENT_FILE_FL_TRIGGER_COND_BIT), EVENT_FILE_FL_PID_FILTER = (1 << EVENT_FILE_FL_PID_FILTER_BIT), EVENT_FILE_FL_WAS_ENABLED = (1 << EVENT_FILE_FL_WAS_ENABLED_BIT), }; struct trace_event_file { struct list_head list; struct trace_event_call *event_call; struct event_filter __rcu *filter; struct eventfs_file *ef; struct trace_array *tr; struct trace_subsystem_dir *system; struct list_head triggers; /* * 32 bit flags: * bit 0: enabled * bit 1: enabled cmd record * bit 2: enable/disable with the soft disable bit * bit 3: soft disabled * bit 4: trigger enabled * * Note: The bits must be set atomically to prevent races * from other writers. Reads of flags do not need to be in * sync as they occur in critical sections. But the way flags * is currently used, these changes do not affect the code * except that when a change is made, it may have a slight * delay in propagating the changes to other CPUs due to * caching and such. Which is mostly OK ;-) */ unsigned long flags; atomic_t sm_ref; /* soft-mode reference counter */ atomic_t tm_ref; /* trigger-mode reference counter */ }; #define __TRACE_EVENT_FLAGS(name, value) \ static int __init trace_init_flags_##name(void) \ { \ event_##name.flags |= value; \ return 0; \ } \ early_initcall(trace_init_flags_##name); #define __TRACE_EVENT_PERF_PERM(name, expr...) \ static int perf_perm_##name(struct trace_event_call *tp_event, \ struct perf_event *p_event) \ { \ return ({ expr; }); \ } \ static int __init trace_init_perf_perm_##name(void) \ { \ event_##name.perf_perm = &perf_perm_##name; \ return 0; \ } \ early_initcall(trace_init_perf_perm_##name); #define PERF_MAX_TRACE_SIZE 8192 #define MAX_FILTER_STR_VAL 256U /* Should handle KSYM_SYMBOL_LEN */ enum event_trigger_type { ETT_NONE = (0), ETT_TRACE_ONOFF = (1 << 0), ETT_SNAPSHOT = (1 << 1), ETT_STACKTRACE = (1 << 2), ETT_EVENT_ENABLE = (1 << 3), ETT_EVENT_HIST = (1 << 4), ETT_HIST_ENABLE = (1 << 5), ETT_EVENT_EPROBE = (1 << 6), }; extern int filter_match_preds(struct event_filter *filter, void *rec); extern enum event_trigger_type event_triggers_call(struct trace_event_file *file, struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event); extern void event_triggers_post_call(struct trace_event_file *file, enum event_trigger_type tt); bool trace_event_ignore_this_pid(struct trace_event_file *trace_file); bool __trace_trigger_soft_disabled(struct trace_event_file *file); /** * trace_trigger_soft_disabled - do triggers and test if soft disabled * @file: The file pointer of the event to test * * If any triggers without filters are attached to this event, they * will be called here. If the event is soft disabled and has no * triggers that require testing the fields, it will return true, * otherwise false. */ static __always_inline bool trace_trigger_soft_disabled(struct trace_event_file *file) { unsigned long eflags = file->flags; if (likely(!(eflags & (EVENT_FILE_FL_TRIGGER_MODE | EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_PID_FILTER)))) return false; if (likely(eflags & EVENT_FILE_FL_TRIGGER_COND)) return false; return __trace_trigger_soft_disabled(file); } #ifdef CONFIG_BPF_EVENTS unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx); int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie); void perf_event_detach_bpf_prog(struct perf_event *event); int perf_event_query_prog_array(struct perf_event *event, void __user *info); int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog); int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog); struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name); void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp); int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, u32 *fd_type, const char **buf, u64 *probe_offset, u64 *probe_addr); int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); #else static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { return 1; } static inline int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { return -EOPNOTSUPP; } static inline void perf_event_detach_bpf_prog(struct perf_event *event) { } static inline int perf_event_query_prog_array(struct perf_event *event, void __user *info) { return -EOPNOTSUPP; } static inline int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *p) { return -EOPNOTSUPP; } static inline int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *p) { return -EOPNOTSUPP; } static inline struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name) { return NULL; } static inline void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) { } static inline int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, u32 *fd_type, const char **buf, u64 *probe_offset, u64 *probe_addr) { return -EOPNOTSUPP; } static inline int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { return -EOPNOTSUPP; } static inline int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { return -EOPNOTSUPP; } #endif enum { FILTER_OTHER = 0, FILTER_STATIC_STRING, FILTER_DYN_STRING, FILTER_RDYN_STRING, FILTER_PTR_STRING, FILTER_TRACE_FN, FILTER_CPUMASK, FILTER_COMM, FILTER_CPU, FILTER_STACKTRACE, }; extern int trace_event_raw_init(struct trace_event_call *call); extern int trace_define_field(struct trace_event_call *call, const char *type, const char *name, int offset, int size, int is_signed, int filter_type); extern int trace_add_event_call(struct trace_event_call *call); extern int trace_remove_event_call(struct trace_event_call *call); extern int trace_event_get_offsets(struct trace_event_call *call); int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set); int trace_set_clr_event(const char *system, const char *event, int set); int trace_array_set_clr_event(struct trace_array *tr, const char *system, const char *event, bool enable); /* * The double __builtin_constant_p is because gcc will give us an error * if we try to allocate the static variable to fmt if it is not a * constant. Even with the outer if statement optimizing out. */ #define event_trace_printk(ip, fmt, args...) \ do { \ __trace_printk_check_format(fmt, ##args); \ tracing_record_cmdline(current); \ if (__builtin_constant_p(fmt)) { \ static const char *trace_printk_fmt \ __section("__trace_printk_fmt") = \ __builtin_constant_p(fmt) ? fmt : NULL; \ \ __trace_bprintk(ip, trace_printk_fmt, ##args); \ } else \ __trace_printk(ip, fmt, ##args); \ } while (0) #ifdef CONFIG_PERF_EVENTS struct perf_event; DECLARE_PER_CPU(struct pt_regs, perf_trace_regs); DECLARE_PER_CPU(int, bpf_kprobe_override); extern int perf_trace_init(struct perf_event *event); extern void perf_trace_destroy(struct perf_event *event); extern int perf_trace_add(struct perf_event *event, int flags); extern void perf_trace_del(struct perf_event *event, int flags); #ifdef CONFIG_KPROBE_EVENTS extern int perf_kprobe_init(struct perf_event *event, bool is_retprobe); extern void perf_kprobe_destroy(struct perf_event *event); extern int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type, const char **symbol, u64 *probe_offset, u64 *probe_addr, bool perf_type_tracepoint); #endif #ifdef CONFIG_UPROBE_EVENTS extern int perf_uprobe_init(struct perf_event *event, unsigned long ref_ctr_offset, bool is_retprobe); extern void perf_uprobe_destroy(struct perf_event *event); extern int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type, const char **filename, u64 *probe_offset, u64 *probe_addr, bool perf_type_tracepoint); #endif extern int ftrace_profile_set_filter(struct perf_event *event, int event_id, char *filter_str); extern void ftrace_profile_free_filter(struct perf_event *event); void perf_trace_buf_update(void *record, u16 type); void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp); int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie); void perf_event_free_bpf_prog(struct perf_event *event); void bpf_trace_run1(struct bpf_prog *prog, u64 arg1); void bpf_trace_run2(struct bpf_prog *prog, u64 arg1, u64 arg2); void bpf_trace_run3(struct bpf_prog *prog, u64 arg1, u64 arg2, u64 arg3); void bpf_trace_run4(struct bpf_prog *prog, u64 arg1, u64 arg2, u64 arg3, u64 arg4); void bpf_trace_run5(struct bpf_prog *prog, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5); void bpf_trace_run6(struct bpf_prog *prog, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6); void bpf_trace_run7(struct bpf_prog *prog, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7); void bpf_trace_run8(struct bpf_prog *prog, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, u64 arg8); void bpf_trace_run9(struct bpf_prog *prog, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, u64 arg8, u64 arg9); void bpf_trace_run10(struct bpf_prog *prog, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, u64 arg8, u64 arg9, u64 arg10); void bpf_trace_run11(struct bpf_prog *prog, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, u64 arg8, u64 arg9, u64 arg10, u64 arg11); void bpf_trace_run12(struct bpf_prog *prog, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, u64 arg8, u64 arg9, u64 arg10, u64 arg11, u64 arg12); void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, struct trace_event_call *call, u64 count, struct pt_regs *regs, struct hlist_head *head, struct task_struct *task); static inline void perf_trace_buf_submit(void *raw_data, int size, int rctx, u16 type, u64 count, struct pt_regs *regs, void *head, struct task_struct *task) { perf_tp_event(type, count, raw_data, size, regs, head, rctx, task); } #endif #define TRACE_EVENT_STR_MAX 512 /* * gcc warns that you can not use a va_list in an inlined * function. But lets me make it into a macro :-/ */ #define __trace_event_vstr_len(fmt, va) \ ({ \ va_list __ap; \ int __ret; \ \ va_copy(__ap, *(va)); \ __ret = vsnprintf(NULL, 0, fmt, __ap) + 1; \ va_end(__ap); \ \ min(__ret, TRACE_EVENT_STR_MAX); \ }) #endif /* _LINUX_TRACE_EVENT_H */ /* * Note: we keep the TRACE_CUSTOM_EVENT outside the include file ifdef protection. * This is due to the way trace custom events work. If a file includes two * trace event headers under one "CREATE_CUSTOM_TRACE_EVENTS" the first include * will override the TRACE_CUSTOM_EVENT and break the second include. */ #ifndef TRACE_CUSTOM_EVENT #define DECLARE_CUSTOM_EVENT_CLASS(name, proto, args, tstruct, assign, print) #define DEFINE_CUSTOM_EVENT(template, name, proto, args) #define TRACE_CUSTOM_EVENT(name, proto, args, struct, assign, print) #endif /* ifdef TRACE_CUSTOM_EVENT (see note above) */
1637 607 607 606 607 54 104 103 50 44 104 104 1615 100 1632 100 100 100 1633 2533 2533 820 820 2533 1 1296 1299 1295 1138 1141 1141 27 27 26 1124 1050 1049 899 1049 1049 191 900 900 191 292 1123 1194 966 22 1123 1194 1637 1637 1633 96 48 74 1633 1614 1436 1436 1633 8 1637 1637 1588 1587 1587 1633 1633 1189 1141 1187 1186 219 88 87 2533 2533 2533 2048 3549 3550 3550 3550 3550 3549 3497 3550 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/file.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/file.c * * Copyright (C) 1991, 1992 Linus Torvalds * * ext4 fs regular file handling primitives * * 64-bit file support on 64-bit platforms by Jakub Jelinek * (jj@sunsite.ms.mff.cuni.cz) */ #include <linux/time.h> #include <linux/fs.h> #include <linux/iomap.h> #include <linux/mount.h> #include <linux/path.h> #include <linux/dax.h> #include <linux/quotaops.h> #include <linux/pagevec.h> #include <linux/uio.h> #include <linux/mman.h> #include <linux/backing-dev.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include "truncate.h" /* * Returns %true if the given DIO request should be attempted with DIO, or * %false if it should fall back to buffered I/O. * * DIO isn't well specified; when it's unsupported (either due to the request * being misaligned, or due to the file not supporting DIO at all), filesystems * either fall back to buffered I/O or return EINVAL. For files that don't use * any special features like encryption or verity, ext4 has traditionally * returned EINVAL for misaligned DIO. iomap_dio_rw() uses this convention too. * In this case, we should attempt the DIO, *not* fall back to buffered I/O. * * In contrast, in cases where DIO is unsupported due to ext4 features, ext4 * traditionally falls back to buffered I/O. * * This function implements the traditional ext4 behavior in all these cases. */ static bool ext4_should_use_dio(struct kiocb *iocb, struct iov_iter *iter) { struct inode *inode = file_inode(iocb->ki_filp); u32 dio_align = ext4_dio_alignment(inode); if (dio_align == 0) return false; if (dio_align == 1) return true; return IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), dio_align); } static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) { ssize_t ret; struct inode *inode = file_inode(iocb->ki_filp); if (iocb->ki_flags & IOCB_NOWAIT) { if (!inode_trylock_shared(inode)) return -EAGAIN; } else { inode_lock_shared(inode); } if (!ext4_should_use_dio(iocb, to)) { inode_unlock_shared(inode); /* * Fallback to buffered I/O if the operation being performed on * the inode is not supported by direct I/O. The IOCB_DIRECT * flag needs to be cleared here in order to ensure that the * direct I/O path within generic_file_read_iter() is not * taken. */ iocb->ki_flags &= ~IOCB_DIRECT; return generic_file_read_iter(iocb, to); } ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, NULL, 0); inode_unlock_shared(inode); file_accessed(iocb->ki_filp); return ret; } #ifdef CONFIG_FS_DAX static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; if (iocb->ki_flags & IOCB_NOWAIT) { if (!inode_trylock_shared(inode)) return -EAGAIN; } else { inode_lock_shared(inode); } /* * Recheck under inode lock - at this point we are sure it cannot * change anymore */ if (!IS_DAX(inode)) { inode_unlock_shared(inode); /* Fallback to buffered IO in case we cannot support DAX */ return generic_file_read_iter(iocb, to); } ret = dax_iomap_rw(iocb, to, &ext4_iomap_ops); inode_unlock_shared(inode); file_accessed(iocb->ki_filp); return ret; } #endif static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (!iov_iter_count(to)) return 0; /* skip atime */ #ifdef CONFIG_FS_DAX if (IS_DAX(inode)) return ext4_dax_read_iter(iocb, to); #endif if (iocb->ki_flags & IOCB_DIRECT) return ext4_dio_read_iter(iocb, to); return generic_file_read_iter(iocb, to); } static ssize_t ext4_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct inode *inode = file_inode(in); if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; return filemap_splice_read(in, ppos, pipe, len, flags); } /* * Called when an inode is released. Note that this is different * from ext4_file_open: open gets called at every open, but release * gets called only when /all/ the files are closed. */ static int ext4_release_file(struct inode *inode, struct file *filp) { if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) { ext4_alloc_da_blocks(inode); ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); } /* if we are the last writer on the inode, drop the block reservation */ if ((filp->f_mode & FMODE_WRITE) && (atomic_read(&inode->i_writecount) == 1) && !EXT4_I(inode)->i_reserved_data_blocks) { down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode, 0); up_write(&EXT4_I(inode)->i_data_sem); } if (is_dx(inode) && filp->private_data) ext4_htree_free_dir_info(filp->private_data); return 0; } /* * This tests whether the IO in question is block-aligned or not. * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they * are converted to written only after the IO is complete. Until they are * mapped, these blocks appear as holes, so dio_zero_block() will assume that * it needs to zero out portions of the start and/or end block. If 2 AIO * threads are at work on the same unwritten block, they must be synchronized * or one thread will zero the other's data, causing corruption. */ static bool ext4_unaligned_io(struct inode *inode, struct iov_iter *from, loff_t pos) { struct super_block *sb = inode->i_sb; unsigned long blockmask = sb->s_blocksize - 1; if ((pos | iov_iter_alignment(from)) & blockmask) return true; return false; } static bool ext4_extending_io(struct inode *inode, loff_t offset, size_t len) { if (offset + len > i_size_read(inode) || offset + len > EXT4_I(inode)->i_disksize) return true; return false; } /* Is IO overwriting allocated or initialized blocks? */ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len, bool *unwritten) { struct ext4_map_blocks map; unsigned int blkbits = inode->i_blkbits; int err, blklen; if (pos + len > i_size_read(inode)) return false; map.m_lblk = pos >> blkbits; map.m_len = EXT4_MAX_BLOCKS(len, pos, blkbits); blklen = map.m_len; err = ext4_map_blocks(NULL, inode, &map, 0); if (err != blklen) return false; /* * 'err==len' means that all of the blocks have been preallocated, * regardless of whether they have been initialized or not. We need to * check m_flags to distinguish the unwritten extents. */ *unwritten = !(map.m_flags & EXT4_MAP_MAPPED); return true; } static ssize_t ext4_generic_write_checks(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; if (unlikely(IS_IMMUTABLE(inode))) return -EPERM; ret = generic_write_checks(iocb, from); if (ret <= 0) return ret; /* * If we have encountered a bitmap-format file, the size limit * is smaller than s_maxbytes, which is for extent-mapped files. */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (iocb->ki_pos >= sbi->s_bitmap_maxbytes) return -EFBIG; iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); } return iov_iter_count(from); } static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) { ssize_t ret, count; count = ext4_generic_write_checks(iocb, from); if (count <= 0) return count; ret = file_modified(iocb->ki_filp); if (ret) return ret; return count; } static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, struct iov_iter *from) { ssize_t ret; struct inode *inode = file_inode(iocb->ki_filp); if (iocb->ki_flags & IOCB_NOWAIT) return -EOPNOTSUPP; inode_lock(inode); ret = ext4_write_checks(iocb, from); if (ret <= 0) goto out; ret = generic_perform_write(iocb, from); out: inode_unlock(inode); if (unlikely(ret <= 0)) return ret; return generic_write_sync(iocb, ret); } static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, ssize_t written, size_t count) { handle_t *handle; bool truncate = false; u8 blkbits = inode->i_blkbits; ext4_lblk_t written_blk, end_blk; int ret; /* * Note that EXT4_I(inode)->i_disksize can get extended up to * inode->i_size while the I/O was running due to writeback of delalloc * blocks. But, the code in ext4_iomap_alloc() is careful to use * zeroed/unwritten extents if this is possible; thus we won't leave * uninitialized blocks in a file even if we didn't succeed in writing * as much as we intended. */ WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize); if (offset + count <= EXT4_I(inode)->i_disksize) { /* * We need to ensure that the inode is removed from the orphan * list if it has been added prematurely, due to writeback of * delalloc blocks. */ if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) { handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { ext4_orphan_del(NULL, inode); return PTR_ERR(handle); } ext4_orphan_del(handle, inode); ext4_journal_stop(handle); } return written; } if (written < 0) goto truncate; handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { written = PTR_ERR(handle); goto truncate; } if (ext4_update_inode_size(inode, offset + written)) { ret = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret)) { written = ret; ext4_journal_stop(handle); goto truncate; } } /* * We may need to truncate allocated but not written blocks beyond EOF. */ written_blk = ALIGN(offset + written, 1 << blkbits); end_blk = ALIGN(offset + count, 1 << blkbits); if (written_blk < end_blk && ext4_can_truncate(inode)) truncate = true; /* * Remove the inode from the orphan list if it has been extended and * everything went OK. */ if (!truncate && inode->i_nlink) ext4_orphan_del(handle, inode); ext4_journal_stop(handle); if (truncate) { truncate: ext4_truncate_failed_write(inode); /* * If the truncate operation failed early, then the inode may * still be on the orphan list. In that case, we need to try * remove the inode from the in-memory linked list. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } return written; } static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error, unsigned int flags) { loff_t pos = iocb->ki_pos; struct inode *inode = file_inode(iocb->ki_filp); if (error) return error; if (size && flags & IOMAP_DIO_UNWRITTEN) { error = ext4_convert_unwritten_extents(NULL, inode, pos, size); if (error < 0) return error; } /* * If we are extending the file, we have to update i_size here before * page cache gets invalidated in iomap_dio_rw(). Otherwise racing * buffered reads could zero out too much from page cache pages. Update * of on-disk size will happen later in ext4_dio_write_iter() where * we have enough information to also perform orphan list handling etc. * Note that we perform all extending writes synchronously under * i_rwsem held exclusively so i_size update is safe here in that case. * If the write was not extending, we cannot see pos > i_size here * because operations reducing i_size like truncate wait for all * outstanding DIO before updating i_size. */ pos += size; if (pos > i_size_read(inode)) i_size_write(inode, pos); return 0; } static const struct iomap_dio_ops ext4_dio_write_ops = { .end_io = ext4_dio_write_end_io, }; /* * The intention here is to start with shared lock acquired then see if any * condition requires an exclusive inode lock. If yes, then we restart the * whole operation by releasing the shared lock and acquiring exclusive lock. * * - For unaligned_io we never take shared lock as it may cause data corruption * when two unaligned IO tries to modify the same block e.g. while zeroing. * * - For extending writes case we don't take the shared lock, since it requires * updating inode i_disksize and/or orphan handling with exclusive lock. * * - shared locking will only be true mostly with overwrites, including * initialized blocks and unwritten blocks. For overwrite unwritten blocks * we protect splitting extents by i_data_sem in ext4_inode_info, so we can * also release exclusive i_rwsem lock. * * - Otherwise we will switch to exclusive i_rwsem lock. */ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, bool *ilock_shared, bool *extend, bool *unwritten, int *dio_flags) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); loff_t offset; size_t count; ssize_t ret; bool overwrite, unaligned_io; restart: ret = ext4_generic_write_checks(iocb, from); if (ret <= 0) goto out; offset = iocb->ki_pos; count = ret; unaligned_io = ext4_unaligned_io(inode, from, offset); *extend = ext4_extending_io(inode, offset, count); overwrite = ext4_overwrite_io(inode, offset, count, unwritten); /* * Determine whether we need to upgrade to an exclusive lock. This is * required to change security info in file_modified(), for extending * I/O, any form of non-overwrite I/O, and unaligned I/O to unwritten * extents (as partial block zeroing may be required). * * Note that unaligned writes are allowed under shared lock so long as * they are pure overwrites. Otherwise, concurrent unaligned writes risk * data corruption due to partial block zeroing in the dio layer, and so * the I/O must occur exclusively. */ if (*ilock_shared && ((!IS_NOSEC(inode) || *extend || !overwrite || (unaligned_io && *unwritten)))) { if (iocb->ki_flags & IOCB_NOWAIT) { ret = -EAGAIN; goto out; } inode_unlock_shared(inode); *ilock_shared = false; inode_lock(inode); goto restart; } /* * Now that locking is settled, determine dio flags and exclusivity * requirements. We don't use DIO_OVERWRITE_ONLY because we enforce * behavior already. The inode lock is already held exclusive if the * write is non-overwrite or extending, so drain all outstanding dio and * set the force wait dio flag. */ if (!*ilock_shared && (unaligned_io || *extend)) { if (iocb->ki_flags & IOCB_NOWAIT) { ret = -EAGAIN; goto out; } if (unaligned_io && (!overwrite || *unwritten)) inode_dio_wait(inode); *dio_flags = IOMAP_DIO_FORCE_WAIT; } ret = file_modified(file); if (ret < 0) goto out; return count; out: if (*ilock_shared) inode_unlock_shared(inode); else inode_unlock(inode); return ret; } static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) { ssize_t ret; handle_t *handle; struct inode *inode = file_inode(iocb->ki_filp); loff_t offset = iocb->ki_pos; size_t count = iov_iter_count(from); const struct iomap_ops *iomap_ops = &ext4_iomap_ops; bool extend = false, unwritten = false; bool ilock_shared = true; int dio_flags = 0; /* * Quick check here without any i_rwsem lock to see if it is extending * IO. A more reliable check is done in ext4_dio_write_checks() with * proper locking in place. */ if (offset + count > i_size_read(inode)) ilock_shared = false; if (iocb->ki_flags & IOCB_NOWAIT) { if (ilock_shared) { if (!inode_trylock_shared(inode)) return -EAGAIN; } else { if (!inode_trylock(inode)) return -EAGAIN; } } else { if (ilock_shared) inode_lock_shared(inode); else inode_lock(inode); } /* Fallback to buffered I/O if the inode does not support direct I/O. */ if (!ext4_should_use_dio(iocb, from)) { if (ilock_shared) inode_unlock_shared(inode); else inode_unlock(inode); return ext4_buffered_write_iter(iocb, from); } ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend, &unwritten, &dio_flags); if (ret <= 0) return ret; /* * Make sure inline data cannot be created anymore since we are going * to allocate blocks for DIO. We know the inode does not have any * inline data now because ext4_dio_supported() checked for that. */ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); offset = iocb->ki_pos; count = ret; if (extend) { handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; } ret = ext4_orphan_add(handle, inode); if (ret) { ext4_journal_stop(handle); goto out; } ext4_journal_stop(handle); } if (ilock_shared && !unwritten) iomap_ops = &ext4_iomap_overwrite_ops; ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, dio_flags, NULL, 0); if (ret == -ENOTBLK) ret = 0; if (extend) ret = ext4_handle_inode_extension(inode, offset, ret, count); out: if (ilock_shared) inode_unlock_shared(inode); else inode_unlock(inode); if (ret >= 0 && iov_iter_count(from)) { ssize_t err; loff_t endbyte; offset = iocb->ki_pos; err = ext4_buffered_write_iter(iocb, from); if (err < 0) return err; /* * We need to ensure that the pages within the page cache for * the range covered by this I/O are written to disk and * invalidated. This is in attempt to preserve the expected * direct I/O semantics in the case we fallback to buffered I/O * to complete off the I/O request. */ ret += err; endbyte = offset + err - 1; err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping, offset, endbyte); if (!err) invalidate_mapping_pages(iocb->ki_filp->f_mapping, offset >> PAGE_SHIFT, endbyte >> PAGE_SHIFT); } return ret; } #ifdef CONFIG_FS_DAX static ssize_t ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) { ssize_t ret; size_t count; loff_t offset; handle_t *handle; bool extend = false; struct inode *inode = file_inode(iocb->ki_filp); if (iocb->ki_flags & IOCB_NOWAIT) { if (!inode_trylock(inode)) return -EAGAIN; } else { inode_lock(inode); } ret = ext4_write_checks(iocb, from); if (ret <= 0) goto out; offset = iocb->ki_pos; count = iov_iter_count(from); if (offset + count > EXT4_I(inode)->i_disksize) { handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; } ret = ext4_orphan_add(handle, inode); if (ret) { ext4_journal_stop(handle); goto out; } extend = true; ext4_journal_stop(handle); } ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); if (extend) ret = ext4_handle_inode_extension(inode, offset, ret, count); out: inode_unlock(inode); if (ret > 0) ret = generic_write_sync(iocb, ret); return ret; } #endif static ssize_t ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; #ifdef CONFIG_FS_DAX if (IS_DAX(inode)) return ext4_dax_write_iter(iocb, from); #endif if (iocb->ki_flags & IOCB_DIRECT) return ext4_dio_write_iter(iocb, from); else return ext4_buffered_write_iter(iocb, from); } #ifdef CONFIG_FS_DAX static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, unsigned int order) { int error = 0; vm_fault_t result; int retries = 0; handle_t *handle = NULL; struct inode *inode = file_inode(vmf->vma->vm_file); struct super_block *sb = inode->i_sb; /* * We have to distinguish real writes from writes which will result in a * COW page; COW writes should *not* poke the journal (the file will not * be changed). Doing so would cause unintended failures when mounted * read-only. * * We check for VM_SHARED rather than vmf->cow_page since the latter is * unset for order != 0 (i.e. only in do_cow_fault); for * other sizes, dax_iomap_fault will handle splitting / fallback so that * we eventually come back with a COW page. */ bool write = (vmf->flags & FAULT_FLAG_WRITE) && (vmf->vma->vm_flags & VM_SHARED); struct address_space *mapping = vmf->vma->vm_file->f_mapping; pfn_t pfn; if (write) { sb_start_pagefault(sb); file_update_time(vmf->vma->vm_file); filemap_invalidate_lock_shared(mapping); retry: handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, EXT4_DATA_TRANS_BLOCKS(sb)); if (IS_ERR(handle)) { filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(sb); return VM_FAULT_SIGBUS; } } else { filemap_invalidate_lock_shared(mapping); } result = dax_iomap_fault(vmf, order, &pfn, &error, &ext4_iomap_ops); if (write) { ext4_journal_stop(handle); if ((result & VM_FAULT_ERROR) && error == -ENOSPC && ext4_should_retry_alloc(sb, &retries)) goto retry; /* Handling synchronous page fault? */ if (result & VM_FAULT_NEEDDSYNC) result = dax_finish_sync_fault(vmf, order, pfn); filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(sb); } else { filemap_invalidate_unlock_shared(mapping); } return result; } static vm_fault_t ext4_dax_fault(struct vm_fault *vmf) { return ext4_dax_huge_fault(vmf, 0); } static const struct vm_operations_struct ext4_dax_vm_ops = { .fault = ext4_dax_fault, .huge_fault = ext4_dax_huge_fault, .page_mkwrite = ext4_dax_fault, .pfn_mkwrite = ext4_dax_fault, }; #else #define ext4_dax_vm_ops ext4_file_vm_ops #endif static const struct vm_operations_struct ext4_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = ext4_page_mkwrite, }; static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file->f_mapping->host; struct dax_device *dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; /* * We don't support synchronous mappings for non-DAX files and * for DAX files if underneath dax_device is not synchronous. */ if (!daxdev_mapping_supported(vma, dax_dev)) return -EOPNOTSUPP; file_accessed(file); if (IS_DAX(file_inode(file))) { vma->vm_ops = &ext4_dax_vm_ops; vm_flags_set(vma, VM_HUGEPAGE); } else { vma->vm_ops = &ext4_file_vm_ops; } return 0; } static int ext4_sample_last_mounted(struct super_block *sb, struct vfsmount *mnt) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct path path; char buf[64], *cp; handle_t *handle; int err; if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED))) return 0; if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb)) return 0; ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED); /* * Sample where the filesystem has been mounted and * store it in the superblock for sysadmin convenience * when trying to sort through large numbers of block * devices or filesystem images. */ memset(buf, 0, sizeof(buf)); path.mnt = mnt; path.dentry = mnt->mnt_root; cp = d_path(&path, buf, sizeof(buf)); err = 0; if (IS_ERR(cp)) goto out; handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); err = PTR_ERR(handle); if (IS_ERR(handle)) goto out; BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, EXT4_JTR_NONE); if (err) goto out_journal; lock_buffer(sbi->s_sbh); strncpy(sbi->s_es->s_last_mounted, cp, sizeof(sbi->s_es->s_last_mounted)); ext4_superblock_csum_set(sb); unlock_buffer(sbi->s_sbh); ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); out_journal: ext4_journal_stop(handle); out: sb_end_intwrite(sb); return err; } static int ext4_file_open(struct inode *inode, struct file *filp) { int ret; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt); if (ret) return ret; ret = fscrypt_file_open(inode, filp); if (ret) return ret; ret = fsverity_file_open(inode, filp); if (ret) return ret; /* * Set up the jbd2_inode if we are opening the inode for * writing and the journal is present */ if (filp->f_mode & FMODE_WRITE) { ret = ext4_inode_attach_jinode(inode); if (ret < 0) return ret; } filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_DIO_PARALLEL_WRITE; return dquot_file_open(inode, filp); } /* * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values * by calling generic_file_llseek_size() with the appropriate maxbytes * value for each. */ loff_t ext4_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; loff_t maxbytes; if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; else maxbytes = inode->i_sb->s_maxbytes; switch (whence) { default: return generic_file_llseek_size(file, offset, whence, maxbytes, i_size_read(inode)); case SEEK_HOLE: inode_lock_shared(inode); offset = iomap_seek_hole(inode, offset, &ext4_iomap_report_ops); inode_unlock_shared(inode); break; case SEEK_DATA: inode_lock_shared(inode); offset = iomap_seek_data(inode, offset, &ext4_iomap_report_ops); inode_unlock_shared(inode); break; } if (offset < 0) return offset; return vfs_setpos(file, offset, maxbytes); } const struct file_operations ext4_file_operations = { .llseek = ext4_llseek, .read_iter = ext4_file_read_iter, .write_iter = ext4_file_write_iter, .iopoll = iocb_bio_iopoll, .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, #endif .mmap = ext4_file_mmap, .mmap_supported_flags = MAP_SYNC, .open = ext4_file_open, .release = ext4_release_file, .fsync = ext4_sync_file, .get_unmapped_area = thp_get_unmapped_area, .splice_read = ext4_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ext4_fallocate, }; const struct inode_operations ext4_file_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_file_getattr, .listxattr = ext4_listxattr, .get_inode_acl = ext4_get_acl, .set_acl = ext4_set_acl, .fiemap = ext4_fiemap, .fileattr_get = ext4_fileattr_get, .fileattr_set = ext4_fileattr_set, };
1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 // SPDX-License-Identifier: GPL-2.0-only /* * IEEE 802.1D Generic Attribute Registration Protocol (GARP) * * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> */ #include <linux/kernel.h> #include <linux/timer.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <linux/llc.h> #include <linux/slab.h> #include <linux/module.h> #include <net/llc.h> #include <net/llc_pdu.h> #include <net/garp.h> #include <asm/unaligned.h> static unsigned int garp_join_time __read_mostly = 200; module_param(garp_join_time, uint, 0644); MODULE_PARM_DESC(garp_join_time, "Join time in ms (default 200ms)"); MODULE_LICENSE("GPL"); static const struct garp_state_trans { u8 state; u8 action; } garp_applicant_state_table[GARP_APPLICANT_MAX + 1][GARP_EVENT_MAX + 1] = { [GARP_APPLICANT_VA] = { [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_AA, .action = GARP_ACTION_S_JOIN_IN }, [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_AA }, [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VA }, [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VA }, [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VA }, [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VP }, [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_INVALID }, [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_LA }, }, [GARP_APPLICANT_AA] = { [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_QA, .action = GARP_ACTION_S_JOIN_IN }, [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_QA }, [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VA }, [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VA }, [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VA }, [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VP }, [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_INVALID }, [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_LA }, }, [GARP_APPLICANT_QA] = { [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_INVALID }, [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_QA }, [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VA }, [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VA }, [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VP }, [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VP }, [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_INVALID }, [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_LA }, }, [GARP_APPLICANT_LA] = { [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_VO, .action = GARP_ACTION_S_LEAVE_EMPTY }, [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_LA }, [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VO }, [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_LA }, [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_LA }, [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VO }, [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_VA }, [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_INVALID }, }, [GARP_APPLICANT_VP] = { [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_AA, .action = GARP_ACTION_S_JOIN_IN }, [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_AP }, [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VP }, [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VP }, [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VP }, [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VP }, [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_INVALID }, [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_VO }, }, [GARP_APPLICANT_AP] = { [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_QA, .action = GARP_ACTION_S_JOIN_IN }, [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_QP }, [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VP }, [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VP }, [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VP }, [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VP }, [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_INVALID }, [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_AO }, }, [GARP_APPLICANT_QP] = { [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_INVALID }, [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_QP }, [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VP }, [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VP }, [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VP }, [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VP }, [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_INVALID }, [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_QO }, }, [GARP_APPLICANT_VO] = { [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_INVALID }, [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_AO }, [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VO }, [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VO }, [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VO }, [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VO }, [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_VP }, [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_INVALID }, }, [GARP_APPLICANT_AO] = { [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_INVALID }, [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_QO }, [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VO }, [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VO }, [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VO }, [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VO }, [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_AP }, [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_INVALID }, }, [GARP_APPLICANT_QO] = { [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_INVALID }, [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_QO }, [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VO }, [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VO }, [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VO }, [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VO }, [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_QP }, [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_INVALID }, }, }; static int garp_attr_cmp(const struct garp_attr *attr, const void *data, u8 len, u8 type) { if (attr->type != type) return attr->type - type; if (attr->dlen != len) return attr->dlen - len; return memcmp(attr->data, data, len); } static struct garp_attr *garp_attr_lookup(const struct garp_applicant *app, const void *data, u8 len, u8 type) { struct rb_node *parent = app->gid.rb_node; struct garp_attr *attr; int d; while (parent) { attr = rb_entry(parent, struct garp_attr, node); d = garp_attr_cmp(attr, data, len, type); if (d > 0) parent = parent->rb_left; else if (d < 0) parent = parent->rb_right; else return attr; } return NULL; } static struct garp_attr *garp_attr_create(struct garp_applicant *app, const void *data, u8 len, u8 type) { struct rb_node *parent = NULL, **p = &app->gid.rb_node; struct garp_attr *attr; int d; while (*p) { parent = *p; attr = rb_entry(parent, struct garp_attr, node); d = garp_attr_cmp(attr, data, len, type); if (d > 0) p = &parent->rb_left; else if (d < 0) p = &parent->rb_right; else { /* The attribute already exists; re-use it. */ return attr; } } attr = kmalloc(sizeof(*attr) + len, GFP_ATOMIC); if (!attr) return attr; attr->state = GARP_APPLICANT_VO; attr->type = type; attr->dlen = len; memcpy(attr->data, data, len); rb_link_node(&attr->node, parent, p); rb_insert_color(&attr->node, &app->gid); return attr; } static void garp_attr_destroy(struct garp_applicant *app, struct garp_attr *attr) { rb_erase(&attr->node, &app->gid); kfree(attr); } static void garp_attr_destroy_all(struct garp_applicant *app) { struct rb_node *node, *next; struct garp_attr *attr; for (node = rb_first(&app->gid); next = node ? rb_next(node) : NULL, node != NULL; node = next) { attr = rb_entry(node, struct garp_attr, node); garp_attr_destroy(app, attr); } } static int garp_pdu_init(struct garp_applicant *app) { struct sk_buff *skb; struct garp_pdu_hdr *gp; #define LLC_RESERVE sizeof(struct llc_pdu_un) skb = alloc_skb(app->dev->mtu + LL_RESERVED_SPACE(app->dev), GFP_ATOMIC); if (!skb) return -ENOMEM; skb->dev = app->dev; skb->protocol = htons(ETH_P_802_2); skb_reserve(skb, LL_RESERVED_SPACE(app->dev) + LLC_RESERVE); gp = __skb_put(skb, sizeof(*gp)); put_unaligned(htons(GARP_PROTOCOL_ID), &gp->protocol); app->pdu = skb; return 0; } static int garp_pdu_append_end_mark(struct garp_applicant *app) { if (skb_tailroom(app->pdu) < sizeof(u8)) return -1; __skb_put_u8(app->pdu, GARP_END_MARK); return 0; } static void garp_pdu_queue(struct garp_applicant *app) { if (!app->pdu) return; garp_pdu_append_end_mark(app); garp_pdu_append_end_mark(app); llc_pdu_header_init(app->pdu, LLC_PDU_TYPE_U, LLC_SAP_BSPAN, LLC_SAP_BSPAN, LLC_PDU_CMD); llc_pdu_init_as_ui_cmd(app->pdu); llc_mac_hdr_init(app->pdu, app->dev->dev_addr, app->app->proto.group_address); skb_queue_tail(&app->queue, app->pdu); app->pdu = NULL; } static void garp_queue_xmit(struct garp_applicant *app) { struct sk_buff *skb; while ((skb = skb_dequeue(&app->queue))) dev_queue_xmit(skb); } static int garp_pdu_append_msg(struct garp_applicant *app, u8 attrtype) { struct garp_msg_hdr *gm; if (skb_tailroom(app->pdu) < sizeof(*gm)) return -1; gm = __skb_put(app->pdu, sizeof(*gm)); gm->attrtype = attrtype; garp_cb(app->pdu)->cur_type = attrtype; return 0; } static int garp_pdu_append_attr(struct garp_applicant *app, const struct garp_attr *attr, enum garp_attr_event event) { struct garp_attr_hdr *ga; unsigned int len; int err; again: if (!app->pdu) { err = garp_pdu_init(app); if (err < 0) return err; } if (garp_cb(app->pdu)->cur_type != attr->type) { if (garp_cb(app->pdu)->cur_type && garp_pdu_append_end_mark(app) < 0) goto queue; if (garp_pdu_append_msg(app, attr->type) < 0) goto queue; } len = sizeof(*ga) + attr->dlen; if (skb_tailroom(app->pdu) < len) goto queue; ga = __skb_put(app->pdu, len); ga->len = len; ga->event = event; memcpy(ga->data, attr->data, attr->dlen); return 0; queue: garp_pdu_queue(app); goto again; } static void garp_attr_event(struct garp_applicant *app, struct garp_attr *attr, enum garp_event event) { enum garp_applicant_state state; state = garp_applicant_state_table[attr->state][event].state; if (state == GARP_APPLICANT_INVALID) return; switch (garp_applicant_state_table[attr->state][event].action) { case GARP_ACTION_NONE: break; case GARP_ACTION_S_JOIN_IN: /* When appending the attribute fails, don't update state in * order to retry on next TRANSMIT_PDU event. */ if (garp_pdu_append_attr(app, attr, GARP_JOIN_IN) < 0) return; break; case GARP_ACTION_S_LEAVE_EMPTY: garp_pdu_append_attr(app, attr, GARP_LEAVE_EMPTY); /* As a pure applicant, sending a leave message implies that * the attribute was unregistered and can be destroyed. */ garp_attr_destroy(app, attr); return; default: WARN_ON(1); } attr->state = state; } int garp_request_join(const struct net_device *dev, const struct garp_application *appl, const void *data, u8 len, u8 type) { struct garp_port *port = rtnl_dereference(dev->garp_port); struct garp_applicant *app = rtnl_dereference(port->applicants[appl->type]); struct garp_attr *attr; spin_lock_bh(&app->lock); attr = garp_attr_create(app, data, len, type); if (!attr) { spin_unlock_bh(&app->lock); return -ENOMEM; } garp_attr_event(app, attr, GARP_EVENT_REQ_JOIN); spin_unlock_bh(&app->lock); return 0; } EXPORT_SYMBOL_GPL(garp_request_join); void garp_request_leave(const struct net_device *dev, const struct garp_application *appl, const void *data, u8 len, u8 type) { struct garp_port *port = rtnl_dereference(dev->garp_port); struct garp_applicant *app = rtnl_dereference(port->applicants[appl->type]); struct garp_attr *attr; spin_lock_bh(&app->lock); attr = garp_attr_lookup(app, data, len, type); if (!attr) { spin_unlock_bh(&app->lock); return; } garp_attr_event(app, attr, GARP_EVENT_REQ_LEAVE); spin_unlock_bh(&app->lock); } EXPORT_SYMBOL_GPL(garp_request_leave); static void garp_gid_event(struct garp_applicant *app, enum garp_event event) { struct rb_node *node, *next; struct garp_attr *attr; for (node = rb_first(&app->gid); next = node ? rb_next(node) : NULL, node != NULL; node = next) { attr = rb_entry(node, struct garp_attr, node); garp_attr_event(app, attr, event); } } static void garp_join_timer_arm(struct garp_applicant *app) { unsigned long delay; delay = get_random_u32_below(msecs_to_jiffies(garp_join_time)); mod_timer(&app->join_timer, jiffies + delay); } static void garp_join_timer(struct timer_list *t) { struct garp_applicant *app = from_timer(app, t, join_timer); spin_lock(&app->lock); garp_gid_event(app, GARP_EVENT_TRANSMIT_PDU); garp_pdu_queue(app); spin_unlock(&app->lock); garp_queue_xmit(app); garp_join_timer_arm(app); } static int garp_pdu_parse_end_mark(struct sk_buff *skb) { if (!pskb_may_pull(skb, sizeof(u8))) return -1; if (*skb->data == GARP_END_MARK) { skb_pull(skb, sizeof(u8)); return -1; } return 0; } static int garp_pdu_parse_attr(struct garp_applicant *app, struct sk_buff *skb, u8 attrtype) { const struct garp_attr_hdr *ga; struct garp_attr *attr; enum garp_event event; unsigned int dlen; if (!pskb_may_pull(skb, sizeof(*ga))) return -1; ga = (struct garp_attr_hdr *)skb->data; if (ga->len < sizeof(*ga)) return -1; if (!pskb_may_pull(skb, ga->len)) return -1; skb_pull(skb, ga->len); dlen = sizeof(*ga) - ga->len; if (attrtype > app->app->maxattr) return 0; switch (ga->event) { case GARP_LEAVE_ALL: if (dlen != 0) return -1; garp_gid_event(app, GARP_EVENT_R_LEAVE_EMPTY); return 0; case GARP_JOIN_EMPTY: event = GARP_EVENT_R_JOIN_EMPTY; break; case GARP_JOIN_IN: event = GARP_EVENT_R_JOIN_IN; break; case GARP_LEAVE_EMPTY: event = GARP_EVENT_R_LEAVE_EMPTY; break; case GARP_EMPTY: event = GARP_EVENT_R_EMPTY; break; default: return 0; } if (dlen == 0) return -1; attr = garp_attr_lookup(app, ga->data, dlen, attrtype); if (attr == NULL) return 0; garp_attr_event(app, attr, event); return 0; } static int garp_pdu_parse_msg(struct garp_applicant *app, struct sk_buff *skb) { const struct garp_msg_hdr *gm; if (!pskb_may_pull(skb, sizeof(*gm))) return -1; gm = (struct garp_msg_hdr *)skb->data; if (gm->attrtype == 0) return -1; skb_pull(skb, sizeof(*gm)); while (skb->len > 0) { if (garp_pdu_parse_attr(app, skb, gm->attrtype) < 0) return -1; if (garp_pdu_parse_end_mark(skb) < 0) break; } return 0; } static void garp_pdu_rcv(const struct stp_proto *proto, struct sk_buff *skb, struct net_device *dev) { struct garp_application *appl = proto->data; struct garp_port *port; struct garp_applicant *app; const struct garp_pdu_hdr *gp; port = rcu_dereference(dev->garp_port); if (!port) goto err; app = rcu_dereference(port->applicants[appl->type]); if (!app) goto err; if (!pskb_may_pull(skb, sizeof(*gp))) goto err; gp = (struct garp_pdu_hdr *)skb->data; if (get_unaligned(&gp->protocol) != htons(GARP_PROTOCOL_ID)) goto err; skb_pull(skb, sizeof(*gp)); spin_lock(&app->lock); while (skb->len > 0) { if (garp_pdu_parse_msg(app, skb) < 0) break; if (garp_pdu_parse_end_mark(skb) < 0) break; } spin_unlock(&app->lock); err: kfree_skb(skb); } static int garp_init_port(struct net_device *dev) { struct garp_port *port; port = kzalloc(sizeof(*port), GFP_KERNEL); if (!port) return -ENOMEM; rcu_assign_pointer(dev->garp_port, port); return 0; } static void garp_release_port(struct net_device *dev) { struct garp_port *port = rtnl_dereference(dev->garp_port); unsigned int i; for (i = 0; i <= GARP_APPLICATION_MAX; i++) { if (rtnl_dereference(port->applicants[i])) return; } RCU_INIT_POINTER(dev->garp_port, NULL); kfree_rcu(port, rcu); } int garp_init_applicant(struct net_device *dev, struct garp_application *appl) { struct garp_applicant *app; int err; ASSERT_RTNL(); if (!rtnl_dereference(dev->garp_port)) { err = garp_init_port(dev); if (err < 0) goto err1; } err = -ENOMEM; app = kzalloc(sizeof(*app), GFP_KERNEL); if (!app) goto err2; err = dev_mc_add(dev, appl->proto.group_address); if (err < 0) goto err3; app->dev = dev; app->app = appl; app->gid = RB_ROOT; spin_lock_init(&app->lock); skb_queue_head_init(&app->queue); rcu_assign_pointer(dev->garp_port->applicants[appl->type], app); timer_setup(&app->join_timer, garp_join_timer, 0); garp_join_timer_arm(app); return 0; err3: kfree(app); err2: garp_release_port(dev); err1: return err; } EXPORT_SYMBOL_GPL(garp_init_applicant); void garp_uninit_applicant(struct net_device *dev, struct garp_application *appl) { struct garp_port *port = rtnl_dereference(dev->garp_port); struct garp_applicant *app = rtnl_dereference(port->applicants[appl->type]); ASSERT_RTNL(); RCU_INIT_POINTER(port->applicants[appl->type], NULL); /* Delete timer and generate a final TRANSMIT_PDU event to flush out * all pending messages before the applicant is gone. */ timer_shutdown_sync(&app->join_timer); spin_lock_bh(&app->lock); garp_gid_event(app, GARP_EVENT_TRANSMIT_PDU); garp_attr_destroy_all(app); garp_pdu_queue(app); spin_unlock_bh(&app->lock); garp_queue_xmit(app); dev_mc_del(dev, appl->proto.group_address); kfree_rcu(app, rcu); garp_release_port(dev); } EXPORT_SYMBOL_GPL(garp_uninit_applicant); int garp_register_application(struct garp_application *appl) { appl->proto.rcv = garp_pdu_rcv; appl->proto.data = appl; return stp_proto_register(&appl->proto); } EXPORT_SYMBOL_GPL(garp_register_application); void garp_unregister_application(struct garp_application *appl) { stp_proto_unregister(&appl->proto); } EXPORT_SYMBOL_GPL(garp_unregister_application);
4250 1057 2754 16682 17735 18390 18396 17738 17738 6834 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MMZONE_H #define _LINUX_MMZONE_H #ifndef __ASSEMBLY__ #ifndef __GENERATING_BOUNDS_H #include <linux/spinlock.h> #include <linux/list.h> #include <linux/list_nulls.h> #include <linux/wait.h> #include <linux/bitops.h> #include <linux/cache.h> #include <linux/threads.h> #include <linux/numa.h> #include <linux/init.h> #include <linux/seqlock.h> #include <linux/nodemask.h> #include <linux/pageblock-flags.h> #include <linux/page-flags-layout.h> #include <linux/atomic.h> #include <linux/mm_types.h> #include <linux/page-flags.h> #include <linux/local_lock.h> #include <asm/page.h> /* Free memory management - zoned buddy allocator. */ #ifndef CONFIG_ARCH_FORCE_MAX_ORDER #define MAX_ORDER 10 #else #define MAX_ORDER CONFIG_ARCH_FORCE_MAX_ORDER #endif #define MAX_ORDER_NR_PAGES (1 << MAX_ORDER) #define IS_MAX_ORDER_ALIGNED(pfn) IS_ALIGNED(pfn, MAX_ORDER_NR_PAGES) /* * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed * costly to service. That is between allocation orders which should * coalesce naturally under reasonable reclaim pressure and those which * will not. */ #define PAGE_ALLOC_COSTLY_ORDER 3 enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RECLAIMABLE, MIGRATE_PCPTYPES, /* the number of types on the pcp lists */ MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES, #ifdef CONFIG_CMA /* * MIGRATE_CMA migration type is designed to mimic the way * ZONE_MOVABLE works. Only movable pages can be allocated * from MIGRATE_CMA pageblocks and page allocator never * implicitly change migration type of MIGRATE_CMA pageblock. * * The way to use it is to change migratetype of a range of * pageblocks to MIGRATE_CMA which can be done by * __free_pageblock_cma() function. */ MIGRATE_CMA, #endif #ifdef CONFIG_MEMORY_ISOLATION MIGRATE_ISOLATE, /* can't allocate from here */ #endif MIGRATE_TYPES }; /* In mm/page_alloc.c; keep in sync also with show_migration_types() there */ extern const char * const migratetype_names[MIGRATE_TYPES]; #ifdef CONFIG_CMA # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) # define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA) #else # define is_migrate_cma(migratetype) false # define is_migrate_cma_page(_page) false #endif static inline bool is_migrate_movable(int mt) { return is_migrate_cma(mt) || mt == MIGRATE_MOVABLE; } /* * Check whether a migratetype can be merged with another migratetype. * * It is only mergeable when it can fall back to other migratetypes for * allocation. See fallbacks[MIGRATE_TYPES][3] in page_alloc.c. */ static inline bool migratetype_is_mergeable(int mt) { return mt < MIGRATE_PCPTYPES; } #define for_each_migratetype_order(order, type) \ for (order = 0; order <= MAX_ORDER; order++) \ for (type = 0; type < MIGRATE_TYPES; type++) extern int page_group_by_mobility_disabled; #define MIGRATETYPE_MASK ((1UL << PB_migratetype_bits) - 1) #define get_pageblock_migratetype(page) \ get_pfnblock_flags_mask(page, page_to_pfn(page), MIGRATETYPE_MASK) #define folio_migratetype(folio) \ get_pfnblock_flags_mask(&folio->page, folio_pfn(folio), \ MIGRATETYPE_MASK) struct free_area { struct list_head free_list[MIGRATE_TYPES]; unsigned long nr_free; }; struct pglist_data; #ifdef CONFIG_NUMA enum numa_stat_item { NUMA_HIT, /* allocated in intended node */ NUMA_MISS, /* allocated in non intended node */ NUMA_FOREIGN, /* was intended here, hit elsewhere */ NUMA_INTERLEAVE_HIT, /* interleaver preferred this zone */ NUMA_LOCAL, /* allocation from local node */ NUMA_OTHER, /* allocation from other node */ NR_VM_NUMA_EVENT_ITEMS }; #else #define NR_VM_NUMA_EVENT_ITEMS 0 #endif enum zone_stat_item { /* First 128 byte cacheline (assuming 64 bit words) */ NR_FREE_PAGES, NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */ NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE, NR_ZONE_ACTIVE_ANON, NR_ZONE_INACTIVE_FILE, NR_ZONE_ACTIVE_FILE, NR_ZONE_UNEVICTABLE, NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */ NR_MLOCK, /* mlock()ed pages found and moved off LRU */ /* Second 128 byte cacheline */ NR_BOUNCE, #if IS_ENABLED(CONFIG_ZSMALLOC) NR_ZSPAGES, /* allocated in zsmalloc */ #endif NR_FREE_CMA_PAGES, #ifdef CONFIG_UNACCEPTED_MEMORY NR_UNACCEPTED, #endif NR_VM_ZONE_STAT_ITEMS }; enum node_stat_item { NR_LRU_BASE, NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */ NR_ACTIVE_ANON, /* " " " " " */ NR_INACTIVE_FILE, /* " " " " " */ NR_ACTIVE_FILE, /* " " " " " */ NR_UNEVICTABLE, /* " " " " " */ NR_SLAB_RECLAIMABLE_B, NR_SLAB_UNRECLAIMABLE_B, NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ WORKINGSET_NODES, WORKINGSET_REFAULT_BASE, WORKINGSET_REFAULT_ANON = WORKINGSET_REFAULT_BASE, WORKINGSET_REFAULT_FILE, WORKINGSET_ACTIVATE_BASE, WORKINGSET_ACTIVATE_ANON = WORKINGSET_ACTIVATE_BASE, WORKINGSET_ACTIVATE_FILE, WORKINGSET_RESTORE_BASE, WORKINGSET_RESTORE_ANON = WORKINGSET_RESTORE_BASE, WORKINGSET_RESTORE_FILE, WORKINGSET_NODERECLAIM, NR_ANON_MAPPED, /* Mapped anonymous pages */ NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. only modified from process context */ NR_FILE_PAGES, NR_FILE_DIRTY, NR_WRITEBACK, NR_WRITEBACK_TEMP, /* Writeback using temporary buffers */ NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */ NR_SHMEM_THPS, NR_SHMEM_PMDMAPPED, NR_FILE_THPS, NR_FILE_PMDMAPPED, NR_ANON_THPS, NR_VMSCAN_WRITE, NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */ NR_DIRTIED, /* page dirtyings since bootup */ NR_WRITTEN, /* page writings since bootup */ NR_THROTTLED_WRITTEN, /* NR_WRITTEN while reclaim throttled */ NR_KERNEL_MISC_RECLAIMABLE, /* reclaimable non-slab kernel pages */ NR_FOLL_PIN_ACQUIRED, /* via: pin_user_page(), gup flag: FOLL_PIN */ NR_FOLL_PIN_RELEASED, /* pages returned via unpin_user_page() */ NR_KERNEL_STACK_KB, /* measured in KiB */ #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) NR_KERNEL_SCS_KB, /* measured in KiB */ #endif NR_PAGETABLE, /* used for pagetables */ NR_SECONDARY_PAGETABLE, /* secondary pagetables, e.g. KVM pagetables */ #ifdef CONFIG_SWAP NR_SWAPCACHE, #endif #ifdef CONFIG_NUMA_BALANCING PGPROMOTE_SUCCESS, /* promote successfully */ PGPROMOTE_CANDIDATE, /* candidate pages to promote */ #endif NR_VM_NODE_STAT_ITEMS }; /* * Returns true if the item should be printed in THPs (/proc/vmstat * currently prints number of anon, file and shmem THPs. But the item * is charged in pages). */ static __always_inline bool vmstat_item_print_in_thp(enum node_stat_item item) { if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return false; return item == NR_ANON_THPS || item == NR_FILE_THPS || item == NR_SHMEM_THPS || item == NR_SHMEM_PMDMAPPED || item == NR_FILE_PMDMAPPED; } /* * Returns true if the value is measured in bytes (most vmstat values are * measured in pages). This defines the API part, the internal representation * might be different. */ static __always_inline bool vmstat_item_in_bytes(int idx) { /* * Global and per-node slab counters track slab pages. * It's expected that changes are multiples of PAGE_SIZE. * Internally values are stored in pages. * * Per-memcg and per-lruvec counters track memory, consumed * by individual slab objects. These counters are actually * byte-precise. */ return (idx == NR_SLAB_RECLAIMABLE_B || idx == NR_SLAB_UNRECLAIMABLE_B); } /* * We do arithmetic on the LRU lists in various places in the code, * so it is important to keep the active lists LRU_ACTIVE higher in * the array than the corresponding inactive lists, and to keep * the *_FILE lists LRU_FILE higher than the corresponding _ANON lists. * * This has to be kept in sync with the statistics in zone_stat_item * above and the descriptions in vmstat_text in mm/vmstat.c */ #define LRU_BASE 0 #define LRU_ACTIVE 1 #define LRU_FILE 2 enum lru_list { LRU_INACTIVE_ANON = LRU_BASE, LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE, LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE, LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE, LRU_UNEVICTABLE, NR_LRU_LISTS }; enum vmscan_throttle_state { VMSCAN_THROTTLE_WRITEBACK, VMSCAN_THROTTLE_ISOLATED, VMSCAN_THROTTLE_NOPROGRESS, VMSCAN_THROTTLE_CONGESTED, NR_VMSCAN_THROTTLE, }; #define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++) #define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++) static inline bool is_file_lru(enum lru_list lru) { return (lru == LRU_INACTIVE_FILE || lru == LRU_ACTIVE_FILE); } static inline bool is_active_lru(enum lru_list lru) { return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); } #define WORKINGSET_ANON 0 #define WORKINGSET_FILE 1 #define ANON_AND_FILE 2 enum lruvec_flags { /* * An lruvec has many dirty pages backed by a congested BDI: * 1. LRUVEC_CGROUP_CONGESTED is set by cgroup-level reclaim. * It can be cleared by cgroup reclaim or kswapd. * 2. LRUVEC_NODE_CONGESTED is set by kswapd node-level reclaim. * It can only be cleared by kswapd. * * Essentially, kswapd can unthrottle an lruvec throttled by cgroup * reclaim, but not vice versa. This only applies to the root cgroup. * The goal is to prevent cgroup reclaim on the root cgroup (e.g. * memory.reclaim) to unthrottle an unbalanced node (that was throttled * by kswapd). */ LRUVEC_CGROUP_CONGESTED, LRUVEC_NODE_CONGESTED, }; #endif /* !__GENERATING_BOUNDS_H */ /* * Evictable pages are divided into multiple generations. The youngest and the * oldest generation numbers, max_seq and min_seq, are monotonically increasing. * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the * corresponding generation. The gen counter in folio->flags stores gen+1 while * a page is on one of lrugen->folios[]. Otherwise it stores 0. * * A page is added to the youngest generation on faulting. The aging needs to * check the accessed bit at least twice before handing this page over to the * eviction. The first check takes care of the accessed bit set on the initial * fault; the second check makes sure this page hasn't been used since then. * This process, AKA second chance, requires a minimum of two generations, * hence MIN_NR_GENS. And to maintain ABI compatibility with the active/inactive * LRU, e.g., /proc/vmstat, these two generations are considered active; the * rest of generations, if they exist, are considered inactive. See * lru_gen_is_active(). * * PG_active is always cleared while a page is on one of lrugen->folios[] so * that the aging needs not to worry about it. And it's set again when a page * considered active is isolated for non-reclaiming purposes, e.g., migration. * See lru_gen_add_folio() and lru_gen_del_folio(). * * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the * number of categories of the active/inactive LRU when keeping track of * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits * in folio->flags. */ #define MIN_NR_GENS 2U #define MAX_NR_GENS 4U /* * Each generation is divided into multiple tiers. A page accessed N times * through file descriptors is in tier order_base_2(N). A page in the first tier * (N=0,1) is marked by PG_referenced unless it was faulted in through page * tables or read ahead. A page in any other tier (N>1) is marked by * PG_referenced and PG_workingset. This implies a minimum of two tiers is * supported without using additional bits in folio->flags. * * In contrast to moving across generations which requires the LRU lock, moving * across tiers only involves atomic operations on folio->flags and therefore * has a negligible cost in the buffered access path. In the eviction path, * comparisons of refaulted/(evicted+protected) from the first tier and the * rest infer whether pages accessed multiple times through file descriptors * are statistically hot and thus worth protecting. * * MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the * number of categories of the active/inactive LRU when keeping track of * accesses through file descriptors. This uses MAX_NR_TIERS-2 spare bits in * folio->flags. */ #define MAX_NR_TIERS 4U #ifndef __GENERATING_BOUNDS_H struct lruvec; struct page_vma_mapped_walk; #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) #ifdef CONFIG_LRU_GEN enum { LRU_GEN_ANON, LRU_GEN_FILE, }; enum { LRU_GEN_CORE, LRU_GEN_MM_WALK, LRU_GEN_NONLEAF_YOUNG, NR_LRU_GEN_CAPS }; #define MIN_LRU_BATCH BITS_PER_LONG #define MAX_LRU_BATCH (MIN_LRU_BATCH * 64) /* whether to keep historical stats from evicted generations */ #ifdef CONFIG_LRU_GEN_STATS #define NR_HIST_GENS MAX_NR_GENS #else #define NR_HIST_GENS 1U #endif /* * The youngest generation number is stored in max_seq for both anon and file * types as they are aged on an equal footing. The oldest generation numbers are * stored in min_seq[] separately for anon and file types as clean file pages * can be evicted regardless of swap constraints. * * Normally anon and file min_seq are in sync. But if swapping is constrained, * e.g., out of swap space, file min_seq is allowed to advance and leave anon * min_seq behind. * * The number of pages in each generation is eventually consistent and therefore * can be transiently negative when reset_batch_size() is pending. */ struct lru_gen_folio { /* the aging increments the youngest generation number */ unsigned long max_seq; /* the eviction increments the oldest generation numbers */ unsigned long min_seq[ANON_AND_FILE]; /* the birth time of each generation in jiffies */ unsigned long timestamps[MAX_NR_GENS]; /* the multi-gen LRU lists, lazily sorted on eviction */ struct list_head folios[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; /* the multi-gen LRU sizes, eventually consistent */ long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; /* the exponential moving average of refaulted */ unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS]; /* the exponential moving average of evicted+protected */ unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS]; /* the first tier doesn't need protection, hence the minus one */ unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1]; /* can be modified without holding the LRU lock */ atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; /* whether the multi-gen LRU is enabled */ bool enabled; #ifdef CONFIG_MEMCG /* the memcg generation this lru_gen_folio belongs to */ u8 gen; /* the list segment this lru_gen_folio belongs to */ u8 seg; /* per-node lru_gen_folio list for global reclaim */ struct hlist_nulls_node list; #endif }; enum { MM_LEAF_TOTAL, /* total leaf entries */ MM_LEAF_OLD, /* old leaf entries */ MM_LEAF_YOUNG, /* young leaf entries */ MM_NONLEAF_TOTAL, /* total non-leaf entries */ MM_NONLEAF_FOUND, /* non-leaf entries found in Bloom filters */ MM_NONLEAF_ADDED, /* non-leaf entries added to Bloom filters */ NR_MM_STATS }; /* double-buffering Bloom filters */ #define NR_BLOOM_FILTERS 2 struct lru_gen_mm_state { /* set to max_seq after each iteration */ unsigned long seq; /* where the current iteration continues after */ struct list_head *head; /* where the last iteration ended before */ struct list_head *tail; /* Bloom filters flip after each iteration */ unsigned long *filters[NR_BLOOM_FILTERS]; /* the mm stats for debugging */ unsigned long stats[NR_HIST_GENS][NR_MM_STATS]; }; struct lru_gen_mm_walk { /* the lruvec under reclaim */ struct lruvec *lruvec; /* unstable max_seq from lru_gen_folio */ unsigned long max_seq; /* the next address within an mm to scan */ unsigned long next_addr; /* to batch promoted pages */ int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; /* to batch the mm stats */ int mm_stats[NR_MM_STATS]; /* total batched items */ int batched; bool can_swap; bool force_scan; }; void lru_gen_init_lruvec(struct lruvec *lruvec); void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); #ifdef CONFIG_MEMCG /* * For each node, memcgs are divided into two generations: the old and the * young. For each generation, memcgs are randomly sharded into multiple bins * to improve scalability. For each bin, the hlist_nulls is virtually divided * into three segments: the head, the tail and the default. * * An onlining memcg is added to the tail of a random bin in the old generation. * The eviction starts at the head of a random bin in the old generation. The * per-node memcg generation counter, whose reminder (mod MEMCG_NR_GENS) indexes * the old generation, is incremented when all its bins become empty. * * There are four operations: * 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its * current generation (old or young) and updates its "seg" to "head"; * 2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its * current generation (old or young) and updates its "seg" to "tail"; * 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old * generation, updates its "gen" to "old" and resets its "seg" to "default"; * 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin in the * young generation, updates its "gen" to "young" and resets its "seg" to * "default". * * The events that trigger the above operations are: * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD; * 2. The first attempt to reclaim an memcg below low, which triggers * MEMCG_LRU_TAIL; * 3. The first attempt to reclaim an memcg below reclaimable size threshold, * which triggers MEMCG_LRU_TAIL; * 4. The second attempt to reclaim an memcg below reclaimable size threshold, * which triggers MEMCG_LRU_YOUNG; * 5. Attempting to reclaim an memcg below min, which triggers MEMCG_LRU_YOUNG; * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG; * 7. Offlining an memcg, which triggers MEMCG_LRU_OLD. * * Note that memcg LRU only applies to global reclaim, and the round-robin * incrementing of their max_seq counters ensures the eventual fairness to all * eligible memcgs. For memcg reclaim, it still relies on mem_cgroup_iter(). */ #define MEMCG_NR_GENS 2 #define MEMCG_NR_BINS 8 struct lru_gen_memcg { /* the per-node memcg generation counter */ unsigned long seq; /* each memcg has one lru_gen_folio per node */ unsigned long nr_memcgs[MEMCG_NR_GENS]; /* per-node lru_gen_folio list for global reclaim */ struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS]; /* protects the above */ spinlock_t lock; }; void lru_gen_init_pgdat(struct pglist_data *pgdat); void lru_gen_init_memcg(struct mem_cgroup *memcg); void lru_gen_exit_memcg(struct mem_cgroup *memcg); void lru_gen_online_memcg(struct mem_cgroup *memcg); void lru_gen_offline_memcg(struct mem_cgroup *memcg); void lru_gen_release_memcg(struct mem_cgroup *memcg); void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid); #else /* !CONFIG_MEMCG */ #define MEMCG_NR_GENS 1 struct lru_gen_memcg { }; static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) { } #endif /* CONFIG_MEMCG */ #else /* !CONFIG_LRU_GEN */ static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) { } static inline void lru_gen_init_lruvec(struct lruvec *lruvec) { } static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) { } #ifdef CONFIG_MEMCG static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) { } static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg) { } static inline void lru_gen_online_memcg(struct mem_cgroup *memcg) { } static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg) { } static inline void lru_gen_release_memcg(struct mem_cgroup *memcg) { } static inline void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid) { } #endif /* CONFIG_MEMCG */ #endif /* CONFIG_LRU_GEN */ struct lruvec { struct list_head lists[NR_LRU_LISTS]; /* per lruvec lru_lock for memcg */ spinlock_t lru_lock; /* * These track the cost of reclaiming one LRU - file or anon - * over the other. As the observed cost of reclaiming one LRU * increases, the reclaim scan balance tips toward the other. */ unsigned long anon_cost; unsigned long file_cost; /* Non-resident age, driven by LRU movement */ atomic_long_t nonresident_age; /* Refaults at the time of last reclaim cycle */ unsigned long refaults[ANON_AND_FILE]; /* Various lruvec state flags (enum lruvec_flags) */ unsigned long flags; #ifdef CONFIG_LRU_GEN /* evictable pages divided into generations */ struct lru_gen_folio lrugen; /* to concurrently iterate lru_gen_mm_list */ struct lru_gen_mm_state mm_state; #endif #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif }; /* Isolate unmapped pages */ #define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x2) /* Isolate for asynchronous migration */ #define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x4) /* Isolate unevictable pages */ #define ISOLATE_UNEVICTABLE ((__force isolate_mode_t)0x8) /* LRU Isolation modes. */ typedef unsigned __bitwise isolate_mode_t; enum zone_watermarks { WMARK_MIN, WMARK_LOW, WMARK_HIGH, WMARK_PROMO, NR_WMARK }; /* * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. One additional list * for THP which will usually be GFP_MOVABLE. Even if it is another type, * it should not contribute to serious fragmentation causing THP allocation * failures. */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define NR_PCP_THP 1 #else #define NR_PCP_THP 0 #endif #define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1)) #define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP) #define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost) #define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost) #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost) #define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost) struct per_cpu_pages { spinlock_t lock; /* Protects lists field */ int count; /* number of pages in the list */ int high; /* high watermark, emptying needed */ int batch; /* chunk size for buddy add/remove */ short free_factor; /* batch scaling factor during free */ #ifdef CONFIG_NUMA short expire; /* When 0, remote pagesets are drained */ #endif /* Lists of pages, one per migrate type stored on the pcp-lists */ struct list_head lists[NR_PCP_LISTS]; } ____cacheline_aligned_in_smp; struct per_cpu_zonestat { #ifdef CONFIG_SMP s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS]; s8 stat_threshold; #endif #ifdef CONFIG_NUMA /* * Low priority inaccurate counters that are only folded * on demand. Use a large type to avoid the overhead of * folding during refresh_cpu_vm_stats. */ unsigned long vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; #endif }; struct per_cpu_nodestat { s8 stat_threshold; s8 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS]; }; #endif /* !__GENERATING_BOUNDS.H */ enum zone_type { /* * ZONE_DMA and ZONE_DMA32 are used when there are peripherals not able * to DMA to all of the addressable memory (ZONE_NORMAL). * On architectures where this area covers the whole 32 bit address * space ZONE_DMA32 is used. ZONE_DMA is left for the ones with smaller * DMA addressing constraints. This distinction is important as a 32bit * DMA mask is assumed when ZONE_DMA32 is defined. Some 64-bit * platforms may need both zones as they support peripherals with * different DMA addressing limitations. */ #ifdef CONFIG_ZONE_DMA ZONE_DMA, #endif #ifdef CONFIG_ZONE_DMA32 ZONE_DMA32, #endif /* * Normal addressable memory is in ZONE_NORMAL. DMA operations can be * performed on pages in ZONE_NORMAL if the DMA devices support * transfers to all addressable memory. */ ZONE_NORMAL, #ifdef CONFIG_HIGHMEM /* * A memory area that is only addressable by the kernel through * mapping portions into its own address space. This is for example * used by i386 to allow the kernel to address the memory beyond * 900MB. The kernel will set up special mappings (page * table entries on i386) for each page that the kernel needs to * access. */ ZONE_HIGHMEM, #endif /* * ZONE_MOVABLE is similar to ZONE_NORMAL, except that it contains * movable pages with few exceptional cases described below. Main use * cases for ZONE_MOVABLE are to make memory offlining/unplug more * likely to succeed, and to locally limit unmovable allocations - e.g., * to increase the number of THP/huge pages. Notable special cases are: * * 1. Pinned pages: (long-term) pinning of movable pages might * essentially turn such pages unmovable. Therefore, we do not allow * pinning long-term pages in ZONE_MOVABLE. When pages are pinned and * faulted, they come from the right zone right away. However, it is * still possible that address space already has pages in * ZONE_MOVABLE at the time when pages are pinned (i.e. user has * touches that memory before pinning). In such case we migrate them * to a different zone. When migration fails - pinning fails. * 2. memblock allocations: kernelcore/movablecore setups might create * situations where ZONE_MOVABLE contains unmovable allocations * after boot. Memory offlining and allocations fail early. * 3. Memory holes: kernelcore/movablecore setups might create very rare * situations where ZONE_MOVABLE contains memory holes after boot, * for example, if we have sections that are only partially * populated. Memory offlining and allocations fail early. * 4. PG_hwpoison pages: while poisoned pages can be skipped during * memory offlining, such pages cannot be allocated. * 5. Unmovable PG_offline pages: in paravirtualized environments, * hotplugged memory blocks might only partially be managed by the * buddy (e.g., via XEN-balloon, Hyper-V balloon, virtio-mem). The * parts not manged by the buddy are unmovable PG_offline pages. In * some cases (virtio-mem), such pages can be skipped during * memory offlining, however, cannot be moved/allocated. These * techniques might use alloc_contig_range() to hide previously * exposed pages from the buddy again (e.g., to implement some sort * of memory unplug in virtio-mem). * 6. ZERO_PAGE(0), kernelcore/movablecore setups might create * situations where ZERO_PAGE(0) which is allocated differently * on different platforms may end up in a movable zone. ZERO_PAGE(0) * cannot be migrated. * 7. Memory-hotplug: when using memmap_on_memory and onlining the * memory to the MOVABLE zone, the vmemmap pages are also placed in * such zone. Such pages cannot be really moved around as they are * self-stored in the range, but they are treated as movable when * the range they describe is about to be offlined. * * In general, no unmovable allocations that degrade memory offlining * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range()) * have to expect that migrating pages in ZONE_MOVABLE can fail (even * if has_unmovable_pages() states that there are no unmovable pages, * there can be false negatives). */ ZONE_MOVABLE, #ifdef CONFIG_ZONE_DEVICE ZONE_DEVICE, #endif __MAX_NR_ZONES }; #ifndef __GENERATING_BOUNDS_H #define ASYNC_AND_SYNC 2 struct zone { /* Read-mostly fields */ /* zone watermarks, access with *_wmark_pages(zone) macros */ unsigned long _watermark[NR_WMARK]; unsigned long watermark_boost; unsigned long nr_reserved_highatomic; /* * We don't know if the memory that we're going to allocate will be * freeable or/and it will be released eventually, so to avoid totally * wasting several GB of ram we must reserve some of the lower zone * memory (otherwise we risk to run OOM on the lower zones despite * there being tons of freeable ram on the higher zones). This array is * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl * changes. */ long lowmem_reserve[MAX_NR_ZONES]; #ifdef CONFIG_NUMA int node; #endif struct pglist_data *zone_pgdat; struct per_cpu_pages __percpu *per_cpu_pageset; struct per_cpu_zonestat __percpu *per_cpu_zonestats; /* * the high and batch values are copied to individual pagesets for * faster access */ int pageset_high; int pageset_batch; #ifndef CONFIG_SPARSEMEM /* * Flags for a pageblock_nr_pages block. See pageblock-flags.h. * In SPARSEMEM, this map is stored in struct mem_section */ unsigned long *pageblock_flags; #endif /* CONFIG_SPARSEMEM */ /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ unsigned long zone_start_pfn; /* * spanned_pages is the total pages spanned by the zone, including * holes, which is calculated as: * spanned_pages = zone_end_pfn - zone_start_pfn; * * present_pages is physical pages existing within the zone, which * is calculated as: * present_pages = spanned_pages - absent_pages(pages in holes); * * present_early_pages is present pages existing within the zone * located on memory available since early boot, excluding hotplugged * memory. * * managed_pages is present pages managed by the buddy system, which * is calculated as (reserved_pages includes pages allocated by the * bootmem allocator): * managed_pages = present_pages - reserved_pages; * * cma pages is present pages that are assigned for CMA use * (MIGRATE_CMA). * * So present_pages may be used by memory hotplug or memory power * management logic to figure out unmanaged pages by checking * (present_pages - managed_pages). And managed_pages should be used * by page allocator and vm scanner to calculate all kinds of watermarks * and thresholds. * * Locking rules: * * zone_start_pfn and spanned_pages are protected by span_seqlock. * It is a seqlock because it has to be read outside of zone->lock, * and it is done in the main allocator path. But, it is written * quite infrequently. * * The span_seq lock is declared along with zone->lock because it is * frequently read in proximity to zone->lock. It's good to * give them a chance of being in the same cacheline. * * Write access to present_pages at runtime should be protected by * mem_hotplug_begin/done(). Any reader who can't tolerant drift of * present_pages should use get_online_mems() to get a stable value. */ atomic_long_t managed_pages; unsigned long spanned_pages; unsigned long present_pages; #if defined(CONFIG_MEMORY_HOTPLUG) unsigned long present_early_pages; #endif #ifdef CONFIG_CMA unsigned long cma_pages; #endif const char *name; #ifdef CONFIG_MEMORY_ISOLATION /* * Number of isolated pageblock. It is used to solve incorrect * freepage counting problem due to racy retrieving migratetype * of pageblock. Protected by zone->lock. */ unsigned long nr_isolate_pageblock; #endif #ifdef CONFIG_MEMORY_HOTPLUG /* see spanned/present_pages for more description */ seqlock_t span_seqlock; #endif int initialized; /* Write-intensive fields used from the page allocator */ CACHELINE_PADDING(_pad1_); /* free areas of different sizes */ struct free_area free_area[MAX_ORDER + 1]; #ifdef CONFIG_UNACCEPTED_MEMORY /* Pages to be accepted. All pages on the list are MAX_ORDER */ struct list_head unaccepted_pages; #endif /* zone flags, see below */ unsigned long flags; /* Primarily protects free_area */ spinlock_t lock; /* Write-intensive fields used by compaction and vmstats. */ CACHELINE_PADDING(_pad2_); /* * When free pages are below this point, additional steps are taken * when reading the number of free pages to avoid per-cpu counter * drift allowing watermarks to be breached */ unsigned long percpu_drift_mark; #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* pfn where compaction free scanner should start */ unsigned long compact_cached_free_pfn; /* pfn where compaction migration scanner should start */ unsigned long compact_cached_migrate_pfn[ASYNC_AND_SYNC]; unsigned long compact_init_migrate_pfn; unsigned long compact_init_free_pfn; #endif #ifdef CONFIG_COMPACTION /* * On compaction failure, 1<<compact_defer_shift compactions * are skipped before trying again. The number attempted since * last failure is tracked with compact_considered. * compact_order_failed is the minimum compaction failed order. */ unsigned int compact_considered; unsigned int compact_defer_shift; int compact_order_failed; #endif #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* Set to true when the PG_migrate_skip bits should be cleared */ bool compact_blockskip_flush; #endif bool contiguous; CACHELINE_PADDING(_pad3_); /* Zone statistics */ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; } ____cacheline_internodealigned_in_smp; enum pgdat_flags { PGDAT_DIRTY, /* reclaim scanning has recently found * many dirty file pages at the tail * of the LRU. */ PGDAT_WRITEBACK, /* reclaim scanning has recently found * many pages under writeback */ PGDAT_RECLAIM_LOCKED, /* prevents concurrent reclaim */ }; enum zone_flags { ZONE_BOOSTED_WATERMARK, /* zone recently boosted watermarks. * Cleared when kswapd is woken. */ ZONE_RECLAIM_ACTIVE, /* kswapd may be scanning the zone. */ }; static inline unsigned long zone_managed_pages(struct zone *zone) { return (unsigned long)atomic_long_read(&zone->managed_pages); } static inline unsigned long zone_cma_pages(struct zone *zone) { #ifdef CONFIG_CMA return zone->cma_pages; #else return 0; #endif } static inline unsigned long zone_end_pfn(const struct zone *zone) { return zone->zone_start_pfn + zone->spanned_pages; } static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn) { return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone); } static inline bool zone_is_initialized(struct zone *zone) { return zone->initialized; } static inline bool zone_is_empty(struct zone *zone) { return zone->spanned_pages == 0; } #ifndef BUILD_VDSO32_64 /* * The zone field is never updated after free_area_init_core() * sets it, so none of the operations on it need to be atomic. */ /* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */ #define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) #define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) #define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) #define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH) #define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH) #define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH) /* * Define the bit shifts to access each section. For non-existent * sections we define the shift as 0; that plus a 0 mask ensures * the compiler will optimise away reference to them. */ #define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) #define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) #define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) #define LAST_CPUPID_PGSHIFT (LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0)) #define KASAN_TAG_PGSHIFT (KASAN_TAG_PGOFF * (KASAN_TAG_WIDTH != 0)) /* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ #ifdef NODE_NOT_IN_PAGE_FLAGS #define ZONEID_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT) #define ZONEID_PGOFF ((SECTIONS_PGOFF < ZONES_PGOFF) ? \ SECTIONS_PGOFF : ZONES_PGOFF) #else #define ZONEID_SHIFT (NODES_SHIFT + ZONES_SHIFT) #define ZONEID_PGOFF ((NODES_PGOFF < ZONES_PGOFF) ? \ NODES_PGOFF : ZONES_PGOFF) #endif #define ZONEID_PGSHIFT (ZONEID_PGOFF * (ZONEID_SHIFT != 0)) #define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) #define NODES_MASK ((1UL << NODES_WIDTH) - 1) #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) #define LAST_CPUPID_MASK ((1UL << LAST_CPUPID_SHIFT) - 1) #define KASAN_TAG_MASK ((1UL << KASAN_TAG_WIDTH) - 1) #define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) static inline enum zone_type page_zonenum(const struct page *page) { ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT); return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; } static inline enum zone_type folio_zonenum(const struct folio *folio) { return page_zonenum(&folio->page); } #ifdef CONFIG_ZONE_DEVICE static inline bool is_zone_device_page(const struct page *page) { return page_zonenum(page) == ZONE_DEVICE; } /* * Consecutive zone device pages should not be merged into the same sgl * or bvec segment with other types of pages or if they belong to different * pgmaps. Otherwise getting the pgmap of a given segment is not possible * without scanning the entire segment. This helper returns true either if * both pages are not zone device pages or both pages are zone device pages * with the same pgmap. */ static inline bool zone_device_pages_have_same_pgmap(const struct page *a, const struct page *b) { if (is_zone_device_page(a) != is_zone_device_page(b)) return false; if (!is_zone_device_page(a)) return true; return a->pgmap == b->pgmap; } extern void memmap_init_zone_device(struct zone *, unsigned long, unsigned long, struct dev_pagemap *); #else static inline bool is_zone_device_page(const struct page *page) { return false; } static inline bool zone_device_pages_have_same_pgmap(const struct page *a, const struct page *b) { return true; } #endif static inline bool folio_is_zone_device(const struct folio *folio) { return is_zone_device_page(&folio->page); } static inline bool is_zone_movable_page(const struct page *page) { return page_zonenum(page) == ZONE_MOVABLE; } static inline bool folio_is_zone_movable(const struct folio *folio) { return folio_zonenum(folio) == ZONE_MOVABLE; } #endif /* * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty * intersection with the given zone */ static inline bool zone_intersects(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages) { if (zone_is_empty(zone)) return false; if (start_pfn >= zone_end_pfn(zone) || start_pfn + nr_pages <= zone->zone_start_pfn) return false; return true; } /* * The "priority" of VM scanning is how much of the queues we will scan in one * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the * queues ("queue_length >> 12") during an aging round. */ #define DEF_PRIORITY 12 /* Maximum number of zones on a zonelist */ #define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES) enum { ZONELIST_FALLBACK, /* zonelist with fallback */ #ifdef CONFIG_NUMA /* * The NUMA zonelists are doubled because we need zonelists that * restrict the allocations to a single node for __GFP_THISNODE. */ ZONELIST_NOFALLBACK, /* zonelist without fallback (__GFP_THISNODE) */ #endif MAX_ZONELISTS }; /* * This struct contains information about a zone in a zonelist. It is stored * here to avoid dereferences into large structures and lookups of tables */ struct zoneref { struct zone *zone; /* Pointer to actual zone */ int zone_idx; /* zone_idx(zoneref->zone) */ }; /* * One allocation request operates on a zonelist. A zonelist * is a list of zones, the first one is the 'goal' of the * allocation, the other zones are fallback zones, in decreasing * priority. * * To speed the reading of the zonelist, the zonerefs contain the zone index * of the entry being read. Helper functions to access information given * a struct zoneref are * * zonelist_zone() - Return the struct zone * for an entry in _zonerefs * zonelist_zone_idx() - Return the index of the zone for an entry * zonelist_node_idx() - Return the index of the node for an entry */ struct zonelist { struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1]; }; /* * The array of struct pages for flatmem. * It must be declared for SPARSEMEM as well because there are configurations * that rely on that. */ extern struct page *mem_map; #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct deferred_split { spinlock_t split_queue_lock; struct list_head split_queue; unsigned long split_queue_len; }; #endif #ifdef CONFIG_MEMORY_FAILURE /* * Per NUMA node memory failure handling statistics. */ struct memory_failure_stats { /* * Number of raw pages poisoned. * Cases not accounted: memory outside kernel control, offline page, * arch-specific memory_failure (SGX), hwpoison_filter() filtered * error events, and unpoison actions from hwpoison_unpoison. */ unsigned long total; /* * Recovery results of poisoned raw pages handled by memory_failure, * in sync with mf_result. * total = ignored + failed + delayed + recovered. * total * PAGE_SIZE * #nodes = /proc/meminfo/HardwareCorrupted. */ unsigned long ignored; unsigned long failed; unsigned long delayed; unsigned long recovered; }; #endif /* * On NUMA machines, each NUMA node would have a pg_data_t to describe * it's memory layout. On UMA machines there is a single pglist_data which * describes the whole memory. * * Memory statistics and page replacement data structures are maintained on a * per-zone basis. */ typedef struct pglist_data { /* * node_zones contains just the zones for THIS node. Not all of the * zones may be populated, but it is the full list. It is referenced by * this node's node_zonelists as well as other node's node_zonelists. */ struct zone node_zones[MAX_NR_ZONES]; /* * node_zonelists contains references to all zones in all nodes. * Generally the first zones will be references to this node's * node_zones. */ struct zonelist node_zonelists[MAX_ZONELISTS]; int nr_zones; /* number of populated zones in this node */ #ifdef CONFIG_FLATMEM /* means !SPARSEMEM */ struct page *node_mem_map; #ifdef CONFIG_PAGE_EXTENSION struct page_ext *node_page_ext; #endif #endif #if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT) /* * Must be held any time you expect node_start_pfn, * node_present_pages, node_spanned_pages or nr_zones to stay constant. * Also synchronizes pgdat->first_deferred_pfn during deferred page * init. * * pgdat_resize_lock() and pgdat_resize_unlock() are provided to * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG * or CONFIG_DEFERRED_STRUCT_PAGE_INIT. * * Nests above zone->lock and zone->span_seqlock */ spinlock_t node_size_lock; #endif unsigned long node_start_pfn; unsigned long node_present_pages; /* total number of physical pages */ unsigned long node_spanned_pages; /* total size of physical page range, including holes */ int node_id; wait_queue_head_t kswapd_wait; wait_queue_head_t pfmemalloc_wait; /* workqueues for throttling reclaim for different reasons. */ wait_queue_head_t reclaim_wait[NR_VMSCAN_THROTTLE]; atomic_t nr_writeback_throttled;/* nr of writeback-throttled tasks */ unsigned long nr_reclaim_start; /* nr pages written while throttled * when throttling started. */ #ifdef CONFIG_MEMORY_HOTPLUG struct mutex kswapd_lock; #endif struct task_struct *kswapd; /* Protected by kswapd_lock */ int kswapd_order; enum zone_type kswapd_highest_zoneidx; int kswapd_failures; /* Number of 'reclaimed == 0' runs */ #ifdef CONFIG_COMPACTION int kcompactd_max_order; enum zone_type kcompactd_highest_zoneidx; wait_queue_head_t kcompactd_wait; struct task_struct *kcompactd; bool proactive_compact_trigger; #endif /* * This is a per-node reserve of pages that are not available * to userspace allocations. */ unsigned long totalreserve_pages; #ifdef CONFIG_NUMA /* * node reclaim becomes active if more unmapped pages exist. */ unsigned long min_unmapped_pages; unsigned long min_slab_pages; #endif /* CONFIG_NUMA */ /* Write-intensive fields used by page reclaim */ CACHELINE_PADDING(_pad1_); #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* * If memory initialisation on large machines is deferred then this * is the first PFN that needs to be initialised. */ unsigned long first_deferred_pfn; #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct deferred_split deferred_split_queue; #endif #ifdef CONFIG_NUMA_BALANCING /* start time in ms of current promote rate limit period */ unsigned int nbp_rl_start; /* number of promote candidate pages at start time of current rate limit period */ unsigned long nbp_rl_nr_cand; /* promote threshold in ms */ unsigned int nbp_threshold; /* start time in ms of current promote threshold adjustment period */ unsigned int nbp_th_start; /* * number of promote candidate pages at start time of current promote * threshold adjustment period */ unsigned long nbp_th_nr_cand; #endif /* Fields commonly accessed by the page reclaim scanner */ /* * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED. * * Use mem_cgroup_lruvec() to look up lruvecs. */ struct lruvec __lruvec; unsigned long flags; #ifdef CONFIG_LRU_GEN /* kswap mm walk data */ struct lru_gen_mm_walk mm_walk; /* lru_gen_folio list */ struct lru_gen_memcg memcg_lru; #endif CACHELINE_PADDING(_pad2_); /* Per-node vmstats */ struct per_cpu_nodestat __percpu *per_cpu_nodestats; atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS]; #ifdef CONFIG_NUMA struct memory_tier __rcu *memtier; #endif #ifdef CONFIG_MEMORY_FAILURE struct memory_failure_stats mf_stats; #endif } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid)) static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat) { return pgdat->node_start_pfn + pgdat->node_spanned_pages; } #include <linux/memory_hotplug.h> void build_all_zonelists(pg_data_t *pgdat); void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order, enum zone_type highest_zoneidx); bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx, unsigned int alloc_flags, long free_pages); bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx, unsigned int alloc_flags); bool zone_watermark_ok_safe(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx); /* * Memory initialization context, use to differentiate memory added by * the platform statically or via memory hotplug interface. */ enum meminit_context { MEMINIT_EARLY, MEMINIT_HOTPLUG, }; extern void init_currently_empty_zone(struct zone *zone, unsigned long start_pfn, unsigned long size); extern void lruvec_init(struct lruvec *lruvec); static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec) { #ifdef CONFIG_MEMCG return lruvec->pgdat; #else return container_of(lruvec, struct pglist_data, __lruvec); #endif } #ifdef CONFIG_HAVE_MEMORYLESS_NODES int local_memory_node(int node_id); #else static inline int local_memory_node(int node_id) { return node_id; }; #endif /* * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc. */ #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) #ifdef CONFIG_ZONE_DEVICE static inline bool zone_is_zone_device(struct zone *zone) { return zone_idx(zone) == ZONE_DEVICE; } #else static inline bool zone_is_zone_device(struct zone *zone) { return false; } #endif /* * Returns true if a zone has pages managed by the buddy allocator. * All the reclaim decisions have to use this function rather than * populated_zone(). If the whole zone is reserved then we can easily * end up with populated_zone() && !managed_zone(). */ static inline bool managed_zone(struct zone *zone) { return zone_managed_pages(zone); } /* Returns true if a zone has memory */ static inline bool populated_zone(struct zone *zone) { return zone->present_pages; } #ifdef CONFIG_NUMA static inline int zone_to_nid(struct zone *zone) { return zone->node; } static inline void zone_set_nid(struct zone *zone, int nid) { zone->node = nid; } #else static inline int zone_to_nid(struct zone *zone) { return 0; } static inline void zone_set_nid(struct zone *zone, int nid) {} #endif extern int movable_zone; static inline int is_highmem_idx(enum zone_type idx) { #ifdef CONFIG_HIGHMEM return (idx == ZONE_HIGHMEM || (idx == ZONE_MOVABLE && movable_zone == ZONE_HIGHMEM)); #else return 0; #endif } /** * is_highmem - helper function to quickly check if a struct zone is a * highmem zone or not. This is an attempt to keep references * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum. * @zone: pointer to struct zone variable * Return: 1 for a highmem zone, 0 otherwise */ static inline int is_highmem(struct zone *zone) { return is_highmem_idx(zone_idx(zone)); } #ifdef CONFIG_ZONE_DMA bool has_managed_dma(void); #else static inline bool has_managed_dma(void) { return false; } #endif #ifndef CONFIG_NUMA extern struct pglist_data contig_page_data; static inline struct pglist_data *NODE_DATA(int nid) { return &contig_page_data; } #else /* CONFIG_NUMA */ #include <asm/mmzone.h> #endif /* !CONFIG_NUMA */ extern struct pglist_data *first_online_pgdat(void); extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat); extern struct zone *next_zone(struct zone *zone); /** * for_each_online_pgdat - helper macro to iterate over all online nodes * @pgdat: pointer to a pg_data_t variable */ #define for_each_online_pgdat(pgdat) \ for (pgdat = first_online_pgdat(); \ pgdat; \ pgdat = next_online_pgdat(pgdat)) /** * for_each_zone - helper macro to iterate over all memory zones * @zone: pointer to struct zone variable * * The user only needs to declare the zone variable, for_each_zone * fills it in. */ #define for_each_zone(zone) \ for (zone = (first_online_pgdat())->node_zones; \ zone; \ zone = next_zone(zone)) #define for_each_populated_zone(zone) \ for (zone = (first_online_pgdat())->node_zones; \ zone; \ zone = next_zone(zone)) \ if (!populated_zone(zone)) \ ; /* do nothing */ \ else static inline struct zone *zonelist_zone(struct zoneref *zoneref) { return zoneref->zone; } static inline int zonelist_zone_idx(struct zoneref *zoneref) { return zoneref->zone_idx; } static inline int zonelist_node_idx(struct zoneref *zoneref) { return zone_to_nid(zoneref->zone); } struct zoneref *__next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, nodemask_t *nodes); /** * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point * @z: The cursor used as a starting point for the search * @highest_zoneidx: The zone index of the highest zone to return * @nodes: An optional nodemask to filter the zonelist with * * This function returns the next zone at or below a given zone index that is * within the allowed nodemask using a cursor as the starting point for the * search. The zoneref returned is a cursor that represents the current zone * being examined. It should be advanced by one before calling * next_zones_zonelist again. * * Return: the next zone at or below highest_zoneidx within the allowed * nodemask using a cursor within a zonelist as a starting point */ static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, nodemask_t *nodes) { if (likely(!nodes && zonelist_zone_idx(z) <= highest_zoneidx)) return z; return __next_zones_zonelist(z, highest_zoneidx, nodes); } /** * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist * @zonelist: The zonelist to search for a suitable zone * @highest_zoneidx: The zone index of the highest zone to return * @nodes: An optional nodemask to filter the zonelist with * * This function returns the first zone at or below a given zone index that is * within the allowed nodemask. The zoneref returned is a cursor that can be * used to iterate the zonelist with next_zones_zonelist by advancing it by * one before calling. * * When no eligible zone is found, zoneref->zone is NULL (zoneref itself is * never NULL). This may happen either genuinely, or due to concurrent nodemask * update due to cpuset modification. * * Return: Zoneref pointer for the first suitable zone found */ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, enum zone_type highest_zoneidx, nodemask_t *nodes) { return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes); } /** * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask * @zone: The current zone in the iterator * @z: The current pointer within zonelist->_zonerefs being iterated * @zlist: The zonelist being iterated * @highidx: The zone index of the highest zone to return * @nodemask: Nodemask allowed by the allocator * * This iterator iterates though all zones at or below a given zone index and * within a given nodemask */ #define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \ for (z = first_zones_zonelist(zlist, highidx, nodemask), zone = zonelist_zone(z); \ zone; \ z = next_zones_zonelist(++z, highidx, nodemask), \ zone = zonelist_zone(z)) #define for_next_zone_zonelist_nodemask(zone, z, highidx, nodemask) \ for (zone = z->zone; \ zone; \ z = next_zones_zonelist(++z, highidx, nodemask), \ zone = zonelist_zone(z)) /** * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index * @zone: The current zone in the iterator * @z: The current pointer within zonelist->zones being iterated * @zlist: The zonelist being iterated * @highidx: The zone index of the highest zone to return * * This iterator iterates though all zones at or below a given zone index. */ #define for_each_zone_zonelist(zone, z, zlist, highidx) \ for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL) /* Whether the 'nodes' are all movable nodes */ static inline bool movable_only_nodes(nodemask_t *nodes) { struct zonelist *zonelist; struct zoneref *z; int nid; if (nodes_empty(*nodes)) return false; /* * We can chose arbitrary node from the nodemask to get a * zonelist as they are interlinked. We just need to find * at least one zone that can satisfy kernel allocations. */ nid = first_node(*nodes); zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK]; z = first_zones_zonelist(zonelist, ZONE_NORMAL, nodes); return (!z->zone) ? true : false; } #ifdef CONFIG_SPARSEMEM #include <asm/sparsemem.h> #endif #ifdef CONFIG_FLATMEM #define pfn_to_nid(pfn) (0) #endif #ifdef CONFIG_SPARSEMEM /* * PA_SECTION_SHIFT physical address to/from section number * PFN_SECTION_SHIFT pfn to/from section number */ #define PA_SECTION_SHIFT (SECTION_SIZE_BITS) #define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT) #define NR_MEM_SECTIONS (1UL << SECTIONS_SHIFT) #define PAGES_PER_SECTION (1UL << PFN_SECTION_SHIFT) #define PAGE_SECTION_MASK (~(PAGES_PER_SECTION-1)) #define SECTION_BLOCKFLAGS_BITS \ ((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS) #if (MAX_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS #error Allocator MAX_ORDER exceeds SECTION_SIZE #endif static inline unsigned long pfn_to_section_nr(unsigned long pfn) { return pfn >> PFN_SECTION_SHIFT; } static inline unsigned long section_nr_to_pfn(unsigned long sec) { return sec << PFN_SECTION_SHIFT; } #define SECTION_ALIGN_UP(pfn) (((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK) #define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK) #define SUBSECTION_SHIFT 21 #define SUBSECTION_SIZE (1UL << SUBSECTION_SHIFT) #define PFN_SUBSECTION_SHIFT (SUBSECTION_SHIFT - PAGE_SHIFT) #define PAGES_PER_SUBSECTION (1UL << PFN_SUBSECTION_SHIFT) #define PAGE_SUBSECTION_MASK (~(PAGES_PER_SUBSECTION-1)) #if SUBSECTION_SHIFT > SECTION_SIZE_BITS #error Subsection size exceeds section size #else #define SUBSECTIONS_PER_SECTION (1UL << (SECTION_SIZE_BITS - SUBSECTION_SHIFT)) #endif #define SUBSECTION_ALIGN_UP(pfn) ALIGN((pfn), PAGES_PER_SUBSECTION) #define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK) struct mem_section_usage { #ifdef CONFIG_SPARSEMEM_VMEMMAP DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION); #endif /* See declaration of similar field in struct zone */ unsigned long pageblock_flags[0]; }; void subsection_map_init(unsigned long pfn, unsigned long nr_pages); struct page; struct page_ext; struct mem_section { /* * This is, logically, a pointer to an array of struct * pages. However, it is stored with some other magic. * (see sparse.c::sparse_init_one_section()) * * Additionally during early boot we encode node id of * the location of the section here to guide allocation. * (see sparse.c::memory_present()) * * Making it a UL at least makes someone do a cast * before using it wrong. */ unsigned long section_mem_map; struct mem_section_usage *usage; #ifdef CONFIG_PAGE_EXTENSION /* * If SPARSEMEM, pgdat doesn't have page_ext pointer. We use * section. (see page_ext.h about this.) */ struct page_ext *page_ext; unsigned long pad; #endif /* * WARNING: mem_section must be a power-of-2 in size for the * calculation and use of SECTION_ROOT_MASK to make sense. */ }; #ifdef CONFIG_SPARSEMEM_EXTREME #define SECTIONS_PER_ROOT (PAGE_SIZE / sizeof (struct mem_section)) #else #define SECTIONS_PER_ROOT 1 #endif #define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT) #define NR_SECTION_ROOTS DIV_ROUND_UP(NR_MEM_SECTIONS, SECTIONS_PER_ROOT) #define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1) #ifdef CONFIG_SPARSEMEM_EXTREME extern struct mem_section **mem_section; #else extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]; #endif static inline unsigned long *section_to_usemap(struct mem_section *ms) { return ms->usage->pageblock_flags; } static inline struct mem_section *__nr_to_section(unsigned long nr) { unsigned long root = SECTION_NR_TO_ROOT(nr); if (unlikely(root >= NR_SECTION_ROOTS)) return NULL; #ifdef CONFIG_SPARSEMEM_EXTREME if (!mem_section || !mem_section[root]) return NULL; #endif return &mem_section[root][nr & SECTION_ROOT_MASK]; } extern size_t mem_section_usage_size(void); /* * We use the lower bits of the mem_map pointer to store * a little bit of information. The pointer is calculated * as mem_map - section_nr_to_pfn(pnum). The result is * aligned to the minimum alignment of the two values: * 1. All mem_map arrays are page-aligned. * 2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT * lowest bits. PFN_SECTION_SHIFT is arch-specific * (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the * worst combination is powerpc with 256k pages, * which results in PFN_SECTION_SHIFT equal 6. * To sum it up, at least 6 bits are available on all architectures. * However, we can exceed 6 bits on some other architectures except * powerpc (e.g. 15 bits are available on x86_64, 13 bits are available * with the worst case of 64K pages on arm64) if we make sure the * exceeded bit is not applicable to powerpc. */ enum { SECTION_MARKED_PRESENT_BIT, SECTION_HAS_MEM_MAP_BIT, SECTION_IS_ONLINE_BIT, SECTION_IS_EARLY_BIT, #ifdef CONFIG_ZONE_DEVICE SECTION_TAINT_ZONE_DEVICE_BIT, #endif SECTION_MAP_LAST_BIT, }; #define SECTION_MARKED_PRESENT BIT(SECTION_MARKED_PRESENT_BIT) #define SECTION_HAS_MEM_MAP BIT(SECTION_HAS_MEM_MAP_BIT) #define SECTION_IS_ONLINE BIT(SECTION_IS_ONLINE_BIT) #define SECTION_IS_EARLY BIT(SECTION_IS_EARLY_BIT) #ifdef CONFIG_ZONE_DEVICE #define SECTION_TAINT_ZONE_DEVICE BIT(SECTION_TAINT_ZONE_DEVICE_BIT) #endif #define SECTION_MAP_MASK (~(BIT(SECTION_MAP_LAST_BIT) - 1)) #define SECTION_NID_SHIFT SECTION_MAP_LAST_BIT static inline struct page *__section_mem_map_addr(struct mem_section *section) { unsigned long map = section->section_mem_map; map &= SECTION_MAP_MASK; return (struct page *)map; } static inline int present_section(struct mem_section *section) { return (section && (section->section_mem_map & SECTION_MARKED_PRESENT)); } static inline int present_section_nr(unsigned long nr) { return present_section(__nr_to_section(nr)); } static inline int valid_section(struct mem_section *section) { return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP)); } static inline int early_section(struct mem_section *section) { return (section && (section->section_mem_map & SECTION_IS_EARLY)); } static inline int valid_section_nr(unsigned long nr) { return valid_section(__nr_to_section(nr)); } static inline int online_section(struct mem_section *section) { return (section && (section->section_mem_map & SECTION_IS_ONLINE)); } #ifdef CONFIG_ZONE_DEVICE static inline int online_device_section(struct mem_section *section) { unsigned long flags = SECTION_IS_ONLINE | SECTION_TAINT_ZONE_DEVICE; return section && ((section->section_mem_map & flags) == flags); } #else static inline int online_device_section(struct mem_section *section) { return 0; } #endif static inline int online_section_nr(unsigned long nr) { return online_section(__nr_to_section(nr)); } #ifdef CONFIG_MEMORY_HOTPLUG void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn); void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn); #endif static inline struct mem_section *__pfn_to_section(unsigned long pfn) { return __nr_to_section(pfn_to_section_nr(pfn)); } extern unsigned long __highest_present_section_nr; static inline int subsection_map_index(unsigned long pfn) { return (pfn & ~(PAGE_SECTION_MASK)) / PAGES_PER_SUBSECTION; } #ifdef CONFIG_SPARSEMEM_VMEMMAP static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) { int idx = subsection_map_index(pfn); return test_bit(idx, ms->usage->subsection_map); } #else static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) { return 1; } #endif #ifndef CONFIG_HAVE_ARCH_PFN_VALID /** * pfn_valid - check if there is a valid memory map entry for a PFN * @pfn: the page frame number to check * * Check if there is a valid memory map entry aka struct page for the @pfn. * Note, that availability of the memory map entry does not imply that * there is actual usable memory at that @pfn. The struct page may * represent a hole or an unusable page frame. * * Return: 1 for PFNs that have memory map entries and 0 otherwise */ static inline int pfn_valid(unsigned long pfn) { struct mem_section *ms; /* * Ensure the upper PAGE_SHIFT bits are clear in the * pfn. Else it might lead to false positives when * some of the upper bits are set, but the lower bits * match a valid pfn. */ if (PHYS_PFN(PFN_PHYS(pfn)) != pfn) return 0; if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; ms = __pfn_to_section(pfn); if (!valid_section(ms)) return 0; /* * Traditionally early sections always returned pfn_valid() for * the entire section-sized span. */ return early_section(ms) || pfn_section_valid(ms, pfn); } #endif static inline int pfn_in_present_section(unsigned long pfn) { if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; return present_section(__pfn_to_section(pfn)); } static inline unsigned long next_present_section_nr(unsigned long section_nr) { while (++section_nr <= __highest_present_section_nr) { if (present_section_nr(section_nr)) return section_nr; } return -1; } /* * These are _only_ used during initialisation, therefore they * can use __initdata ... They could have names to indicate * this restriction. */ #ifdef CONFIG_NUMA #define pfn_to_nid(pfn) \ ({ \ unsigned long __pfn_to_nid_pfn = (pfn); \ page_to_nid(pfn_to_page(__pfn_to_nid_pfn)); \ }) #else #define pfn_to_nid(pfn) (0) #endif void sparse_init(void); #else #define sparse_init() do {} while (0) #define sparse_index_init(_sec, _nid) do {} while (0) #define pfn_in_present_section pfn_valid #define subsection_map_init(_pfn, _nr_pages) do {} while (0) #endif /* CONFIG_SPARSEMEM */ #endif /* !__GENERATING_BOUNDS.H */ #endif /* !__ASSEMBLY__ */ #endif /* _LINUX_MMZONE_H */
11 2 363 363 363 31 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 /* SPDX-License-Identifier: GPL-2.0 */ /* * INETPEER - A storage for permanent information about peers * * Authors: Andrey V. Savochkin <saw@msu.ru> */ #ifndef _NET_INETPEER_H #define _NET_INETPEER_H #include <linux/types.h> #include <linux/init.h> #include <linux/jiffies.h> #include <linux/spinlock.h> #include <linux/rtnetlink.h> #include <net/ipv6.h> #include <linux/atomic.h> /* IPv4 address key for cache lookups */ struct ipv4_addr_key { __be32 addr; int vif; }; #define INETPEER_MAXKEYSZ (sizeof(struct in6_addr) / sizeof(u32)) struct inetpeer_addr { union { struct ipv4_addr_key a4; struct in6_addr a6; u32 key[INETPEER_MAXKEYSZ]; }; __u16 family; }; struct inet_peer { struct rb_node rb_node; struct inetpeer_addr daddr; u32 metrics[RTAX_MAX]; u32 rate_tokens; /* rate limiting for ICMP */ u32 n_redirects; unsigned long rate_last; /* * Once inet_peer is queued for deletion (refcnt == 0), following field * is not available: rid * We can share memory with rcu_head to help keep inet_peer small. */ union { struct { atomic_t rid; /* Frag reception counter */ }; struct rcu_head rcu; }; /* following fields might be frequently dirtied */ __u32 dtime; /* the time of last use of not referenced entries */ refcount_t refcnt; }; struct inet_peer_base { struct rb_root rb_root; seqlock_t lock; int total; }; void inet_peer_base_init(struct inet_peer_base *); void inet_initpeers(void) __init; #define INETPEER_METRICS_NEW (~(u32) 0) static inline void inetpeer_set_addr_v4(struct inetpeer_addr *iaddr, __be32 ip) { iaddr->a4.addr = ip; iaddr->a4.vif = 0; iaddr->family = AF_INET; } static inline __be32 inetpeer_get_addr_v4(struct inetpeer_addr *iaddr) { return iaddr->a4.addr; } static inline void inetpeer_set_addr_v6(struct inetpeer_addr *iaddr, struct in6_addr *in6) { iaddr->a6 = *in6; iaddr->family = AF_INET6; } static inline struct in6_addr *inetpeer_get_addr_v6(struct inetpeer_addr *iaddr) { return &iaddr->a6; } /* can be called with or without local BH being disabled */ struct inet_peer *inet_getpeer(struct inet_peer_base *base, const struct inetpeer_addr *daddr, int create); static inline struct inet_peer *inet_getpeer_v4(struct inet_peer_base *base, __be32 v4daddr, int vif, int create) { struct inetpeer_addr daddr; daddr.a4.addr = v4daddr; daddr.a4.vif = vif; daddr.family = AF_INET; return inet_getpeer(base, &daddr, create); } static inline struct inet_peer *inet_getpeer_v6(struct inet_peer_base *base, const struct in6_addr *v6daddr, int create) { struct inetpeer_addr daddr; daddr.a6 = *v6daddr; daddr.family = AF_INET6; return inet_getpeer(base, &daddr, create); } static inline int inetpeer_addr_cmp(const struct inetpeer_addr *a, const struct inetpeer_addr *b) { int i, n; if (a->family == AF_INET) n = sizeof(a->a4) / sizeof(u32); else n = sizeof(a->a6) / sizeof(u32); for (i = 0; i < n; i++) { if (a->key[i] == b->key[i]) continue; if (a->key[i] < b->key[i]) return -1; return 1; } return 0; } /* can be called from BH context or outside */ void inet_putpeer(struct inet_peer *p); bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout); void inetpeer_invalidate_tree(struct inet_peer_base *); #endif /* _NET_INETPEER_H */
3143 577 530 3015 3143 786 786 800 530 1944 164 1785 530 530 530 530 530 47 530 81 81 81 270 269 164 1785 1515 270 33 269 1944 1944 2898 2746 2333 1416 3143 1944 81 591 591 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 // SPDX-License-Identifier: GPL-2.0 /* * buffered writeback throttling. loosely based on CoDel. We can't drop * packets for IO scheduling, so the logic is something like this: * * - Monitor latencies in a defined window of time. * - If the minimum latency in the above window exceeds some target, increment * scaling step and scale down queue depth by a factor of 2x. The monitoring * window is then shrunk to 100 / sqrt(scaling step + 1). * - For any window where we don't have solid data on what the latencies * look like, retain status quo. * - If latencies look good, decrement scaling step. * - If we're only doing writes, allow the scaling step to go negative. This * will temporarily boost write performance, snapping back to a stable * scaling step of 0 if reads show up or the heavy writers finish. Unlike * positive scaling steps where we shrink the monitoring window, a negative * scaling step retains the default step==0 window size. * * Copyright (C) 2016 Jens Axboe * */ #include <linux/kernel.h> #include <linux/blk_types.h> #include <linux/slab.h> #include <linux/backing-dev.h> #include <linux/swap.h> #include "blk-stat.h" #include "blk-wbt.h" #include "blk-rq-qos.h" #include "elevator.h" #define CREATE_TRACE_POINTS #include <trace/events/wbt.h> enum wbt_flags { WBT_TRACKED = 1, /* write, tracked for throttling */ WBT_READ = 2, /* read */ WBT_KSWAPD = 4, /* write, from kswapd */ WBT_DISCARD = 8, /* discard */ WBT_NR_BITS = 4, /* number of bits */ }; enum { WBT_RWQ_BG = 0, WBT_RWQ_KSWAPD, WBT_RWQ_DISCARD, WBT_NUM_RWQ, }; /* * If current state is WBT_STATE_ON/OFF_DEFAULT, it can be covered to any other * state, if current state is WBT_STATE_ON/OFF_MANUAL, it can only be covered * to WBT_STATE_OFF/ON_MANUAL. */ enum { WBT_STATE_ON_DEFAULT = 1, /* on by default */ WBT_STATE_ON_MANUAL = 2, /* on manually by sysfs */ WBT_STATE_OFF_DEFAULT = 3, /* off by default */ WBT_STATE_OFF_MANUAL = 4, /* off manually by sysfs */ }; struct rq_wb { /* * Settings that govern how we throttle */ unsigned int wb_background; /* background writeback */ unsigned int wb_normal; /* normal writeback */ short enable_state; /* WBT_STATE_* */ /* * Number of consecutive periods where we don't have enough * information to make a firm scale up/down decision. */ unsigned int unknown_cnt; u64 win_nsec; /* default window size */ u64 cur_win_nsec; /* current window size */ struct blk_stat_callback *cb; u64 sync_issue; void *sync_cookie; unsigned int wc; unsigned long last_issue; /* last non-throttled issue */ unsigned long last_comp; /* last non-throttled comp */ unsigned long min_lat_nsec; struct rq_qos rqos; struct rq_wait rq_wait[WBT_NUM_RWQ]; struct rq_depth rq_depth; }; static inline struct rq_wb *RQWB(struct rq_qos *rqos) { return container_of(rqos, struct rq_wb, rqos); } static inline void wbt_clear_state(struct request *rq) { rq->wbt_flags = 0; } static inline enum wbt_flags wbt_flags(struct request *rq) { return rq->wbt_flags; } static inline bool wbt_is_tracked(struct request *rq) { return rq->wbt_flags & WBT_TRACKED; } static inline bool wbt_is_read(struct request *rq) { return rq->wbt_flags & WBT_READ; } enum { /* * Default setting, we'll scale up (to 75% of QD max) or down (min 1) * from here depending on device stats */ RWB_DEF_DEPTH = 16, /* * 100msec window */ RWB_WINDOW_NSEC = 100 * 1000 * 1000ULL, /* * Disregard stats, if we don't meet this minimum */ RWB_MIN_WRITE_SAMPLES = 3, /* * If we have this number of consecutive windows with not enough * information to scale up or down, scale up. */ RWB_UNKNOWN_BUMP = 5, }; static inline bool rwb_enabled(struct rq_wb *rwb) { return rwb && rwb->enable_state != WBT_STATE_OFF_DEFAULT && rwb->enable_state != WBT_STATE_OFF_MANUAL; } static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) { if (rwb_enabled(rwb)) { const unsigned long cur = jiffies; if (cur != *var) *var = cur; } } /* * If a task was rate throttled in balance_dirty_pages() within the last * second or so, use that to indicate a higher cleaning rate. */ static bool wb_recent_wait(struct rq_wb *rwb) { struct bdi_writeback *wb = &rwb->rqos.disk->bdi->wb; return time_before(jiffies, wb->dirty_sleep + HZ); } static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, enum wbt_flags wb_acct) { if (wb_acct & WBT_KSWAPD) return &rwb->rq_wait[WBT_RWQ_KSWAPD]; else if (wb_acct & WBT_DISCARD) return &rwb->rq_wait[WBT_RWQ_DISCARD]; return &rwb->rq_wait[WBT_RWQ_BG]; } static void rwb_wake_all(struct rq_wb *rwb) { int i; for (i = 0; i < WBT_NUM_RWQ; i++) { struct rq_wait *rqw = &rwb->rq_wait[i]; if (wq_has_sleeper(&rqw->wait)) wake_up_all(&rqw->wait); } } static void wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw, enum wbt_flags wb_acct) { int inflight, limit; inflight = atomic_dec_return(&rqw->inflight); /* * For discards, our limit is always the background. For writes, if * the device does write back caching, drop further down before we * wake people up. */ if (wb_acct & WBT_DISCARD) limit = rwb->wb_background; else if (rwb->wc && !wb_recent_wait(rwb)) limit = 0; else limit = rwb->wb_normal; /* * Don't wake anyone up if we are above the normal limit. */ if (inflight && inflight >= limit) return; if (wq_has_sleeper(&rqw->wait)) { int diff = limit - inflight; if (!inflight || diff >= rwb->wb_background / 2) wake_up_all(&rqw->wait); } } static void __wbt_done(struct rq_qos *rqos, enum wbt_flags wb_acct) { struct rq_wb *rwb = RQWB(rqos); struct rq_wait *rqw; if (!(wb_acct & WBT_TRACKED)) return; rqw = get_rq_wait(rwb, wb_acct); wbt_rqw_done(rwb, rqw, wb_acct); } /* * Called on completion of a request. Note that it's also called when * a request is merged, when the request gets freed. */ static void wbt_done(struct rq_qos *rqos, struct request *rq) { struct rq_wb *rwb = RQWB(rqos); if (!wbt_is_tracked(rq)) { if (rwb->sync_cookie == rq) { rwb->sync_issue = 0; rwb->sync_cookie = NULL; } if (wbt_is_read(rq)) wb_timestamp(rwb, &rwb->last_comp); } else { WARN_ON_ONCE(rq == rwb->sync_cookie); __wbt_done(rqos, wbt_flags(rq)); } wbt_clear_state(rq); } static inline bool stat_sample_valid(struct blk_rq_stat *stat) { /* * We need at least one read sample, and a minimum of * RWB_MIN_WRITE_SAMPLES. We require some write samples to know * that it's writes impacting us, and not just some sole read on * a device that is in a lower power state. */ return (stat[READ].nr_samples >= 1 && stat[WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES); } static u64 rwb_sync_issue_lat(struct rq_wb *rwb) { u64 now, issue = READ_ONCE(rwb->sync_issue); if (!issue || !rwb->sync_cookie) return 0; now = ktime_to_ns(ktime_get()); return now - issue; } static inline unsigned int wbt_inflight(struct rq_wb *rwb) { unsigned int i, ret = 0; for (i = 0; i < WBT_NUM_RWQ; i++) ret += atomic_read(&rwb->rq_wait[i].inflight); return ret; } enum { LAT_OK = 1, LAT_UNKNOWN, LAT_UNKNOWN_WRITES, LAT_EXCEEDED, }; static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) { struct backing_dev_info *bdi = rwb->rqos.disk->bdi; struct rq_depth *rqd = &rwb->rq_depth; u64 thislat; /* * If our stored sync issue exceeds the window size, or it * exceeds our min target AND we haven't logged any entries, * flag the latency as exceeded. wbt works off completion latencies, * but for a flooded device, a single sync IO can take a long time * to complete after being issued. If this time exceeds our * monitoring window AND we didn't see any other completions in that * window, then count that sync IO as a violation of the latency. */ thislat = rwb_sync_issue_lat(rwb); if (thislat > rwb->cur_win_nsec || (thislat > rwb->min_lat_nsec && !stat[READ].nr_samples)) { trace_wbt_lat(bdi, thislat); return LAT_EXCEEDED; } /* * No read/write mix, if stat isn't valid */ if (!stat_sample_valid(stat)) { /* * If we had writes in this stat window and the window is * current, we're only doing writes. If a task recently * waited or still has writes in flights, consider us doing * just writes as well. */ if (stat[WRITE].nr_samples || wb_recent_wait(rwb) || wbt_inflight(rwb)) return LAT_UNKNOWN_WRITES; return LAT_UNKNOWN; } /* * If the 'min' latency exceeds our target, step down. */ if (stat[READ].min > rwb->min_lat_nsec) { trace_wbt_lat(bdi, stat[READ].min); trace_wbt_stat(bdi, stat); return LAT_EXCEEDED; } if (rqd->scale_step) trace_wbt_stat(bdi, stat); return LAT_OK; } static void rwb_trace_step(struct rq_wb *rwb, const char *msg) { struct backing_dev_info *bdi = rwb->rqos.disk->bdi; struct rq_depth *rqd = &rwb->rq_depth; trace_wbt_step(bdi, msg, rqd->scale_step, rwb->cur_win_nsec, rwb->wb_background, rwb->wb_normal, rqd->max_depth); } static void calc_wb_limits(struct rq_wb *rwb) { if (rwb->min_lat_nsec == 0) { rwb->wb_normal = rwb->wb_background = 0; } else if (rwb->rq_depth.max_depth <= 2) { rwb->wb_normal = rwb->rq_depth.max_depth; rwb->wb_background = 1; } else { rwb->wb_normal = (rwb->rq_depth.max_depth + 1) / 2; rwb->wb_background = (rwb->rq_depth.max_depth + 3) / 4; } } static void scale_up(struct rq_wb *rwb) { if (!rq_depth_scale_up(&rwb->rq_depth)) return; calc_wb_limits(rwb); rwb->unknown_cnt = 0; rwb_wake_all(rwb); rwb_trace_step(rwb, tracepoint_string("scale up")); } static void scale_down(struct rq_wb *rwb, bool hard_throttle) { if (!rq_depth_scale_down(&rwb->rq_depth, hard_throttle)) return; calc_wb_limits(rwb); rwb->unknown_cnt = 0; rwb_trace_step(rwb, tracepoint_string("scale down")); } static void rwb_arm_timer(struct rq_wb *rwb) { struct rq_depth *rqd = &rwb->rq_depth; if (rqd->scale_step > 0) { /* * We should speed this up, using some variant of a fast * integer inverse square root calculation. Since we only do * this for every window expiration, it's not a huge deal, * though. */ rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4, int_sqrt((rqd->scale_step + 1) << 8)); } else { /* * For step < 0, we don't want to increase/decrease the * window size. */ rwb->cur_win_nsec = rwb->win_nsec; } blk_stat_activate_nsecs(rwb->cb, rwb->cur_win_nsec); } static void wb_timer_fn(struct blk_stat_callback *cb) { struct rq_wb *rwb = cb->data; struct rq_depth *rqd = &rwb->rq_depth; unsigned int inflight = wbt_inflight(rwb); int status; if (!rwb->rqos.disk) return; status = latency_exceeded(rwb, cb->stat); trace_wbt_timer(rwb->rqos.disk->bdi, status, rqd->scale_step, inflight); /* * If we exceeded the latency target, step down. If we did not, * step one level up. If we don't know enough to say either exceeded * or ok, then don't do anything. */ switch (status) { case LAT_EXCEEDED: scale_down(rwb, true); break; case LAT_OK: scale_up(rwb); break; case LAT_UNKNOWN_WRITES: /* * We started a the center step, but don't have a valid * read/write sample, but we do have writes going on. * Allow step to go negative, to increase write perf. */ scale_up(rwb); break; case LAT_UNKNOWN: if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP) break; /* * We get here when previously scaled reduced depth, and we * currently don't have a valid read/write sample. For that * case, slowly return to center state (step == 0). */ if (rqd->scale_step > 0) scale_up(rwb); else if (rqd->scale_step < 0) scale_down(rwb, false); break; default: break; } /* * Re-arm timer, if we have IO in flight */ if (rqd->scale_step || inflight) rwb_arm_timer(rwb); } static void wbt_update_limits(struct rq_wb *rwb) { struct rq_depth *rqd = &rwb->rq_depth; rqd->scale_step = 0; rqd->scaled_max = false; rq_depth_calc_max_depth(rqd); calc_wb_limits(rwb); rwb_wake_all(rwb); } bool wbt_disabled(struct request_queue *q) { struct rq_qos *rqos = wbt_rq_qos(q); return !rqos || !rwb_enabled(RQWB(rqos)); } u64 wbt_get_min_lat(struct request_queue *q) { struct rq_qos *rqos = wbt_rq_qos(q); if (!rqos) return 0; return RQWB(rqos)->min_lat_nsec; } void wbt_set_min_lat(struct request_queue *q, u64 val) { struct rq_qos *rqos = wbt_rq_qos(q); if (!rqos) return; RQWB(rqos)->min_lat_nsec = val; if (val) RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL; else RQWB(rqos)->enable_state = WBT_STATE_OFF_MANUAL; wbt_update_limits(RQWB(rqos)); } static bool close_io(struct rq_wb *rwb) { const unsigned long now = jiffies; return time_before(now, rwb->last_issue + HZ / 10) || time_before(now, rwb->last_comp + HZ / 10); } #define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO) static inline unsigned int get_limit(struct rq_wb *rwb, blk_opf_t opf) { unsigned int limit; if ((opf & REQ_OP_MASK) == REQ_OP_DISCARD) return rwb->wb_background; /* * At this point we know it's a buffered write. If this is * kswapd trying to free memory, or REQ_SYNC is set, then * it's WB_SYNC_ALL writeback, and we'll use the max limit for * that. If the write is marked as a background write, then use * the idle limit, or go to normal if we haven't had competing * IO for a bit. */ if ((opf & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd()) limit = rwb->rq_depth.max_depth; else if ((opf & REQ_BACKGROUND) || close_io(rwb)) { /* * If less than 100ms since we completed unrelated IO, * limit us to half the depth for background writeback. */ limit = rwb->wb_background; } else limit = rwb->wb_normal; return limit; } struct wbt_wait_data { struct rq_wb *rwb; enum wbt_flags wb_acct; blk_opf_t opf; }; static bool wbt_inflight_cb(struct rq_wait *rqw, void *private_data) { struct wbt_wait_data *data = private_data; return rq_wait_inc_below(rqw, get_limit(data->rwb, data->opf)); } static void wbt_cleanup_cb(struct rq_wait *rqw, void *private_data) { struct wbt_wait_data *data = private_data; wbt_rqw_done(data->rwb, rqw, data->wb_acct); } /* * Block if we will exceed our limit, or if we are currently waiting for * the timer to kick off queuing again. */ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct, blk_opf_t opf) { struct rq_wait *rqw = get_rq_wait(rwb, wb_acct); struct wbt_wait_data data = { .rwb = rwb, .wb_acct = wb_acct, .opf = opf, }; rq_qos_wait(rqw, &data, wbt_inflight_cb, wbt_cleanup_cb); } static inline bool wbt_should_throttle(struct bio *bio) { switch (bio_op(bio)) { case REQ_OP_WRITE: /* * Don't throttle WRITE_ODIRECT */ if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) == (REQ_SYNC | REQ_IDLE)) return false; fallthrough; case REQ_OP_DISCARD: return true; default: return false; } } static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio) { enum wbt_flags flags = 0; if (!rwb_enabled(rwb)) return 0; if (bio_op(bio) == REQ_OP_READ) { flags = WBT_READ; } else if (wbt_should_throttle(bio)) { if (current_is_kswapd()) flags |= WBT_KSWAPD; if (bio_op(bio) == REQ_OP_DISCARD) flags |= WBT_DISCARD; flags |= WBT_TRACKED; } return flags; } static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio) { struct rq_wb *rwb = RQWB(rqos); enum wbt_flags flags = bio_to_wbt_flags(rwb, bio); __wbt_done(rqos, flags); } /* * May sleep, if we have exceeded the writeback limits. Caller can pass * in an irq held spinlock, if it holds one when calling this function. * If we do sleep, we'll release and re-grab it. */ static void wbt_wait(struct rq_qos *rqos, struct bio *bio) { struct rq_wb *rwb = RQWB(rqos); enum wbt_flags flags; flags = bio_to_wbt_flags(rwb, bio); if (!(flags & WBT_TRACKED)) { if (flags & WBT_READ) wb_timestamp(rwb, &rwb->last_issue); return; } __wbt_wait(rwb, flags, bio->bi_opf); if (!blk_stat_is_active(rwb->cb)) rwb_arm_timer(rwb); } static void wbt_track(struct rq_qos *rqos, struct request *rq, struct bio *bio) { struct rq_wb *rwb = RQWB(rqos); rq->wbt_flags |= bio_to_wbt_flags(rwb, bio); } static void wbt_issue(struct rq_qos *rqos, struct request *rq) { struct rq_wb *rwb = RQWB(rqos); if (!rwb_enabled(rwb)) return; /* * Track sync issue, in case it takes a long time to complete. Allows us * to react quicker, if a sync IO takes a long time to complete. Note * that this is just a hint. The request can go away when it completes, * so it's important we never dereference it. We only use the address to * compare with, which is why we store the sync_issue time locally. */ if (wbt_is_read(rq) && !rwb->sync_issue) { rwb->sync_cookie = rq; rwb->sync_issue = rq->io_start_time_ns; } } static void wbt_requeue(struct rq_qos *rqos, struct request *rq) { struct rq_wb *rwb = RQWB(rqos); if (!rwb_enabled(rwb)) return; if (rq == rwb->sync_cookie) { rwb->sync_issue = 0; rwb->sync_cookie = NULL; } } void wbt_set_write_cache(struct request_queue *q, bool write_cache_on) { struct rq_qos *rqos = wbt_rq_qos(q); if (rqos) RQWB(rqos)->wc = write_cache_on; } /* * Enable wbt if defaults are configured that way */ void wbt_enable_default(struct gendisk *disk) { struct request_queue *q = disk->queue; struct rq_qos *rqos; bool enable = IS_ENABLED(CONFIG_BLK_WBT_MQ); if (q->elevator && test_bit(ELEVATOR_FLAG_DISABLE_WBT, &q->elevator->flags)) enable = false; /* Throttling already enabled? */ rqos = wbt_rq_qos(q); if (rqos) { if (enable && RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT) RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT; return; } /* Queue not registered? Maybe shutting down... */ if (!blk_queue_registered(q)) return; if (queue_is_mq(q) && enable) wbt_init(disk); } EXPORT_SYMBOL_GPL(wbt_enable_default); u64 wbt_default_latency_nsec(struct request_queue *q) { /* * We default to 2msec for non-rotational storage, and 75msec * for rotational storage. */ if (blk_queue_nonrot(q)) return 2000000ULL; else return 75000000ULL; } static int wbt_data_dir(const struct request *rq) { const enum req_op op = req_op(rq); if (op == REQ_OP_READ) return READ; else if (op_is_write(op)) return WRITE; /* don't account */ return -1; } static void wbt_queue_depth_changed(struct rq_qos *rqos) { RQWB(rqos)->rq_depth.queue_depth = blk_queue_depth(rqos->disk->queue); wbt_update_limits(RQWB(rqos)); } static void wbt_exit(struct rq_qos *rqos) { struct rq_wb *rwb = RQWB(rqos); blk_stat_remove_callback(rqos->disk->queue, rwb->cb); blk_stat_free_callback(rwb->cb); kfree(rwb); } /* * Disable wbt, if enabled by default. */ void wbt_disable_default(struct gendisk *disk) { struct rq_qos *rqos = wbt_rq_qos(disk->queue); struct rq_wb *rwb; if (!rqos) return; rwb = RQWB(rqos); if (rwb->enable_state == WBT_STATE_ON_DEFAULT) { blk_stat_deactivate(rwb->cb); rwb->enable_state = WBT_STATE_OFF_DEFAULT; } } EXPORT_SYMBOL_GPL(wbt_disable_default); #ifdef CONFIG_BLK_DEBUG_FS static int wbt_curr_win_nsec_show(void *data, struct seq_file *m) { struct rq_qos *rqos = data; struct rq_wb *rwb = RQWB(rqos); seq_printf(m, "%llu\n", rwb->cur_win_nsec); return 0; } static int wbt_enabled_show(void *data, struct seq_file *m) { struct rq_qos *rqos = data; struct rq_wb *rwb = RQWB(rqos); seq_printf(m, "%d\n", rwb->enable_state); return 0; } static int wbt_id_show(void *data, struct seq_file *m) { struct rq_qos *rqos = data; seq_printf(m, "%u\n", rqos->id); return 0; } static int wbt_inflight_show(void *data, struct seq_file *m) { struct rq_qos *rqos = data; struct rq_wb *rwb = RQWB(rqos); int i; for (i = 0; i < WBT_NUM_RWQ; i++) seq_printf(m, "%d: inflight %d\n", i, atomic_read(&rwb->rq_wait[i].inflight)); return 0; } static int wbt_min_lat_nsec_show(void *data, struct seq_file *m) { struct rq_qos *rqos = data; struct rq_wb *rwb = RQWB(rqos); seq_printf(m, "%lu\n", rwb->min_lat_nsec); return 0; } static int wbt_unknown_cnt_show(void *data, struct seq_file *m) { struct rq_qos *rqos = data; struct rq_wb *rwb = RQWB(rqos); seq_printf(m, "%u\n", rwb->unknown_cnt); return 0; } static int wbt_normal_show(void *data, struct seq_file *m) { struct rq_qos *rqos = data; struct rq_wb *rwb = RQWB(rqos); seq_printf(m, "%u\n", rwb->wb_normal); return 0; } static int wbt_background_show(void *data, struct seq_file *m) { struct rq_qos *rqos = data; struct rq_wb *rwb = RQWB(rqos); seq_printf(m, "%u\n", rwb->wb_background); return 0; } static const struct blk_mq_debugfs_attr wbt_debugfs_attrs[] = { {"curr_win_nsec", 0400, wbt_curr_win_nsec_show}, {"enabled", 0400, wbt_enabled_show}, {"id", 0400, wbt_id_show}, {"inflight", 0400, wbt_inflight_show}, {"min_lat_nsec", 0400, wbt_min_lat_nsec_show}, {"unknown_cnt", 0400, wbt_unknown_cnt_show}, {"wb_normal", 0400, wbt_normal_show}, {"wb_background", 0400, wbt_background_show}, {}, }; #endif static const struct rq_qos_ops wbt_rqos_ops = { .throttle = wbt_wait, .issue = wbt_issue, .track = wbt_track, .requeue = wbt_requeue, .done = wbt_done, .cleanup = wbt_cleanup, .queue_depth_changed = wbt_queue_depth_changed, .exit = wbt_exit, #ifdef CONFIG_BLK_DEBUG_FS .debugfs_attrs = wbt_debugfs_attrs, #endif }; int wbt_init(struct gendisk *disk) { struct request_queue *q = disk->queue; struct rq_wb *rwb; int i; int ret; rwb = kzalloc(sizeof(*rwb), GFP_KERNEL); if (!rwb) return -ENOMEM; rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb); if (!rwb->cb) { kfree(rwb); return -ENOMEM; } for (i = 0; i < WBT_NUM_RWQ; i++) rq_wait_init(&rwb->rq_wait[i]); rwb->last_comp = rwb->last_issue = jiffies; rwb->win_nsec = RWB_WINDOW_NSEC; rwb->enable_state = WBT_STATE_ON_DEFAULT; rwb->wc = test_bit(QUEUE_FLAG_WC, &q->queue_flags); rwb->rq_depth.default_depth = RWB_DEF_DEPTH; rwb->min_lat_nsec = wbt_default_latency_nsec(q); rwb->rq_depth.queue_depth = blk_queue_depth(q); wbt_update_limits(rwb); /* * Assign rwb and add the stats callback. */ mutex_lock(&q->rq_qos_mutex); ret = rq_qos_add(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops); mutex_unlock(&q->rq_qos_mutex); if (ret) goto err_free; blk_stat_add_callback(q, rwb->cb); return 0; err_free: blk_stat_free_callback(rwb->cb); kfree(rwb); return ret; }
43 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ASM_GENERIC_CHECKSUM_H #define __ASM_GENERIC_CHECKSUM_H /* * computes the checksum of a memory block at buff, length len, * and adds in "sum" (32-bit) * * returns a 32-bit number suitable for feeding into itself * or csum_tcpudp_magic * * this function must be called with even lengths, except * for the last fragment, which may be odd * * it's best to have buff aligned on a 32-bit boundary */ extern __wsum csum_partial(const void *buff, int len, __wsum sum); #ifndef ip_fast_csum /* * This is a version of ip_compute_csum() optimized for IP headers, * which always checksum on 4 octet boundaries. */ extern __sum16 ip_fast_csum(const void *iph, unsigned int ihl); #endif #ifndef csum_fold /* * Fold a partial checksum */ static inline __sum16 csum_fold(__wsum csum) { u32 sum = (__force u32)csum; sum = (sum & 0xffff) + (sum >> 16); sum = (sum & 0xffff) + (sum >> 16); return (__force __sum16)~sum; } #endif #ifndef csum_tcpudp_nofold /* * computes the checksum of the TCP/UDP pseudo-header * returns a 16-bit checksum, already complemented */ extern __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __wsum sum); #endif #ifndef csum_tcpudp_magic static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __wsum sum) { return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum)); } #endif /* * this routine is used for miscellaneous IP-like checksums, mainly * in icmp.c */ extern __sum16 ip_compute_csum(const void *buff, int len); #endif /* __ASM_GENERIC_CHECKSUM_H */
234 233 231 15 2 230 4 226 226 49 175 174 110 174 4 3 2 14 7 9 8 12 5 3 2 1 233 35 34 34 34 35 34 34 2 2 2 2 2 9 2 1 2 2 2 9 5 5 5 98 98 98 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 // SPDX-License-Identifier: GPL-2.0 /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * The IP to API glue. * * Authors: see ip.c * * Fixes: * Many : Split from ip.c , see ip.c for history. * Martin Mares : TOS setting fixed. * Alan Cox : Fixed a couple of oopses in Martin's * TOS tweaks. * Mike McLagan : Routing by source */ #include <linux/module.h> #include <linux/types.h> #include <linux/mm.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/icmp.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/slab.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> #include <net/tcp_states.h> #include <linux/udp.h> #include <linux/igmp.h> #include <linux/netfilter.h> #include <linux/route.h> #include <linux/mroute.h> #include <net/inet_ecn.h> #include <net/route.h> #include <net/xfrm.h> #include <net/compat.h> #include <net/checksum.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/transp_v6.h> #endif #include <net/ip_fib.h> #include <linux/errqueue.h> #include <linux/uaccess.h> #include <linux/bpfilter.h> /* * SOL_IP control messages. */ static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) { struct in_pktinfo info = *PKTINFO_SKB_CB(skb); info.ipi_addr.s_addr = ip_hdr(skb)->daddr; put_cmsg(msg, SOL_IP, IP_PKTINFO, sizeof(info), &info); } static void ip_cmsg_recv_ttl(struct msghdr *msg, struct sk_buff *skb) { int ttl = ip_hdr(skb)->ttl; put_cmsg(msg, SOL_IP, IP_TTL, sizeof(int), &ttl); } static void ip_cmsg_recv_tos(struct msghdr *msg, struct sk_buff *skb) { put_cmsg(msg, SOL_IP, IP_TOS, 1, &ip_hdr(skb)->tos); } static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb) { if (IPCB(skb)->opt.optlen == 0) return; put_cmsg(msg, SOL_IP, IP_RECVOPTS, IPCB(skb)->opt.optlen, ip_hdr(skb) + 1); } static void ip_cmsg_recv_retopts(struct net *net, struct msghdr *msg, struct sk_buff *skb) { unsigned char optbuf[sizeof(struct ip_options) + 40]; struct ip_options *opt = (struct ip_options *)optbuf; if (IPCB(skb)->opt.optlen == 0) return; if (ip_options_echo(net, opt, skb)) { msg->msg_flags |= MSG_CTRUNC; return; } ip_options_undo(opt); put_cmsg(msg, SOL_IP, IP_RETOPTS, opt->optlen, opt->__data); } static void ip_cmsg_recv_fragsize(struct msghdr *msg, struct sk_buff *skb) { int val; if (IPCB(skb)->frag_max_size == 0) return; val = IPCB(skb)->frag_max_size; put_cmsg(msg, SOL_IP, IP_RECVFRAGSIZE, sizeof(val), &val); } static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb, int tlen, int offset) { __wsum csum = skb->csum; if (skb->ip_summed != CHECKSUM_COMPLETE) return; if (offset != 0) { int tend_off = skb_transport_offset(skb) + tlen; csum = csum_sub(csum, skb_checksum(skb, tend_off, offset, 0)); } put_cmsg(msg, SOL_IP, IP_CHECKSUM, sizeof(__wsum), &csum); } static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb) { char *secdata; u32 seclen, secid; int err; err = security_socket_getpeersec_dgram(NULL, skb, &secid); if (err) return; err = security_secid_to_secctx(secid, &secdata, &seclen); if (err) return; put_cmsg(msg, SOL_IP, SCM_SECURITY, seclen, secdata); security_release_secctx(secdata, seclen); } static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) { __be16 _ports[2], *ports; struct sockaddr_in sin; /* All current transport protocols have the port numbers in the * first four bytes of the transport header and this function is * written with this assumption in mind. */ ports = skb_header_pointer(skb, skb_transport_offset(skb), sizeof(_ports), &_ports); if (!ports) return; sin.sin_family = AF_INET; sin.sin_addr.s_addr = ip_hdr(skb)->daddr; sin.sin_port = ports[1]; memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); put_cmsg(msg, SOL_IP, IP_ORIGDSTADDR, sizeof(sin), &sin); } void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, int tlen, int offset) { unsigned long flags = inet_cmsg_flags(inet_sk(sk)); if (!flags) return; /* Ordered by supposed usage frequency */ if (flags & IP_CMSG_PKTINFO) { ip_cmsg_recv_pktinfo(msg, skb); flags &= ~IP_CMSG_PKTINFO; if (!flags) return; } if (flags & IP_CMSG_TTL) { ip_cmsg_recv_ttl(msg, skb); flags &= ~IP_CMSG_TTL; if (!flags) return; } if (flags & IP_CMSG_TOS) { ip_cmsg_recv_tos(msg, skb); flags &= ~IP_CMSG_TOS; if (!flags) return; } if (flags & IP_CMSG_RECVOPTS) { ip_cmsg_recv_opts(msg, skb); flags &= ~IP_CMSG_RECVOPTS; if (!flags) return; } if (flags & IP_CMSG_RETOPTS) { ip_cmsg_recv_retopts(sock_net(sk), msg, skb); flags &= ~IP_CMSG_RETOPTS; if (!flags) return; } if (flags & IP_CMSG_PASSSEC) { ip_cmsg_recv_security(msg, skb); flags &= ~IP_CMSG_PASSSEC; if (!flags) return; } if (flags & IP_CMSG_ORIGDSTADDR) { ip_cmsg_recv_dstaddr(msg, skb); flags &= ~IP_CMSG_ORIGDSTADDR; if (!flags) return; } if (flags & IP_CMSG_CHECKSUM) ip_cmsg_recv_checksum(msg, skb, tlen, offset); if (flags & IP_CMSG_RECVFRAGSIZE) ip_cmsg_recv_fragsize(msg, skb); } EXPORT_SYMBOL(ip_cmsg_recv_offset); int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, bool allow_ipv6) { int err, val; struct cmsghdr *cmsg; struct net *net = sock_net(sk); for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; #if IS_ENABLED(CONFIG_IPV6) if (allow_ipv6 && cmsg->cmsg_level == SOL_IPV6 && cmsg->cmsg_type == IPV6_PKTINFO) { struct in6_pktinfo *src_info; if (cmsg->cmsg_len < CMSG_LEN(sizeof(*src_info))) return -EINVAL; src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg); if (!ipv6_addr_v4mapped(&src_info->ipi6_addr)) return -EINVAL; if (src_info->ipi6_ifindex) ipc->oif = src_info->ipi6_ifindex; ipc->addr = src_info->ipi6_addr.s6_addr32[3]; continue; } #endif if (cmsg->cmsg_level == SOL_SOCKET) { err = __sock_cmsg_send(sk, cmsg, &ipc->sockc); if (err) return err; continue; } if (cmsg->cmsg_level != SOL_IP) continue; switch (cmsg->cmsg_type) { case IP_RETOPTS: err = cmsg->cmsg_len - sizeof(struct cmsghdr); /* Our caller is responsible for freeing ipc->opt */ err = ip_options_get(net, &ipc->opt, KERNEL_SOCKPTR(CMSG_DATA(cmsg)), err < 40 ? err : 40); if (err) return err; break; case IP_PKTINFO: { struct in_pktinfo *info; if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct in_pktinfo))) return -EINVAL; info = (struct in_pktinfo *)CMSG_DATA(cmsg); if (info->ipi_ifindex) ipc->oif = info->ipi_ifindex; ipc->addr = info->ipi_spec_dst.s_addr; break; } case IP_TTL: if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) return -EINVAL; val = *(int *)CMSG_DATA(cmsg); if (val < 1 || val > 255) return -EINVAL; ipc->ttl = val; break; case IP_TOS: if (cmsg->cmsg_len == CMSG_LEN(sizeof(int))) val = *(int *)CMSG_DATA(cmsg); else if (cmsg->cmsg_len == CMSG_LEN(sizeof(u8))) val = *(u8 *)CMSG_DATA(cmsg); else return -EINVAL; if (val < 0 || val > 255) return -EINVAL; ipc->tos = val; ipc->priority = rt_tos2priority(ipc->tos); break; case IP_PROTOCOL: if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) return -EINVAL; val = *(int *)CMSG_DATA(cmsg); if (val < 1 || val > 255) return -EINVAL; ipc->protocol = val; break; default: return -EINVAL; } } return 0; } static void ip_ra_destroy_rcu(struct rcu_head *head) { struct ip_ra_chain *ra = container_of(head, struct ip_ra_chain, rcu); sock_put(ra->saved_sk); kfree(ra); } int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct sock *)) { struct ip_ra_chain *ra, *new_ra; struct ip_ra_chain __rcu **rap; struct net *net = sock_net(sk); if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW) return -EINVAL; new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; if (on && !new_ra) return -ENOMEM; mutex_lock(&net->ipv4.ra_mutex); for (rap = &net->ipv4.ra_chain; (ra = rcu_dereference_protected(*rap, lockdep_is_held(&net->ipv4.ra_mutex))) != NULL; rap = &ra->next) { if (ra->sk == sk) { if (on) { mutex_unlock(&net->ipv4.ra_mutex); kfree(new_ra); return -EADDRINUSE; } /* dont let ip_call_ra_chain() use sk again */ ra->sk = NULL; RCU_INIT_POINTER(*rap, ra->next); mutex_unlock(&net->ipv4.ra_mutex); if (ra->destructor) ra->destructor(sk); /* * Delay sock_put(sk) and kfree(ra) after one rcu grace * period. This guarantee ip_call_ra_chain() dont need * to mess with socket refcounts. */ ra->saved_sk = sk; call_rcu(&ra->rcu, ip_ra_destroy_rcu); return 0; } } if (!new_ra) { mutex_unlock(&net->ipv4.ra_mutex); return -ENOBUFS; } new_ra->sk = sk; new_ra->destructor = destructor; RCU_INIT_POINTER(new_ra->next, ra); rcu_assign_pointer(*rap, new_ra); sock_hold(sk); mutex_unlock(&net->ipv4.ra_mutex); return 0; } static void ipv4_icmp_error_rfc4884(const struct sk_buff *skb, struct sock_ee_data_rfc4884 *out) { switch (icmp_hdr(skb)->type) { case ICMP_DEST_UNREACH: case ICMP_TIME_EXCEEDED: case ICMP_PARAMETERPROB: ip_icmp_error_rfc4884(skb, out, sizeof(struct icmphdr), icmp_hdr(skb)->un.reserved[1] * 4); } } void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload) { struct sock_exterr_skb *serr; skb = skb_clone(skb, GFP_ATOMIC); if (!skb) return; serr = SKB_EXT_ERR(skb); serr->ee.ee_errno = err; serr->ee.ee_origin = SO_EE_ORIGIN_ICMP; serr->ee.ee_type = icmp_hdr(skb)->type; serr->ee.ee_code = icmp_hdr(skb)->code; serr->ee.ee_pad = 0; serr->ee.ee_info = info; serr->ee.ee_data = 0; serr->addr_offset = (u8 *)&(((struct iphdr *)(icmp_hdr(skb) + 1))->daddr) - skb_network_header(skb); serr->port = port; if (skb_pull(skb, payload - skb->data)) { if (inet_test_bit(RECVERR_RFC4884, sk)) ipv4_icmp_error_rfc4884(skb, &serr->ee.ee_rfc4884); skb_reset_transport_header(skb); if (sock_queue_err_skb(sk, skb) == 0) return; } kfree_skb(skb); } EXPORT_SYMBOL_GPL(ip_icmp_error); void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 info) { struct sock_exterr_skb *serr; struct iphdr *iph; struct sk_buff *skb; if (!inet_test_bit(RECVERR, sk)) return; skb = alloc_skb(sizeof(struct iphdr), GFP_ATOMIC); if (!skb) return; skb_put(skb, sizeof(struct iphdr)); skb_reset_network_header(skb); iph = ip_hdr(skb); iph->daddr = daddr; serr = SKB_EXT_ERR(skb); serr->ee.ee_errno = err; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_type = 0; serr->ee.ee_code = 0; serr->ee.ee_pad = 0; serr->ee.ee_info = info; serr->ee.ee_data = 0; serr->addr_offset = (u8 *)&iph->daddr - skb_network_header(skb); serr->port = port; __skb_pull(skb, skb_tail_pointer(skb) - skb->data); skb_reset_transport_header(skb); if (sock_queue_err_skb(sk, skb)) kfree_skb(skb); } /* For some errors we have valid addr_offset even with zero payload and * zero port. Also, addr_offset should be supported if port is set. */ static inline bool ipv4_datagram_support_addr(struct sock_exterr_skb *serr) { return serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL || serr->port; } /* IPv4 supports cmsg on all imcp errors and some timestamps * * Timestamp code paths do not initialize the fields expected by cmsg: * the PKTINFO fields in skb->cb[]. Fill those in here. */ static bool ipv4_datagram_support_cmsg(const struct sock *sk, struct sk_buff *skb, int ee_origin) { struct in_pktinfo *info; if (ee_origin == SO_EE_ORIGIN_ICMP) return true; if (ee_origin == SO_EE_ORIGIN_LOCAL) return false; /* Support IP_PKTINFO on tstamp packets if requested, to correlate * timestamp with egress dev. Not possible for packets without iif * or without payload (SOF_TIMESTAMPING_OPT_TSONLY). */ info = PKTINFO_SKB_CB(skb); if (!(READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_CMSG) || !info->ipi_ifindex) return false; info->ipi_spec_dst.s_addr = ip_hdr(skb)->saddr; return true; } /* * Handle MSG_ERRQUEUE */ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) { struct sock_exterr_skb *serr; struct sk_buff *skb; DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct { struct sock_extended_err ee; struct sockaddr_in offender; } errhdr; int err; int copied; err = -EAGAIN; skb = sock_dequeue_err_skb(sk); if (!skb) goto out; copied = skb->len; if (copied > len) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (unlikely(err)) { kfree_skb(skb); return err; } sock_recv_timestamp(msg, sk, skb); serr = SKB_EXT_ERR(skb); if (sin && ipv4_datagram_support_addr(serr)) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) + serr->addr_offset); sin->sin_port = serr->port; memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); *addr_len = sizeof(*sin); } memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); sin = &errhdr.offender; memset(sin, 0, sizeof(*sin)); if (ipv4_datagram_support_cmsg(sk, skb, serr->ee.ee_origin)) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; if (inet_cmsg_flags(inet_sk(sk))) ip_cmsg_recv(msg, skb); } put_cmsg(msg, SOL_IP, IP_RECVERR, sizeof(errhdr), &errhdr); /* Now we could try to dump offended packet options */ msg->msg_flags |= MSG_ERRQUEUE; err = copied; consume_skb(skb); out: return err; } void __ip_sock_set_tos(struct sock *sk, int val) { if (sk->sk_type == SOCK_STREAM) { val &= ~INET_ECN_MASK; val |= inet_sk(sk)->tos & INET_ECN_MASK; } if (inet_sk(sk)->tos != val) { inet_sk(sk)->tos = val; WRITE_ONCE(sk->sk_priority, rt_tos2priority(val)); sk_dst_reset(sk); } } void ip_sock_set_tos(struct sock *sk, int val) { lock_sock(sk); __ip_sock_set_tos(sk, val); release_sock(sk); } EXPORT_SYMBOL(ip_sock_set_tos); void ip_sock_set_freebind(struct sock *sk) { inet_set_bit(FREEBIND, sk); } EXPORT_SYMBOL(ip_sock_set_freebind); void ip_sock_set_recverr(struct sock *sk) { inet_set_bit(RECVERR, sk); } EXPORT_SYMBOL(ip_sock_set_recverr); int ip_sock_set_mtu_discover(struct sock *sk, int val) { if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT) return -EINVAL; lock_sock(sk); inet_sk(sk)->pmtudisc = val; release_sock(sk); return 0; } EXPORT_SYMBOL(ip_sock_set_mtu_discover); void ip_sock_set_pktinfo(struct sock *sk) { inet_set_bit(PKTINFO, sk); } EXPORT_SYMBOL(ip_sock_set_pktinfo); /* * Socket option code for IP. This is the end of the line after any * TCP,UDP etc options on an IP socket. */ static bool setsockopt_needs_rtnl(int optname) { switch (optname) { case IP_ADD_MEMBERSHIP: case IP_ADD_SOURCE_MEMBERSHIP: case IP_BLOCK_SOURCE: case IP_DROP_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: case IP_MSFILTER: case IP_UNBLOCK_SOURCE: case MCAST_BLOCK_SOURCE: case MCAST_MSFILTER: case MCAST_JOIN_GROUP: case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_UNBLOCK_SOURCE: return true; } return false; } static int set_mcast_msfilter(struct sock *sk, int ifindex, int numsrc, int fmode, struct sockaddr_storage *group, struct sockaddr_storage *list) { struct ip_msfilter *msf; struct sockaddr_in *psin; int err, i; msf = kmalloc(IP_MSFILTER_SIZE(numsrc), GFP_KERNEL); if (!msf) return -ENOBUFS; psin = (struct sockaddr_in *)group; if (psin->sin_family != AF_INET) goto Eaddrnotavail; msf->imsf_multiaddr = psin->sin_addr.s_addr; msf->imsf_interface = 0; msf->imsf_fmode = fmode; msf->imsf_numsrc = numsrc; for (i = 0; i < numsrc; ++i) { psin = (struct sockaddr_in *)&list[i]; if (psin->sin_family != AF_INET) goto Eaddrnotavail; msf->imsf_slist_flex[i] = psin->sin_addr.s_addr; } err = ip_mc_msfilter(sk, msf, ifindex); kfree(msf); return err; Eaddrnotavail: kfree(msf); return -EADDRNOTAVAIL; } static int copy_group_source_from_sockptr(struct group_source_req *greqs, sockptr_t optval, int optlen) { if (in_compat_syscall()) { struct compat_group_source_req gr32; if (optlen != sizeof(gr32)) return -EINVAL; if (copy_from_sockptr(&gr32, optval, sizeof(gr32))) return -EFAULT; greqs->gsr_interface = gr32.gsr_interface; greqs->gsr_group = gr32.gsr_group; greqs->gsr_source = gr32.gsr_source; } else { if (optlen != sizeof(*greqs)) return -EINVAL; if (copy_from_sockptr(greqs, optval, sizeof(*greqs))) return -EFAULT; } return 0; } static int do_mcast_group_source(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct group_source_req greqs; struct ip_mreq_source mreqs; struct sockaddr_in *psin; int omode, add, err; err = copy_group_source_from_sockptr(&greqs, optval, optlen); if (err) return err; if (greqs.gsr_group.ss_family != AF_INET || greqs.gsr_source.ss_family != AF_INET) return -EADDRNOTAVAIL; psin = (struct sockaddr_in *)&greqs.gsr_group; mreqs.imr_multiaddr = psin->sin_addr.s_addr; psin = (struct sockaddr_in *)&greqs.gsr_source; mreqs.imr_sourceaddr = psin->sin_addr.s_addr; mreqs.imr_interface = 0; /* use index for mc_source */ if (optname == MCAST_BLOCK_SOURCE) { omode = MCAST_EXCLUDE; add = 1; } else if (optname == MCAST_UNBLOCK_SOURCE) { omode = MCAST_EXCLUDE; add = 0; } else if (optname == MCAST_JOIN_SOURCE_GROUP) { struct ip_mreqn mreq; psin = (struct sockaddr_in *)&greqs.gsr_group; mreq.imr_multiaddr = psin->sin_addr; mreq.imr_address.s_addr = 0; mreq.imr_ifindex = greqs.gsr_interface; err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE); if (err && err != -EADDRINUSE) return err; greqs.gsr_interface = mreq.imr_ifindex; omode = MCAST_INCLUDE; add = 1; } else /* MCAST_LEAVE_SOURCE_GROUP */ { omode = MCAST_INCLUDE; add = 0; } return ip_mc_source(add, omode, sk, &mreqs, greqs.gsr_interface); } static int ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) { struct group_filter *gsf = NULL; int err; if (optlen < GROUP_FILTER_SIZE(0)) return -EINVAL; if (optlen > READ_ONCE(sysctl_optmem_max)) return -ENOBUFS; gsf = memdup_sockptr(optval, optlen); if (IS_ERR(gsf)) return PTR_ERR(gsf); /* numsrc >= (4G-140)/128 overflow in 32 bits */ err = -ENOBUFS; if (gsf->gf_numsrc >= 0x1ffffff || gsf->gf_numsrc > READ_ONCE(sock_net(sk)->ipv4.sysctl_igmp_max_msf)) goto out_free_gsf; err = -EINVAL; if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) goto out_free_gsf; err = set_mcast_msfilter(sk, gsf->gf_interface, gsf->gf_numsrc, gsf->gf_fmode, &gsf->gf_group, gsf->gf_slist_flex); out_free_gsf: kfree(gsf); return err; } static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) { const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); struct compat_group_filter *gf32; unsigned int n; void *p; int err; if (optlen < size0) return -EINVAL; if (optlen > READ_ONCE(sysctl_optmem_max) - 4) return -ENOBUFS; p = kmalloc(optlen + 4, GFP_KERNEL); if (!p) return -ENOMEM; gf32 = p + 4; /* we want ->gf_group and ->gf_slist_flex aligned */ err = -EFAULT; if (copy_from_sockptr(gf32, optval, optlen)) goto out_free_gsf; /* numsrc >= (4G-140)/128 overflow in 32 bits */ n = gf32->gf_numsrc; err = -ENOBUFS; if (n >= 0x1ffffff) goto out_free_gsf; err = -EINVAL; if (offsetof(struct compat_group_filter, gf_slist_flex[n]) > optlen) goto out_free_gsf; /* numsrc >= (4G-140)/128 overflow in 32 bits */ err = -ENOBUFS; if (n > READ_ONCE(sock_net(sk)->ipv4.sysctl_igmp_max_msf)) goto out_free_gsf; err = set_mcast_msfilter(sk, gf32->gf_interface, n, gf32->gf_fmode, &gf32->gf_group, gf32->gf_slist_flex); out_free_gsf: kfree(p); return err; } static int ip_mcast_join_leave(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct ip_mreqn mreq = { }; struct sockaddr_in *psin; struct group_req greq; if (optlen < sizeof(struct group_req)) return -EINVAL; if (copy_from_sockptr(&greq, optval, sizeof(greq))) return -EFAULT; psin = (struct sockaddr_in *)&greq.gr_group; if (psin->sin_family != AF_INET) return -EINVAL; mreq.imr_multiaddr = psin->sin_addr; mreq.imr_ifindex = greq.gr_interface; if (optname == MCAST_JOIN_GROUP) return ip_mc_join_group(sk, &mreq); return ip_mc_leave_group(sk, &mreq); } static int compat_ip_mcast_join_leave(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct compat_group_req greq; struct ip_mreqn mreq = { }; struct sockaddr_in *psin; if (optlen < sizeof(struct compat_group_req)) return -EINVAL; if (copy_from_sockptr(&greq, optval, sizeof(greq))) return -EFAULT; psin = (struct sockaddr_in *)&greq.gr_group; if (psin->sin_family != AF_INET) return -EINVAL; mreq.imr_multiaddr = psin->sin_addr; mreq.imr_ifindex = greq.gr_interface; if (optname == MCAST_JOIN_GROUP) return ip_mc_join_group(sk, &mreq); return ip_mc_leave_group(sk, &mreq); } DEFINE_STATIC_KEY_FALSE(ip4_min_ttl); int do_ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); int val = 0, err; bool needs_rtnl = setsockopt_needs_rtnl(optname); switch (optname) { case IP_PKTINFO: case IP_RECVTTL: case IP_RECVOPTS: case IP_RECVTOS: case IP_RETOPTS: case IP_TOS: case IP_TTL: case IP_HDRINCL: case IP_MTU_DISCOVER: case IP_RECVERR: case IP_ROUTER_ALERT: case IP_FREEBIND: case IP_PASSSEC: case IP_TRANSPARENT: case IP_MINTTL: case IP_NODEFRAG: case IP_BIND_ADDRESS_NO_PORT: case IP_UNICAST_IF: case IP_MULTICAST_TTL: case IP_MULTICAST_ALL: case IP_MULTICAST_LOOP: case IP_RECVORIGDSTADDR: case IP_CHECKSUM: case IP_RECVFRAGSIZE: case IP_RECVERR_RFC4884: case IP_LOCAL_PORT_RANGE: if (optlen >= sizeof(int)) { if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; } else if (optlen >= sizeof(char)) { unsigned char ucval; if (copy_from_sockptr(&ucval, optval, sizeof(ucval))) return -EFAULT; val = (int) ucval; } } /* If optlen==0, it is equivalent to val == 0 */ if (optname == IP_ROUTER_ALERT) return ip_ra_control(sk, val ? 1 : 0, NULL); if (ip_mroute_opt(optname)) return ip_mroute_setsockopt(sk, optname, optval, optlen); /* Handle options that can be set without locking the socket. */ switch (optname) { case IP_PKTINFO: inet_assign_bit(PKTINFO, sk, val); return 0; case IP_RECVTTL: inet_assign_bit(TTL, sk, val); return 0; case IP_RECVTOS: inet_assign_bit(TOS, sk, val); return 0; case IP_RECVOPTS: inet_assign_bit(RECVOPTS, sk, val); return 0; case IP_RETOPTS: inet_assign_bit(RETOPTS, sk, val); return 0; case IP_PASSSEC: inet_assign_bit(PASSSEC, sk, val); return 0; case IP_RECVORIGDSTADDR: inet_assign_bit(ORIGDSTADDR, sk, val); return 0; case IP_RECVFRAGSIZE: if (sk->sk_type != SOCK_RAW && sk->sk_type != SOCK_DGRAM) return -EINVAL; inet_assign_bit(RECVFRAGSIZE, sk, val); return 0; case IP_RECVERR: inet_assign_bit(RECVERR, sk, val); if (!val) skb_errqueue_purge(&sk->sk_error_queue); return 0; case IP_RECVERR_RFC4884: if (val < 0 || val > 1) return -EINVAL; inet_assign_bit(RECVERR_RFC4884, sk, val); return 0; case IP_FREEBIND: if (optlen < 1) return -EINVAL; inet_assign_bit(FREEBIND, sk, val); return 0; case IP_HDRINCL: if (sk->sk_type != SOCK_RAW) return -ENOPROTOOPT; inet_assign_bit(HDRINCL, sk, val); return 0; case IP_MULTICAST_LOOP: if (optlen < 1) return -EINVAL; inet_assign_bit(MC_LOOP, sk, val); return 0; case IP_MULTICAST_ALL: if (optlen < 1) return -EINVAL; if (val != 0 && val != 1) return -EINVAL; inet_assign_bit(MC_ALL, sk, val); return 0; case IP_TRANSPARENT: if (!!val && !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (optlen < 1) return -EINVAL; inet_assign_bit(TRANSPARENT, sk, val); return 0; case IP_NODEFRAG: if (sk->sk_type != SOCK_RAW) return -ENOPROTOOPT; inet_assign_bit(NODEFRAG, sk, val); return 0; case IP_BIND_ADDRESS_NO_PORT: inet_assign_bit(BIND_ADDRESS_NO_PORT, sk, val); return 0; case IP_TTL: if (optlen < 1) return -EINVAL; if (val != -1 && (val < 1 || val > 255)) return -EINVAL; WRITE_ONCE(inet->uc_ttl, val); return 0; case IP_MINTTL: if (optlen < 1) return -EINVAL; if (val < 0 || val > 255) return -EINVAL; if (val) static_branch_enable(&ip4_min_ttl); WRITE_ONCE(inet->min_ttl, val); return 0; } err = 0; if (needs_rtnl) rtnl_lock(); sockopt_lock_sock(sk); switch (optname) { case IP_OPTIONS: { struct ip_options_rcu *old, *opt = NULL; if (optlen > 40) goto e_inval; err = ip_options_get(sock_net(sk), &opt, optval, optlen); if (err) break; old = rcu_dereference_protected(inet->inet_opt, lockdep_sock_is_held(sk)); if (inet_test_bit(IS_ICSK, sk)) { struct inet_connection_sock *icsk = inet_csk(sk); #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == PF_INET || (!((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && inet->inet_daddr != LOOPBACK4_IPV6)) { #endif if (old) icsk->icsk_ext_hdr_len -= old->opt.optlen; if (opt) icsk->icsk_ext_hdr_len += opt->opt.optlen; icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); #if IS_ENABLED(CONFIG_IPV6) } #endif } rcu_assign_pointer(inet->inet_opt, opt); if (old) kfree_rcu(old, rcu); break; } case IP_CHECKSUM: if (val) { if (!(inet_test_bit(CHECKSUM, sk))) { inet_inc_convert_csum(sk); inet_set_bit(CHECKSUM, sk); } } else { if (inet_test_bit(CHECKSUM, sk)) { inet_dec_convert_csum(sk); inet_clear_bit(CHECKSUM, sk); } } break; case IP_TOS: /* This sets both TOS and Precedence */ __ip_sock_set_tos(sk, val); break; case IP_MTU_DISCOVER: if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT) goto e_inval; inet->pmtudisc = val; break; case IP_MULTICAST_TTL: if (sk->sk_type == SOCK_STREAM) goto e_inval; if (optlen < 1) goto e_inval; if (val == -1) val = 1; if (val < 0 || val > 255) goto e_inval; inet->mc_ttl = val; break; case IP_UNICAST_IF: { struct net_device *dev = NULL; int ifindex; int midx; if (optlen != sizeof(int)) goto e_inval; ifindex = (__force int)ntohl((__force __be32)val); if (ifindex == 0) { inet->uc_index = 0; err = 0; break; } dev = dev_get_by_index(sock_net(sk), ifindex); err = -EADDRNOTAVAIL; if (!dev) break; midx = l3mdev_master_ifindex(dev); dev_put(dev); err = -EINVAL; if (sk->sk_bound_dev_if && midx != sk->sk_bound_dev_if) break; inet->uc_index = ifindex; err = 0; break; } case IP_MULTICAST_IF: { struct ip_mreqn mreq; struct net_device *dev = NULL; int midx; if (sk->sk_type == SOCK_STREAM) goto e_inval; /* * Check the arguments are allowable */ if (optlen < sizeof(struct in_addr)) goto e_inval; err = -EFAULT; if (optlen >= sizeof(struct ip_mreqn)) { if (copy_from_sockptr(&mreq, optval, sizeof(mreq))) break; } else { memset(&mreq, 0, sizeof(mreq)); if (optlen >= sizeof(struct ip_mreq)) { if (copy_from_sockptr(&mreq, optval, sizeof(struct ip_mreq))) break; } else if (optlen >= sizeof(struct in_addr)) { if (copy_from_sockptr(&mreq.imr_address, optval, sizeof(struct in_addr))) break; } } if (!mreq.imr_ifindex) { if (mreq.imr_address.s_addr == htonl(INADDR_ANY)) { inet->mc_index = 0; inet->mc_addr = 0; err = 0; break; } dev = ip_dev_find(sock_net(sk), mreq.imr_address.s_addr); if (dev) mreq.imr_ifindex = dev->ifindex; } else dev = dev_get_by_index(sock_net(sk), mreq.imr_ifindex); err = -EADDRNOTAVAIL; if (!dev) break; midx = l3mdev_master_ifindex(dev); dev_put(dev); err = -EINVAL; if (sk->sk_bound_dev_if && mreq.imr_ifindex != sk->sk_bound_dev_if && midx != sk->sk_bound_dev_if) break; inet->mc_index = mreq.imr_ifindex; inet->mc_addr = mreq.imr_address.s_addr; err = 0; break; } case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: { struct ip_mreqn mreq; err = -EPROTO; if (inet_test_bit(IS_ICSK, sk)) break; if (optlen < sizeof(struct ip_mreq)) goto e_inval; err = -EFAULT; if (optlen >= sizeof(struct ip_mreqn)) { if (copy_from_sockptr(&mreq, optval, sizeof(mreq))) break; } else { memset(&mreq, 0, sizeof(mreq)); if (copy_from_sockptr(&mreq, optval, sizeof(struct ip_mreq))) break; } if (optname == IP_ADD_MEMBERSHIP) err = ip_mc_join_group(sk, &mreq); else err = ip_mc_leave_group(sk, &mreq); break; } case IP_MSFILTER: { struct ip_msfilter *msf; if (optlen < IP_MSFILTER_SIZE(0)) goto e_inval; if (optlen > READ_ONCE(sysctl_optmem_max)) { err = -ENOBUFS; break; } msf = memdup_sockptr(optval, optlen); if (IS_ERR(msf)) { err = PTR_ERR(msf); break; } /* numsrc >= (1G-4) overflow in 32 bits */ if (msf->imsf_numsrc >= 0x3ffffffcU || msf->imsf_numsrc > READ_ONCE(net->ipv4.sysctl_igmp_max_msf)) { kfree(msf); err = -ENOBUFS; break; } if (IP_MSFILTER_SIZE(msf->imsf_numsrc) > optlen) { kfree(msf); err = -EINVAL; break; } err = ip_mc_msfilter(sk, msf, 0); kfree(msf); break; } case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: case IP_ADD_SOURCE_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: { struct ip_mreq_source mreqs; int omode, add; if (optlen != sizeof(struct ip_mreq_source)) goto e_inval; if (copy_from_sockptr(&mreqs, optval, sizeof(mreqs))) { err = -EFAULT; break; } if (optname == IP_BLOCK_SOURCE) { omode = MCAST_EXCLUDE; add = 1; } else if (optname == IP_UNBLOCK_SOURCE) { omode = MCAST_EXCLUDE; add = 0; } else if (optname == IP_ADD_SOURCE_MEMBERSHIP) { struct ip_mreqn mreq; mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr; mreq.imr_address.s_addr = mreqs.imr_interface; mreq.imr_ifindex = 0; err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE); if (err && err != -EADDRINUSE) break; omode = MCAST_INCLUDE; add = 1; } else /* IP_DROP_SOURCE_MEMBERSHIP */ { omode = MCAST_INCLUDE; add = 0; } err = ip_mc_source(add, omode, sk, &mreqs, 0); break; } case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: if (in_compat_syscall()) err = compat_ip_mcast_join_leave(sk, optname, optval, optlen); else err = ip_mcast_join_leave(sk, optname, optval, optlen); break; case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: err = do_mcast_group_source(sk, optname, optval, optlen); break; case MCAST_MSFILTER: if (in_compat_syscall()) err = compat_ip_set_mcast_msfilter(sk, optval, optlen); else err = ip_set_mcast_msfilter(sk, optval, optlen); break; case IP_IPSEC_POLICY: case IP_XFRM_POLICY: err = -EPERM; if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) break; err = xfrm_user_policy(sk, optname, optval, optlen); break; case IP_LOCAL_PORT_RANGE: { const __u16 lo = val; const __u16 hi = val >> 16; if (optlen != sizeof(__u32)) goto e_inval; if (lo != 0 && hi != 0 && lo > hi) goto e_inval; inet->local_port_range.lo = lo; inet->local_port_range.hi = hi; break; } default: err = -ENOPROTOOPT; break; } sockopt_release_sock(sk); if (needs_rtnl) rtnl_unlock(); return err; e_inval: sockopt_release_sock(sk); if (needs_rtnl) rtnl_unlock(); return -EINVAL; } /** * ipv4_pktinfo_prepare - transfer some info from rtable to skb * @sk: socket * @skb: buffer * * To support IP_CMSG_PKTINFO option, we store rt_iif and specific * destination in skb->cb[] before dst drop. * This way, receiver doesn't make cache line misses to read rtable. */ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb) { struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb); bool prepare = inet_test_bit(PKTINFO, sk) || ipv6_sk_rxinfo(sk); if (prepare && skb_rtable(skb)) { /* skb->cb is overloaded: prior to this point it is IP{6}CB * which has interface index (iif) as the first member of the * underlying inet{6}_skb_parm struct. This code then overlays * PKTINFO_SKB_CB and in_pktinfo also has iif as the first * element so the iif is picked up from the prior IPCB. If iif * is the loopback interface, then return the sending interface * (e.g., process binds socket to eth0 for Tx which is * redirected to loopback in the rtable/dst). */ struct rtable *rt = skb_rtable(skb); bool l3slave = ipv4_l3mdev_skb(IPCB(skb)->flags); if (pktinfo->ipi_ifindex == LOOPBACK_IFINDEX) pktinfo->ipi_ifindex = inet_iif(skb); else if (l3slave && rt && rt->rt_iif) pktinfo->ipi_ifindex = rt->rt_iif; pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb); } else { pktinfo->ipi_ifindex = 0; pktinfo->ipi_spec_dst.s_addr = 0; } skb_dst_drop(skb); } int ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { int err; if (level != SOL_IP) return -ENOPROTOOPT; err = do_ip_setsockopt(sk, level, optname, optval, optlen); #if IS_ENABLED(CONFIG_BPFILTER_UMH) if (optname >= BPFILTER_IPT_SO_SET_REPLACE && optname < BPFILTER_IPT_SET_MAX) err = bpfilter_ip_set_sockopt(sk, optname, optval, optlen); #endif #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IP_HDRINCL && optname != IP_IPSEC_POLICY && optname != IP_XFRM_POLICY && !ip_mroute_opt(optname)) err = nf_setsockopt(sk, PF_INET, optname, optval, optlen); #endif return err; } EXPORT_SYMBOL(ip_setsockopt); /* * Get the options. Note for future reference. The GET of IP options gets * the _received_ ones. The set sets the _sent_ ones. */ static bool getsockopt_needs_rtnl(int optname) { switch (optname) { case IP_MSFILTER: case MCAST_MSFILTER: return true; } return false; } static int ip_get_mcast_msfilter(struct sock *sk, sockptr_t optval, sockptr_t optlen, int len) { const int size0 = offsetof(struct group_filter, gf_slist_flex); struct group_filter gsf; int num, gsf_size; int err; if (len < size0) return -EINVAL; if (copy_from_sockptr(&gsf, optval, size0)) return -EFAULT; num = gsf.gf_numsrc; err = ip_mc_gsfget(sk, &gsf, optval, offsetof(struct group_filter, gf_slist_flex)); if (err) return err; if (gsf.gf_numsrc < num) num = gsf.gf_numsrc; gsf_size = GROUP_FILTER_SIZE(num); if (copy_to_sockptr(optlen, &gsf_size, sizeof(int)) || copy_to_sockptr(optval, &gsf, size0)) return -EFAULT; return 0; } static int compat_ip_get_mcast_msfilter(struct sock *sk, sockptr_t optval, sockptr_t optlen, int len) { const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); struct compat_group_filter gf32; struct group_filter gf; int num; int err; if (len < size0) return -EINVAL; if (copy_from_sockptr(&gf32, optval, size0)) return -EFAULT; gf.gf_interface = gf32.gf_interface; gf.gf_fmode = gf32.gf_fmode; num = gf.gf_numsrc = gf32.gf_numsrc; gf.gf_group = gf32.gf_group; err = ip_mc_gsfget(sk, &gf, optval, offsetof(struct compat_group_filter, gf_slist_flex)); if (err) return err; if (gf.gf_numsrc < num) num = gf.gf_numsrc; len = GROUP_FILTER_SIZE(num) - (sizeof(gf) - sizeof(gf32)); if (copy_to_sockptr(optlen, &len, sizeof(int)) || copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_fmode), &gf.gf_fmode, sizeof(gf.gf_fmode)) || copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_numsrc), &gf.gf_numsrc, sizeof(gf.gf_numsrc))) return -EFAULT; return 0; } int do_ip_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen) { struct inet_sock *inet = inet_sk(sk); bool needs_rtnl = getsockopt_needs_rtnl(optname); int val, err = 0; int len; if (level != SOL_IP) return -EOPNOTSUPP; if (ip_mroute_opt(optname)) return ip_mroute_getsockopt(sk, optname, optval, optlen); if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len < 0) return -EINVAL; /* Handle options that can be read without locking the socket. */ switch (optname) { case IP_PKTINFO: val = inet_test_bit(PKTINFO, sk); goto copyval; case IP_RECVTTL: val = inet_test_bit(TTL, sk); goto copyval; case IP_RECVTOS: val = inet_test_bit(TOS, sk); goto copyval; case IP_RECVOPTS: val = inet_test_bit(RECVOPTS, sk); goto copyval; case IP_RETOPTS: val = inet_test_bit(RETOPTS, sk); goto copyval; case IP_PASSSEC: val = inet_test_bit(PASSSEC, sk); goto copyval; case IP_RECVORIGDSTADDR: val = inet_test_bit(ORIGDSTADDR, sk); goto copyval; case IP_CHECKSUM: val = inet_test_bit(CHECKSUM, sk); goto copyval; case IP_RECVFRAGSIZE: val = inet_test_bit(RECVFRAGSIZE, sk); goto copyval; case IP_RECVERR: val = inet_test_bit(RECVERR, sk); goto copyval; case IP_RECVERR_RFC4884: val = inet_test_bit(RECVERR_RFC4884, sk); goto copyval; case IP_FREEBIND: val = inet_test_bit(FREEBIND, sk); goto copyval; case IP_HDRINCL: val = inet_test_bit(HDRINCL, sk); goto copyval; case IP_MULTICAST_LOOP: val = inet_test_bit(MC_LOOP, sk); goto copyval; case IP_MULTICAST_ALL: val = inet_test_bit(MC_ALL, sk); goto copyval; case IP_TRANSPARENT: val = inet_test_bit(TRANSPARENT, sk); goto copyval; case IP_NODEFRAG: val = inet_test_bit(NODEFRAG, sk); goto copyval; case IP_BIND_ADDRESS_NO_PORT: val = inet_test_bit(BIND_ADDRESS_NO_PORT, sk); goto copyval; case IP_TTL: val = READ_ONCE(inet->uc_ttl); if (val < 0) val = READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_default_ttl); goto copyval; case IP_MINTTL: val = READ_ONCE(inet->min_ttl); goto copyval; } if (needs_rtnl) rtnl_lock(); sockopt_lock_sock(sk); switch (optname) { case IP_OPTIONS: { unsigned char optbuf[sizeof(struct ip_options)+40]; struct ip_options *opt = (struct ip_options *)optbuf; struct ip_options_rcu *inet_opt; inet_opt = rcu_dereference_protected(inet->inet_opt, lockdep_sock_is_held(sk)); opt->optlen = 0; if (inet_opt) memcpy(optbuf, &inet_opt->opt, sizeof(struct ip_options) + inet_opt->opt.optlen); sockopt_release_sock(sk); if (opt->optlen == 0) { len = 0; return copy_to_sockptr(optlen, &len, sizeof(int)); } ip_options_undo(opt); len = min_t(unsigned int, len, opt->optlen); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, opt->__data, len)) return -EFAULT; return 0; } case IP_TOS: val = inet->tos; break; case IP_MTU_DISCOVER: val = inet->pmtudisc; break; case IP_MTU: { struct dst_entry *dst; val = 0; dst = sk_dst_get(sk); if (dst) { val = dst_mtu(dst); dst_release(dst); } if (!val) { sockopt_release_sock(sk); return -ENOTCONN; } break; } case IP_MULTICAST_TTL: val = inet->mc_ttl; break; case IP_UNICAST_IF: val = (__force int)htonl((__u32) inet->uc_index); break; case IP_MULTICAST_IF: { struct in_addr addr; len = min_t(unsigned int, len, sizeof(struct in_addr)); addr.s_addr = inet->mc_addr; sockopt_release_sock(sk); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &addr, len)) return -EFAULT; return 0; } case IP_MSFILTER: { struct ip_msfilter msf; if (len < IP_MSFILTER_SIZE(0)) { err = -EINVAL; goto out; } if (copy_from_sockptr(&msf, optval, IP_MSFILTER_SIZE(0))) { err = -EFAULT; goto out; } err = ip_mc_msfget(sk, &msf, optval, optlen); goto out; } case MCAST_MSFILTER: if (in_compat_syscall()) err = compat_ip_get_mcast_msfilter(sk, optval, optlen, len); else err = ip_get_mcast_msfilter(sk, optval, optlen, len); goto out; case IP_PKTOPTIONS: { struct msghdr msg; sockopt_release_sock(sk); if (sk->sk_type != SOCK_STREAM) return -ENOPROTOOPT; if (optval.is_kernel) { msg.msg_control_is_user = false; msg.msg_control = optval.kernel; } else { msg.msg_control_is_user = true; msg.msg_control_user = optval.user; } msg.msg_controllen = len; msg.msg_flags = in_compat_syscall() ? MSG_CMSG_COMPAT : 0; if (inet_test_bit(PKTINFO, sk)) { struct in_pktinfo info; info.ipi_addr.s_addr = inet->inet_rcv_saddr; info.ipi_spec_dst.s_addr = inet->inet_rcv_saddr; info.ipi_ifindex = inet->mc_index; put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info); } if (inet_test_bit(TTL, sk)) { int hlim = inet->mc_ttl; put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim); } if (inet_test_bit(TOS, sk)) { int tos = inet->rcv_tos; put_cmsg(&msg, SOL_IP, IP_TOS, sizeof(tos), &tos); } len -= msg.msg_controllen; return copy_to_sockptr(optlen, &len, sizeof(int)); } case IP_LOCAL_PORT_RANGE: val = inet->local_port_range.hi << 16 | inet->local_port_range.lo; break; case IP_PROTOCOL: val = inet_sk(sk)->inet_num; break; default: sockopt_release_sock(sk); return -ENOPROTOOPT; } sockopt_release_sock(sk); copyval: if (len < sizeof(int) && len > 0 && val >= 0 && val <= 255) { unsigned char ucval = (unsigned char)val; len = 1; if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &ucval, 1)) return -EFAULT; } else { len = min_t(unsigned int, sizeof(int), len); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &val, len)) return -EFAULT; } return 0; out: sockopt_release_sock(sk); if (needs_rtnl) rtnl_unlock(); return err; } int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { int err; err = do_ip_getsockopt(sk, level, optname, USER_SOCKPTR(optval), USER_SOCKPTR(optlen)); #if IS_ENABLED(CONFIG_BPFILTER_UMH) if (optname >= BPFILTER_IPT_SO_GET_INFO && optname < BPFILTER_IPT_GET_MAX) err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen); #endif #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS && !ip_mroute_opt(optname)) { int len; if (get_user(len, optlen)) return -EFAULT; err = nf_getsockopt(sk, PF_INET, optname, optval, &len); if (err >= 0) err = put_user(len, optlen); return err; } #endif return err; } EXPORT_SYMBOL(ip_getsockopt);
12 12 12 101 101 101 101 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 // SPDX-License-Identifier: GPL-2.0 /* Multipath TCP * * Copyright (c) 2019, Intel Corporation. */ #define pr_fmt(fmt) "MPTCP: " fmt #include <linux/kernel.h> #include <net/tcp.h> #include <net/mptcp.h> #include "protocol.h" #include "mib.h" /* path manager command handlers */ int mptcp_pm_announce_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool echo) { u8 add_addr = READ_ONCE(msk->pm.addr_signal); pr_debug("msk=%p, local_id=%d, echo=%d", msk, addr->id, echo); lockdep_assert_held(&msk->pm.lock); if (add_addr & (echo ? BIT(MPTCP_ADD_ADDR_ECHO) : BIT(MPTCP_ADD_ADDR_SIGNAL))) { MPTCP_INC_STATS(sock_net((struct sock *)msk), echo ? MPTCP_MIB_ECHOADDTXDROP : MPTCP_MIB_ADDADDRTXDROP); return -EINVAL; } if (echo) { msk->pm.remote = *addr; add_addr |= BIT(MPTCP_ADD_ADDR_ECHO); } else { msk->pm.local = *addr; add_addr |= BIT(MPTCP_ADD_ADDR_SIGNAL); } WRITE_ONCE(msk->pm.addr_signal, add_addr); return 0; } int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list) { u8 rm_addr = READ_ONCE(msk->pm.addr_signal); pr_debug("msk=%p, rm_list_nr=%d", msk, rm_list->nr); if (rm_addr) { MPTCP_ADD_STATS(sock_net((struct sock *)msk), MPTCP_MIB_RMADDRTXDROP, rm_list->nr); return -EINVAL; } msk->pm.rm_list_tx = *rm_list; rm_addr |= BIT(MPTCP_RM_ADDR_SIGNAL); WRITE_ONCE(msk->pm.addr_signal, rm_addr); mptcp_pm_nl_addr_send_ack(msk); return 0; } int mptcp_pm_remove_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list) { pr_debug("msk=%p, rm_list_nr=%d", msk, rm_list->nr); spin_lock_bh(&msk->pm.lock); mptcp_pm_nl_rm_subflow_received(msk, rm_list); spin_unlock_bh(&msk->pm.lock); return 0; } /* path manager event handlers */ void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side) { struct mptcp_pm_data *pm = &msk->pm; pr_debug("msk=%p, token=%u side=%d", msk, msk->token, server_side); WRITE_ONCE(pm->server_side, server_side); mptcp_event(MPTCP_EVENT_CREATED, msk, ssk, GFP_ATOMIC); } bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) { struct mptcp_pm_data *pm = &msk->pm; unsigned int subflows_max; int ret = 0; if (mptcp_pm_is_userspace(msk)) { if (mptcp_userspace_pm_active(msk)) { spin_lock_bh(&pm->lock); pm->subflows++; spin_unlock_bh(&pm->lock); return true; } return false; } subflows_max = mptcp_pm_get_subflows_max(msk); pr_debug("msk=%p subflows=%d max=%d allow=%d", msk, pm->subflows, subflows_max, READ_ONCE(pm->accept_subflow)); /* try to avoid acquiring the lock below */ if (!READ_ONCE(pm->accept_subflow)) return false; spin_lock_bh(&pm->lock); if (READ_ONCE(pm->accept_subflow)) { ret = pm->subflows < subflows_max; if (ret && ++pm->subflows == subflows_max) WRITE_ONCE(pm->accept_subflow, false); } spin_unlock_bh(&pm->lock); return ret; } /* return true if the new status bit is currently cleared, that is, this event * can be server, eventually by an already scheduled work */ static bool mptcp_pm_schedule_work(struct mptcp_sock *msk, enum mptcp_pm_status new_status) { pr_debug("msk=%p status=%x new=%lx", msk, msk->pm.status, BIT(new_status)); if (msk->pm.status & BIT(new_status)) return false; msk->pm.status |= BIT(new_status); mptcp_schedule_work((struct sock *)msk); return true; } void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk) { struct mptcp_pm_data *pm = &msk->pm; bool announce = false; pr_debug("msk=%p", msk); spin_lock_bh(&pm->lock); /* mptcp_pm_fully_established() can be invoked by multiple * racing paths - accept() and check_fully_established() * be sure to serve this event only once. */ if (READ_ONCE(pm->work_pending) && !(msk->pm.status & BIT(MPTCP_PM_ALREADY_ESTABLISHED))) mptcp_pm_schedule_work(msk, MPTCP_PM_ESTABLISHED); if ((msk->pm.status & BIT(MPTCP_PM_ALREADY_ESTABLISHED)) == 0) announce = true; msk->pm.status |= BIT(MPTCP_PM_ALREADY_ESTABLISHED); spin_unlock_bh(&pm->lock); if (announce) mptcp_event(MPTCP_EVENT_ESTABLISHED, msk, ssk, GFP_ATOMIC); } void mptcp_pm_connection_closed(struct mptcp_sock *msk) { pr_debug("msk=%p", msk); } void mptcp_pm_subflow_established(struct mptcp_sock *msk) { struct mptcp_pm_data *pm = &msk->pm; pr_debug("msk=%p", msk); if (!READ_ONCE(pm->work_pending)) return; spin_lock_bh(&pm->lock); if (READ_ONCE(pm->work_pending)) mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED); spin_unlock_bh(&pm->lock); } void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, const struct sock *ssk, const struct mptcp_subflow_context *subflow) { struct mptcp_pm_data *pm = &msk->pm; bool update_subflows; update_subflows = subflow->request_join || subflow->mp_join; if (mptcp_pm_is_userspace(msk)) { if (update_subflows) { spin_lock_bh(&pm->lock); pm->subflows--; spin_unlock_bh(&pm->lock); } return; } if (!READ_ONCE(pm->work_pending) && !update_subflows) return; spin_lock_bh(&pm->lock); if (update_subflows) __mptcp_pm_close_subflow(msk); /* Even if this subflow is not really established, tell the PM to try * to pick the next ones, if possible. */ if (mptcp_pm_nl_check_work_pending(msk)) mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED); spin_unlock_bh(&pm->lock); } void mptcp_pm_add_addr_received(const struct sock *ssk, const struct mptcp_addr_info *addr) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct mptcp_pm_data *pm = &msk->pm; pr_debug("msk=%p remote_id=%d accept=%d", msk, addr->id, READ_ONCE(pm->accept_addr)); mptcp_event_addr_announced(ssk, addr); spin_lock_bh(&pm->lock); if (mptcp_pm_is_userspace(msk)) { if (mptcp_userspace_pm_active(msk)) { mptcp_pm_announce_addr(msk, addr, true); mptcp_pm_add_addr_send_ack(msk); } else { __MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP); } } else if (!READ_ONCE(pm->accept_addr)) { mptcp_pm_announce_addr(msk, addr, true); mptcp_pm_add_addr_send_ack(msk); } else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) { pm->remote = *addr; } else { __MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP); } spin_unlock_bh(&pm->lock); } void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk, const struct mptcp_addr_info *addr) { struct mptcp_pm_data *pm = &msk->pm; pr_debug("msk=%p", msk); spin_lock_bh(&pm->lock); if (mptcp_lookup_anno_list_by_saddr(msk, addr) && READ_ONCE(pm->work_pending)) mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED); spin_unlock_bh(&pm->lock); } void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk) { if (!mptcp_pm_should_add_signal(msk)) return; mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_SEND_ACK); } void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list) { struct mptcp_pm_data *pm = &msk->pm; u8 i; pr_debug("msk=%p remote_ids_nr=%d", msk, rm_list->nr); for (i = 0; i < rm_list->nr; i++) mptcp_event_addr_removed(msk, rm_list->ids[i]); spin_lock_bh(&pm->lock); if (mptcp_pm_schedule_work(msk, MPTCP_PM_RM_ADDR_RECEIVED)) pm->rm_list_rx = *rm_list; else __MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_RMADDRDROP); spin_unlock_bh(&pm->lock); } void mptcp_pm_mp_prio_received(struct sock *ssk, u8 bkup) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = subflow->conn; struct mptcp_sock *msk; pr_debug("subflow->backup=%d, bkup=%d\n", subflow->backup, bkup); msk = mptcp_sk(sk); if (subflow->backup != bkup) subflow->backup = bkup; mptcp_event(MPTCP_EVENT_SUB_PRIORITY, msk, ssk, GFP_ATOMIC); } void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); pr_debug("fail_seq=%llu", fail_seq); if (!READ_ONCE(msk->allow_infinite_fallback)) return; if (!subflow->fail_tout) { pr_debug("send MP_FAIL response and infinite map"); subflow->send_mp_fail = 1; subflow->send_infinite_map = 1; tcp_send_ack(sk); } else { pr_debug("MP_FAIL response received"); WRITE_ONCE(subflow->fail_tout, 0); } } /* path manager helpers */ bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb, unsigned int opt_size, unsigned int remaining, struct mptcp_addr_info *addr, bool *echo, bool *drop_other_suboptions) { int ret = false; u8 add_addr; u8 family; bool port; spin_lock_bh(&msk->pm.lock); /* double check after the lock is acquired */ if (!mptcp_pm_should_add_signal(msk)) goto out_unlock; /* always drop every other options for pure ack ADD_ADDR; this is a * plain dup-ack from TCP perspective. The other MPTCP-relevant info, * if any, will be carried by the 'original' TCP ack */ if (skb && skb_is_tcp_pure_ack(skb)) { remaining += opt_size; *drop_other_suboptions = true; } *echo = mptcp_pm_should_add_signal_echo(msk); port = !!(*echo ? msk->pm.remote.port : msk->pm.local.port); family = *echo ? msk->pm.remote.family : msk->pm.local.family; if (remaining < mptcp_add_addr_len(family, *echo, port)) goto out_unlock; if (*echo) { *addr = msk->pm.remote; add_addr = msk->pm.addr_signal & ~BIT(MPTCP_ADD_ADDR_ECHO); } else { *addr = msk->pm.local; add_addr = msk->pm.addr_signal & ~BIT(MPTCP_ADD_ADDR_SIGNAL); } WRITE_ONCE(msk->pm.addr_signal, add_addr); ret = true; out_unlock: spin_unlock_bh(&msk->pm.lock); return ret; } bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, struct mptcp_rm_list *rm_list) { int ret = false, len; u8 rm_addr; spin_lock_bh(&msk->pm.lock); /* double check after the lock is acquired */ if (!mptcp_pm_should_rm_signal(msk)) goto out_unlock; rm_addr = msk->pm.addr_signal & ~BIT(MPTCP_RM_ADDR_SIGNAL); len = mptcp_rm_addr_len(&msk->pm.rm_list_tx); if (len < 0) { WRITE_ONCE(msk->pm.addr_signal, rm_addr); goto out_unlock; } if (remaining < len) goto out_unlock; *rm_list = msk->pm.rm_list_tx; WRITE_ONCE(msk->pm.addr_signal, rm_addr); ret = true; out_unlock: spin_unlock_bh(&msk->pm.lock); return ret; } int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) { struct mptcp_addr_info skc_local; struct mptcp_addr_info msk_local; if (WARN_ON_ONCE(!msk)) return -1; /* The 0 ID mapping is defined by the first subflow, copied into the msk * addr */ mptcp_local_address((struct sock_common *)msk, &msk_local); mptcp_local_address((struct sock_common *)skc, &skc_local); if (mptcp_addresses_equal(&msk_local, &skc_local, false)) return 0; if (mptcp_pm_is_userspace(msk)) return mptcp_userspace_pm_get_local_id(msk, &skc_local); return mptcp_pm_nl_get_local_id(msk, &skc_local); } int mptcp_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id, u8 *flags, int *ifindex) { *flags = 0; *ifindex = 0; if (!id) return 0; if (mptcp_pm_is_userspace(msk)) return mptcp_userspace_pm_get_flags_and_ifindex_by_id(msk, id, flags, ifindex); return mptcp_pm_nl_get_flags_and_ifindex_by_id(msk, id, flags, ifindex); } int mptcp_pm_set_flags(struct net *net, struct nlattr *token, struct mptcp_pm_addr_entry *loc, struct mptcp_pm_addr_entry *rem, u8 bkup) { if (token) return mptcp_userspace_pm_set_flags(net, token, loc, rem, bkup); return mptcp_pm_nl_set_flags(net, loc, bkup); } void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); u32 rcv_tstamp = READ_ONCE(tcp_sk(ssk)->rcv_tstamp); /* keep track of rtx periods with no progress */ if (!subflow->stale_count) { subflow->stale_rcv_tstamp = rcv_tstamp; subflow->stale_count++; } else if (subflow->stale_rcv_tstamp == rcv_tstamp) { if (subflow->stale_count < U8_MAX) subflow->stale_count++; mptcp_pm_nl_subflow_chk_stale(msk, ssk); } else { subflow->stale_count = 0; mptcp_subflow_set_active(subflow); } } /* if sk is ipv4 or ipv6_only allows only same-family local and remote addresses, * otherwise allow any matching local/remote pair */ bool mptcp_pm_addr_families_match(const struct sock *sk, const struct mptcp_addr_info *loc, const struct mptcp_addr_info *rem) { bool mptcp_is_v4 = sk->sk_family == AF_INET; #if IS_ENABLED(CONFIG_MPTCP_IPV6) bool loc_is_v4 = loc->family == AF_INET || ipv6_addr_v4mapped(&loc->addr6); bool rem_is_v4 = rem->family == AF_INET || ipv6_addr_v4mapped(&rem->addr6); if (mptcp_is_v4) return loc_is_v4 && rem_is_v4; if (ipv6_only_sock(sk)) return !loc_is_v4 && !rem_is_v4; return loc_is_v4 == rem_is_v4; #else return mptcp_is_v4 && loc->family == AF_INET && rem->family == AF_INET; #endif } void mptcp_pm_data_reset(struct mptcp_sock *msk) { u8 pm_type = mptcp_get_pm_type(sock_net((struct sock *)msk)); struct mptcp_pm_data *pm = &msk->pm; pm->add_addr_signaled = 0; pm->add_addr_accepted = 0; pm->local_addr_used = 0; pm->subflows = 0; pm->rm_list_tx.nr = 0; pm->rm_list_rx.nr = 0; WRITE_ONCE(pm->pm_type, pm_type); if (pm_type == MPTCP_PM_TYPE_KERNEL) { bool subflows_allowed = !!mptcp_pm_get_subflows_max(msk); /* pm->work_pending must be only be set to 'true' when * pm->pm_type is set to MPTCP_PM_TYPE_KERNEL */ WRITE_ONCE(pm->work_pending, (!!mptcp_pm_get_local_addr_max(msk) && subflows_allowed) || !!mptcp_pm_get_add_addr_signal_max(msk)); WRITE_ONCE(pm->accept_addr, !!mptcp_pm_get_add_addr_accept_max(msk) && subflows_allowed); WRITE_ONCE(pm->accept_subflow, subflows_allowed); } else { WRITE_ONCE(pm->work_pending, 0); WRITE_ONCE(pm->accept_addr, 0); WRITE_ONCE(pm->accept_subflow, 0); } WRITE_ONCE(pm->addr_signal, 0); WRITE_ONCE(pm->remote_deny_join_id0, false); pm->status = 0; bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); } void mptcp_pm_data_init(struct mptcp_sock *msk) { spin_lock_init(&msk->pm.lock); INIT_LIST_HEAD(&msk->pm.anno_list); INIT_LIST_HEAD(&msk->pm.userspace_pm_local_addr_list); mptcp_pm_data_reset(msk); } void __init mptcp_pm_init(void) { mptcp_pm_nl_init(); }
19864 19856 19868 19865 19877 19867 19860 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs */ #include <linux/sched/debug.h> #include <linux/kallsyms.h> #include <linux/kprobes.h> #include <linux/uaccess.h> #include <linux/hardirq.h> #include <linux/kdebug.h> #include <linux/export.h> #include <linux/ptrace.h> #include <linux/kexec.h> #include <linux/sysfs.h> #include <linux/bug.h> #include <linux/nmi.h> #include <asm/cpu_entry_area.h> #include <asm/stacktrace.h> static const char * const exception_stack_names[] = { [ ESTACK_DF ] = "#DF", [ ESTACK_NMI ] = "NMI", [ ESTACK_DB ] = "#DB", [ ESTACK_MCE ] = "#MC", [ ESTACK_VC ] = "#VC", [ ESTACK_VC2 ] = "#VC2", }; const char *stack_type_name(enum stack_type type) { BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); if (type == STACK_TYPE_TASK) return "TASK"; if (type == STACK_TYPE_IRQ) return "IRQ"; if (type == STACK_TYPE_SOFTIRQ) return "SOFTIRQ"; if (type == STACK_TYPE_ENTRY) { /* * On 64-bit, we have a generic entry stack that we * use for all the kernel entry points, including * SYSENTER. */ return "ENTRY_TRAMPOLINE"; } if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST) return exception_stack_names[type - STACK_TYPE_EXCEPTION]; return NULL; } /** * struct estack_pages - Page descriptor for exception stacks * @offs: Offset from the start of the exception stack area * @size: Size of the exception stack * @type: Type to store in the stack_info struct */ struct estack_pages { u32 offs; u16 size; u16 type; }; #define EPAGERANGE(st) \ [PFN_DOWN(CEA_ESTACK_OFFS(st)) ... \ PFN_DOWN(CEA_ESTACK_OFFS(st) + CEA_ESTACK_SIZE(st) - 1)] = { \ .offs = CEA_ESTACK_OFFS(st), \ .size = CEA_ESTACK_SIZE(st), \ .type = STACK_TYPE_EXCEPTION + ESTACK_ ##st, } /* * Array of exception stack page descriptors. If the stack is larger than * PAGE_SIZE, all pages covering a particular stack will have the same * info. The guard pages including the not mapped DB2 stack are zeroed * out. */ static const struct estack_pages estack_pages[CEA_ESTACK_PAGES] ____cacheline_aligned = { EPAGERANGE(DF), EPAGERANGE(NMI), EPAGERANGE(DB), EPAGERANGE(MCE), EPAGERANGE(VC), EPAGERANGE(VC2), }; static __always_inline bool in_exception_stack(unsigned long *stack, struct stack_info *info) { unsigned long begin, end, stk = (unsigned long)stack; const struct estack_pages *ep; struct pt_regs *regs; unsigned int k; BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); begin = (unsigned long)__this_cpu_read(cea_exception_stacks); /* * Handle the case where stack trace is collected _before_ * cea_exception_stacks had been initialized. */ if (!begin) return false; end = begin + sizeof(struct cea_exception_stacks); /* Bail if @stack is outside the exception stack area. */ if (stk < begin || stk >= end) return false; /* Calc page offset from start of exception stacks */ k = (stk - begin) >> PAGE_SHIFT; /* Lookup the page descriptor */ ep = &estack_pages[k]; /* Guard page? */ if (!ep->size) return false; begin += (unsigned long)ep->offs; end = begin + (unsigned long)ep->size; regs = (struct pt_regs *)end - 1; info->type = ep->type; info->begin = (unsigned long *)begin; info->end = (unsigned long *)end; info->next_sp = (unsigned long *)regs->sp; return true; } static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info *info) { unsigned long *end = (unsigned long *)this_cpu_read(pcpu_hot.hardirq_stack_ptr); unsigned long *begin; /* * @end points directly to the top most stack entry to avoid a -8 * adjustment in the stack switch hotpath. Adjust it back before * calculating @begin. */ end++; begin = end - (IRQ_STACK_SIZE / sizeof(long)); /* * Due to the switching logic RSP can never be == @end because the * final operation is 'popq %rsp' which means after that RSP points * to the original stack and not to @end. */ if (stack < begin || stack >= end) return false; info->type = STACK_TYPE_IRQ; info->begin = begin; info->end = end; /* * The next stack pointer is stored at the top of the irq stack * before switching to the irq stack. Actual stack entries are all * below that. */ info->next_sp = (unsigned long *)*(end - 1); return true; } bool noinstr get_stack_info_noinstr(unsigned long *stack, struct task_struct *task, struct stack_info *info) { if (in_task_stack(stack, task, info)) return true; if (task != current) return false; if (in_exception_stack(stack, info)) return true; if (in_irq_stack(stack, info)) return true; if (in_entry_stack(stack, info)) return true; return false; } int get_stack_info(unsigned long *stack, struct task_struct *task, struct stack_info *info, unsigned long *visit_mask) { task = task ? : current; if (!stack) goto unknown; if (!get_stack_info_noinstr(stack, task, info)) goto unknown; /* * Make sure we don't iterate through any given stack more than once. * If it comes up a second time then there's something wrong going on: * just break out and report an unknown stack type. */ if (visit_mask) { if (*visit_mask & (1UL << info->type)) { if (task == current) printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type); goto unknown; } *visit_mask |= 1UL << info->type; } return 0; unknown: info->type = STACK_TYPE_UNKNOWN; return -EINVAL; }
10 10 10 10 10 10 10 10 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 // SPDX-License-Identifier: GPL-2.0-only /* * IBSS mode implementation * Copyright 2003-2008, Jouni Malinen <j@w1.fi> * Copyright 2004, Instant802 Networks, Inc. * Copyright 2005, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2009, Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright(c) 2016 Intel Deutschland GmbH * Copyright(c) 2018-2023 Intel Corporation */ #include <linux/delay.h> #include <linux/slab.h> #include <linux/if_ether.h> #include <linux/skbuff.h> #include <linux/if_arp.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "driver-ops.h" #include "rate.h" #define IEEE80211_SCAN_INTERVAL (2 * HZ) #define IEEE80211_IBSS_JOIN_TIMEOUT (7 * HZ) #define IEEE80211_IBSS_MERGE_INTERVAL (30 * HZ) #define IEEE80211_IBSS_INACTIVITY_LIMIT (60 * HZ) #define IEEE80211_IBSS_RSN_INACTIVITY_LIMIT (10 * HZ) #define IEEE80211_IBSS_MAX_STA_ENTRIES 128 static struct beacon_data * ieee80211_ibss_build_presp(struct ieee80211_sub_if_data *sdata, const int beacon_int, const u32 basic_rates, const u16 capability, u64 tsf, struct cfg80211_chan_def *chandef, bool *have_higher_than_11mbit, struct cfg80211_csa_settings *csa_settings) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct ieee80211_local *local = sdata->local; int rates_n = 0, i, ri; struct ieee80211_mgmt *mgmt; u8 *pos; struct ieee80211_supported_band *sband; u32 rate_flags, rates = 0, rates_added = 0; struct beacon_data *presp; int frame_len; int shift; /* Build IBSS probe response */ frame_len = sizeof(struct ieee80211_hdr_3addr) + 12 /* struct ieee80211_mgmt.u.beacon */ + 2 + IEEE80211_MAX_SSID_LEN /* max SSID */ + 2 + 8 /* max Supported Rates */ + 3 /* max DS params */ + 4 /* IBSS params */ + 5 /* Channel Switch Announcement */ + 2 + (IEEE80211_MAX_SUPP_RATES - 8) + 2 + sizeof(struct ieee80211_ht_cap) + 2 + sizeof(struct ieee80211_ht_operation) + 2 + sizeof(struct ieee80211_vht_cap) + 2 + sizeof(struct ieee80211_vht_operation) + ifibss->ie_len; presp = kzalloc(sizeof(*presp) + frame_len, GFP_KERNEL); if (!presp) return NULL; presp->head = (void *)(presp + 1); mgmt = (void *) presp->head; mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_RESP); eth_broadcast_addr(mgmt->da); memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); memcpy(mgmt->bssid, ifibss->bssid, ETH_ALEN); mgmt->u.beacon.beacon_int = cpu_to_le16(beacon_int); mgmt->u.beacon.timestamp = cpu_to_le64(tsf); mgmt->u.beacon.capab_info = cpu_to_le16(capability); pos = (u8 *)mgmt + offsetof(struct ieee80211_mgmt, u.beacon.variable); *pos++ = WLAN_EID_SSID; *pos++ = ifibss->ssid_len; memcpy(pos, ifibss->ssid, ifibss->ssid_len); pos += ifibss->ssid_len; sband = local->hw.wiphy->bands[chandef->chan->band]; rate_flags = ieee80211_chandef_rate_flags(chandef); shift = ieee80211_chandef_get_shift(chandef); rates_n = 0; if (have_higher_than_11mbit) *have_higher_than_11mbit = false; for (i = 0; i < sband->n_bitrates; i++) { if ((rate_flags & sband->bitrates[i].flags) != rate_flags) continue; if (sband->bitrates[i].bitrate > 110 && have_higher_than_11mbit) *have_higher_than_11mbit = true; rates |= BIT(i); rates_n++; } *pos++ = WLAN_EID_SUPP_RATES; *pos++ = min_t(int, 8, rates_n); for (ri = 0; ri < sband->n_bitrates; ri++) { int rate = DIV_ROUND_UP(sband->bitrates[ri].bitrate, 5 * (1 << shift)); u8 basic = 0; if (!(rates & BIT(ri))) continue; if (basic_rates & BIT(ri)) basic = 0x80; *pos++ = basic | (u8) rate; if (++rates_added == 8) { ri++; /* continue at next rate for EXT_SUPP_RATES */ break; } } if (sband->band == NL80211_BAND_2GHZ) { *pos++ = WLAN_EID_DS_PARAMS; *pos++ = 1; *pos++ = ieee80211_frequency_to_channel( chandef->chan->center_freq); } *pos++ = WLAN_EID_IBSS_PARAMS; *pos++ = 2; /* FIX: set ATIM window based on scan results */ *pos++ = 0; *pos++ = 0; if (csa_settings) { *pos++ = WLAN_EID_CHANNEL_SWITCH; *pos++ = 3; *pos++ = csa_settings->block_tx ? 1 : 0; *pos++ = ieee80211_frequency_to_channel( csa_settings->chandef.chan->center_freq); presp->cntdwn_counter_offsets[0] = (pos - presp->head); *pos++ = csa_settings->count; presp->cntdwn_current_counter = csa_settings->count; } /* put the remaining rates in WLAN_EID_EXT_SUPP_RATES */ if (rates_n > 8) { *pos++ = WLAN_EID_EXT_SUPP_RATES; *pos++ = rates_n - 8; for (; ri < sband->n_bitrates; ri++) { int rate = DIV_ROUND_UP(sband->bitrates[ri].bitrate, 5 * (1 << shift)); u8 basic = 0; if (!(rates & BIT(ri))) continue; if (basic_rates & BIT(ri)) basic = 0x80; *pos++ = basic | (u8) rate; } } if (ifibss->ie_len) { memcpy(pos, ifibss->ie, ifibss->ie_len); pos += ifibss->ie_len; } /* add HT capability and information IEs */ if (chandef->width != NL80211_CHAN_WIDTH_20_NOHT && chandef->width != NL80211_CHAN_WIDTH_5 && chandef->width != NL80211_CHAN_WIDTH_10 && sband->ht_cap.ht_supported) { struct ieee80211_sta_ht_cap ht_cap; memcpy(&ht_cap, &sband->ht_cap, sizeof(ht_cap)); ieee80211_apply_htcap_overrides(sdata, &ht_cap); pos = ieee80211_ie_build_ht_cap(pos, &ht_cap, ht_cap.cap); /* * Note: According to 802.11n-2009 9.13.3.1, HT Protection * field and RIFS Mode are reserved in IBSS mode, therefore * keep them at 0 */ pos = ieee80211_ie_build_ht_oper(pos, &sband->ht_cap, chandef, 0, false); /* add VHT capability and information IEs */ if (chandef->width != NL80211_CHAN_WIDTH_20 && chandef->width != NL80211_CHAN_WIDTH_40 && sband->vht_cap.vht_supported) { pos = ieee80211_ie_build_vht_cap(pos, &sband->vht_cap, sband->vht_cap.cap); pos = ieee80211_ie_build_vht_oper(pos, &sband->vht_cap, chandef); } } if (local->hw.queues >= IEEE80211_NUM_ACS) pos = ieee80211_add_wmm_info_ie(pos, 0); /* U-APSD not in use */ presp->head_len = pos - presp->head; if (WARN_ON(presp->head_len > frame_len)) goto error; return presp; error: kfree(presp); return NULL; } static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, const u8 *bssid, const int beacon_int, struct cfg80211_chan_def *req_chandef, const u32 basic_rates, const u16 capability, u64 tsf, bool creator) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct ieee80211_local *local = sdata->local; struct ieee80211_mgmt *mgmt; struct cfg80211_bss *bss; u64 bss_change; struct cfg80211_chan_def chandef; struct ieee80211_channel *chan; struct beacon_data *presp; struct cfg80211_inform_bss bss_meta = {}; bool have_higher_than_11mbit; bool radar_required; int err; sdata_assert_lock(sdata); /* Reset own TSF to allow time synchronization work. */ drv_reset_tsf(local, sdata); if (!ether_addr_equal(ifibss->bssid, bssid)) sta_info_flush(sdata); /* if merging, indicate to driver that we leave the old IBSS */ if (sdata->vif.cfg.ibss_joined) { sdata->vif.cfg.ibss_joined = false; sdata->vif.cfg.ibss_creator = false; sdata->vif.bss_conf.enable_beacon = false; netif_carrier_off(sdata->dev); ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_IBSS | BSS_CHANGED_BEACON_ENABLED); drv_leave_ibss(local, sdata); } presp = sdata_dereference(ifibss->presp, sdata); RCU_INIT_POINTER(ifibss->presp, NULL); if (presp) kfree_rcu(presp, rcu_head); /* make a copy of the chandef, it could be modified below. */ chandef = *req_chandef; chan = chandef.chan; if (!cfg80211_reg_can_beacon(local->hw.wiphy, &chandef, NL80211_IFTYPE_ADHOC)) { if (chandef.width == NL80211_CHAN_WIDTH_5 || chandef.width == NL80211_CHAN_WIDTH_10 || chandef.width == NL80211_CHAN_WIDTH_20_NOHT || chandef.width == NL80211_CHAN_WIDTH_20) { sdata_info(sdata, "Failed to join IBSS, beacons forbidden\n"); return; } chandef.width = NL80211_CHAN_WIDTH_20; chandef.center_freq1 = chan->center_freq; /* check again for downgraded chandef */ if (!cfg80211_reg_can_beacon(local->hw.wiphy, &chandef, NL80211_IFTYPE_ADHOC)) { sdata_info(sdata, "Failed to join IBSS, beacons forbidden\n"); return; } } err = cfg80211_chandef_dfs_required(sdata->local->hw.wiphy, &chandef, NL80211_IFTYPE_ADHOC); if (err < 0) { sdata_info(sdata, "Failed to join IBSS, invalid chandef\n"); return; } if (err > 0 && !ifibss->userspace_handles_dfs) { sdata_info(sdata, "Failed to join IBSS, DFS channel without control program\n"); return; } radar_required = err; mutex_lock(&local->mtx); if (ieee80211_link_use_channel(&sdata->deflink, &chandef, ifibss->fixed_channel ? IEEE80211_CHANCTX_SHARED : IEEE80211_CHANCTX_EXCLUSIVE)) { sdata_info(sdata, "Failed to join IBSS, no channel context\n"); mutex_unlock(&local->mtx); return; } sdata->deflink.radar_required = radar_required; mutex_unlock(&local->mtx); memcpy(ifibss->bssid, bssid, ETH_ALEN); presp = ieee80211_ibss_build_presp(sdata, beacon_int, basic_rates, capability, tsf, &chandef, &have_higher_than_11mbit, NULL); if (!presp) return; rcu_assign_pointer(ifibss->presp, presp); mgmt = (void *)presp->head; sdata->vif.bss_conf.enable_beacon = true; sdata->vif.bss_conf.beacon_int = beacon_int; sdata->vif.bss_conf.basic_rates = basic_rates; sdata->vif.cfg.ssid_len = ifibss->ssid_len; memcpy(sdata->vif.cfg.ssid, ifibss->ssid, ifibss->ssid_len); bss_change = BSS_CHANGED_BEACON_INT; bss_change |= ieee80211_reset_erp_info(sdata); bss_change |= BSS_CHANGED_BSSID; bss_change |= BSS_CHANGED_BEACON; bss_change |= BSS_CHANGED_BEACON_ENABLED; bss_change |= BSS_CHANGED_BASIC_RATES; bss_change |= BSS_CHANGED_HT; bss_change |= BSS_CHANGED_IBSS; bss_change |= BSS_CHANGED_SSID; /* * In 5 GHz/802.11a, we can always use short slot time. * (IEEE 802.11-2012 18.3.8.7) * * In 2.4GHz, we must always use long slots in IBSS for compatibility * reasons. * (IEEE 802.11-2012 19.4.5) * * HT follows these specifications (IEEE 802.11-2012 20.3.18) */ sdata->vif.bss_conf.use_short_slot = chan->band == NL80211_BAND_5GHZ; bss_change |= BSS_CHANGED_ERP_SLOT; /* cf. IEEE 802.11 9.2.12 */ sdata->deflink.operating_11g_mode = chan->band == NL80211_BAND_2GHZ && have_higher_than_11mbit; ieee80211_set_wmm_default(&sdata->deflink, true, false); sdata->vif.cfg.ibss_joined = true; sdata->vif.cfg.ibss_creator = creator; err = drv_join_ibss(local, sdata); if (err) { sdata->vif.cfg.ibss_joined = false; sdata->vif.cfg.ibss_creator = false; sdata->vif.bss_conf.enable_beacon = false; sdata->vif.cfg.ssid_len = 0; RCU_INIT_POINTER(ifibss->presp, NULL); kfree_rcu(presp, rcu_head); mutex_lock(&local->mtx); ieee80211_link_release_channel(&sdata->deflink); mutex_unlock(&local->mtx); sdata_info(sdata, "Failed to join IBSS, driver failure: %d\n", err); return; } ieee80211_bss_info_change_notify(sdata, bss_change); ifibss->state = IEEE80211_IBSS_MLME_JOINED; mod_timer(&ifibss->timer, round_jiffies(jiffies + IEEE80211_IBSS_MERGE_INTERVAL)); bss_meta.chan = chan; bss_meta.scan_width = cfg80211_chandef_to_scan_width(&chandef); bss = cfg80211_inform_bss_frame_data(local->hw.wiphy, &bss_meta, mgmt, presp->head_len, GFP_KERNEL); cfg80211_put_bss(local->hw.wiphy, bss); netif_carrier_on(sdata->dev); cfg80211_ibss_joined(sdata->dev, ifibss->bssid, chan, GFP_KERNEL); } static void ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, struct ieee80211_bss *bss) { struct cfg80211_bss *cbss = container_of((void *)bss, struct cfg80211_bss, priv); struct ieee80211_supported_band *sband; struct cfg80211_chan_def chandef; u32 basic_rates; int i, j; u16 beacon_int = cbss->beacon_interval; const struct cfg80211_bss_ies *ies; enum nl80211_channel_type chan_type; u64 tsf; u32 rate_flags; int shift; sdata_assert_lock(sdata); if (beacon_int < 10) beacon_int = 10; switch (sdata->u.ibss.chandef.width) { case NL80211_CHAN_WIDTH_20_NOHT: case NL80211_CHAN_WIDTH_20: case NL80211_CHAN_WIDTH_40: chan_type = cfg80211_get_chandef_type(&sdata->u.ibss.chandef); cfg80211_chandef_create(&chandef, cbss->channel, chan_type); break; case NL80211_CHAN_WIDTH_5: case NL80211_CHAN_WIDTH_10: cfg80211_chandef_create(&chandef, cbss->channel, NL80211_CHAN_NO_HT); chandef.width = sdata->u.ibss.chandef.width; break; case NL80211_CHAN_WIDTH_80: case NL80211_CHAN_WIDTH_80P80: case NL80211_CHAN_WIDTH_160: chandef = sdata->u.ibss.chandef; chandef.chan = cbss->channel; break; default: /* fall back to 20 MHz for unsupported modes */ cfg80211_chandef_create(&chandef, cbss->channel, NL80211_CHAN_NO_HT); break; } sband = sdata->local->hw.wiphy->bands[cbss->channel->band]; rate_flags = ieee80211_chandef_rate_flags(&sdata->u.ibss.chandef); shift = ieee80211_vif_get_shift(&sdata->vif); basic_rates = 0; for (i = 0; i < bss->supp_rates_len; i++) { int rate = bss->supp_rates[i] & 0x7f; bool is_basic = !!(bss->supp_rates[i] & 0x80); for (j = 0; j < sband->n_bitrates; j++) { int brate; if ((rate_flags & sband->bitrates[j].flags) != rate_flags) continue; brate = DIV_ROUND_UP(sband->bitrates[j].bitrate, 5 * (1 << shift)); if (brate == rate) { if (is_basic) basic_rates |= BIT(j); break; } } } rcu_read_lock(); ies = rcu_dereference(cbss->ies); tsf = ies->tsf; rcu_read_unlock(); __ieee80211_sta_join_ibss(sdata, cbss->bssid, beacon_int, &chandef, basic_rates, cbss->capability, tsf, false); } int ieee80211_ibss_csa_beacon(struct ieee80211_sub_if_data *sdata, struct cfg80211_csa_settings *csa_settings, u64 *changed) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct beacon_data *presp, *old_presp; struct cfg80211_bss *cbss; const struct cfg80211_bss_ies *ies; u16 capability = WLAN_CAPABILITY_IBSS; u64 tsf; sdata_assert_lock(sdata); if (ifibss->privacy) capability |= WLAN_CAPABILITY_PRIVACY; cbss = cfg80211_get_bss(sdata->local->hw.wiphy, ifibss->chandef.chan, ifibss->bssid, ifibss->ssid, ifibss->ssid_len, IEEE80211_BSS_TYPE_IBSS, IEEE80211_PRIVACY(ifibss->privacy)); if (WARN_ON(!cbss)) return -EINVAL; rcu_read_lock(); ies = rcu_dereference(cbss->ies); tsf = ies->tsf; rcu_read_unlock(); cfg80211_put_bss(sdata->local->hw.wiphy, cbss); old_presp = sdata_dereference(ifibss->presp, sdata); presp = ieee80211_ibss_build_presp(sdata, sdata->vif.bss_conf.beacon_int, sdata->vif.bss_conf.basic_rates, capability, tsf, &ifibss->chandef, NULL, csa_settings); if (!presp) return -ENOMEM; rcu_assign_pointer(ifibss->presp, presp); if (old_presp) kfree_rcu(old_presp, rcu_head); *changed |= BSS_CHANGED_BEACON; return 0; } int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata, u64 *changed) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct cfg80211_bss *cbss; sdata_assert_lock(sdata); /* When not connected/joined, sending CSA doesn't make sense. */ if (ifibss->state != IEEE80211_IBSS_MLME_JOINED) return -ENOLINK; /* update cfg80211 bss information with the new channel */ if (!is_zero_ether_addr(ifibss->bssid)) { cbss = cfg80211_get_bss(sdata->local->hw.wiphy, ifibss->chandef.chan, ifibss->bssid, ifibss->ssid, ifibss->ssid_len, IEEE80211_BSS_TYPE_IBSS, IEEE80211_PRIVACY(ifibss->privacy)); /* XXX: should not really modify cfg80211 data */ if (cbss) { cbss->channel = sdata->deflink.csa_chandef.chan; cfg80211_put_bss(sdata->local->hw.wiphy, cbss); } } ifibss->chandef = sdata->deflink.csa_chandef; /* generate the beacon */ return ieee80211_ibss_csa_beacon(sdata, NULL, changed); } void ieee80211_ibss_stop(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; wiphy_work_cancel(sdata->local->hw.wiphy, &ifibss->csa_connection_drop_work); } static struct sta_info *ieee80211_ibss_finish_sta(struct sta_info *sta) __acquires(RCU) { struct ieee80211_sub_if_data *sdata = sta->sdata; u8 addr[ETH_ALEN]; memcpy(addr, sta->sta.addr, ETH_ALEN); ibss_dbg(sdata, "Adding new IBSS station %pM\n", addr); sta_info_pre_move_state(sta, IEEE80211_STA_AUTH); sta_info_pre_move_state(sta, IEEE80211_STA_ASSOC); /* authorize the station only if the network is not RSN protected. If * not wait for the userspace to authorize it */ if (!sta->sdata->u.ibss.control_port) sta_info_pre_move_state(sta, IEEE80211_STA_AUTHORIZED); rate_control_rate_init(sta); /* If it fails, maybe we raced another insertion? */ if (sta_info_insert_rcu(sta)) return sta_info_get(sdata, addr); return sta; } static struct sta_info * ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata, const u8 *bssid, const u8 *addr, u32 supp_rates) __acquires(RCU) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct ieee80211_local *local = sdata->local; struct sta_info *sta; struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_supported_band *sband; enum nl80211_bss_scan_width scan_width; int band; /* * XXX: Consider removing the least recently used entry and * allow new one to be added. */ if (local->num_sta >= IEEE80211_IBSS_MAX_STA_ENTRIES) { net_info_ratelimited("%s: No room for a new IBSS STA entry %pM\n", sdata->name, addr); rcu_read_lock(); return NULL; } if (ifibss->state == IEEE80211_IBSS_MLME_SEARCH) { rcu_read_lock(); return NULL; } if (!ether_addr_equal(bssid, sdata->u.ibss.bssid)) { rcu_read_lock(); return NULL; } rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (WARN_ON_ONCE(!chanctx_conf)) return NULL; band = chanctx_conf->def.chan->band; scan_width = cfg80211_chandef_to_scan_width(&chanctx_conf->def); rcu_read_unlock(); sta = sta_info_alloc(sdata, addr, GFP_KERNEL); if (!sta) { rcu_read_lock(); return NULL; } /* make sure mandatory rates are always added */ sband = local->hw.wiphy->bands[band]; sta->sta.deflink.supp_rates[band] = supp_rates | ieee80211_mandatory_rates(sband, scan_width); return ieee80211_ibss_finish_sta(sta); } static int ieee80211_sta_active_ibss(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; int active = 0; struct sta_info *sta; sdata_assert_lock(sdata); rcu_read_lock(); list_for_each_entry_rcu(sta, &local->sta_list, list) { unsigned long last_active = ieee80211_sta_last_active(sta); if (sta->sdata == sdata && time_is_after_jiffies(last_active + IEEE80211_IBSS_MERGE_INTERVAL)) { active++; break; } } rcu_read_unlock(); return active; } static void ieee80211_ibss_disconnect(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct ieee80211_local *local = sdata->local; struct cfg80211_bss *cbss; struct beacon_data *presp; struct sta_info *sta; if (!is_zero_ether_addr(ifibss->bssid)) { cbss = cfg80211_get_bss(local->hw.wiphy, ifibss->chandef.chan, ifibss->bssid, ifibss->ssid, ifibss->ssid_len, IEEE80211_BSS_TYPE_IBSS, IEEE80211_PRIVACY(ifibss->privacy)); if (cbss) { cfg80211_unlink_bss(local->hw.wiphy, cbss); cfg80211_put_bss(sdata->local->hw.wiphy, cbss); } } ifibss->state = IEEE80211_IBSS_MLME_SEARCH; sta_info_flush(sdata); spin_lock_bh(&ifibss->incomplete_lock); while (!list_empty(&ifibss->incomplete_stations)) { sta = list_first_entry(&ifibss->incomplete_stations, struct sta_info, list); list_del(&sta->list); spin_unlock_bh(&ifibss->incomplete_lock); sta_info_free(local, sta); spin_lock_bh(&ifibss->incomplete_lock); } spin_unlock_bh(&ifibss->incomplete_lock); netif_carrier_off(sdata->dev); sdata->vif.cfg.ibss_joined = false; sdata->vif.cfg.ibss_creator = false; sdata->vif.bss_conf.enable_beacon = false; sdata->vif.cfg.ssid_len = 0; /* remove beacon */ presp = sdata_dereference(ifibss->presp, sdata); RCU_INIT_POINTER(sdata->u.ibss.presp, NULL); if (presp) kfree_rcu(presp, rcu_head); clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state); ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED | BSS_CHANGED_IBSS); drv_leave_ibss(local, sdata); mutex_lock(&local->mtx); ieee80211_link_release_channel(&sdata->deflink); mutex_unlock(&local->mtx); } static void ieee80211_csa_connection_drop_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, u.ibss.csa_connection_drop_work); sdata_lock(sdata); ieee80211_ibss_disconnect(sdata); synchronize_rcu(); skb_queue_purge(&sdata->skb_queue); /* trigger a scan to find another IBSS network to join */ wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work); sdata_unlock(sdata); } static void ieee80211_ibss_csa_mark_radar(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; int err; /* if the current channel is a DFS channel, mark the channel as * unavailable. */ err = cfg80211_chandef_dfs_required(sdata->local->hw.wiphy, &ifibss->chandef, NL80211_IFTYPE_ADHOC); if (err > 0) cfg80211_radar_event(sdata->local->hw.wiphy, &ifibss->chandef, GFP_ATOMIC); } static bool ieee80211_ibss_process_chanswitch(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, bool beacon) { struct cfg80211_csa_settings params; struct ieee80211_csa_ie csa_ie; struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; enum nl80211_channel_type ch_type; int err; ieee80211_conn_flags_t conn_flags; u32 vht_cap_info = 0; sdata_assert_lock(sdata); conn_flags = IEEE80211_CONN_DISABLE_VHT; switch (ifibss->chandef.width) { case NL80211_CHAN_WIDTH_5: case NL80211_CHAN_WIDTH_10: case NL80211_CHAN_WIDTH_20_NOHT: conn_flags |= IEEE80211_CONN_DISABLE_HT; fallthrough; case NL80211_CHAN_WIDTH_20: conn_flags |= IEEE80211_CONN_DISABLE_40MHZ; break; default: break; } if (elems->vht_cap_elem) vht_cap_info = le32_to_cpu(elems->vht_cap_elem->vht_cap_info); memset(&params, 0, sizeof(params)); err = ieee80211_parse_ch_switch_ie(sdata, elems, ifibss->chandef.chan->band, vht_cap_info, conn_flags, ifibss->bssid, &csa_ie); /* can't switch to destination channel, fail */ if (err < 0) goto disconnect; /* did not contain a CSA */ if (err) return false; /* channel switch is not supported, disconnect */ if (!(sdata->local->hw.wiphy->flags & WIPHY_FLAG_HAS_CHANNEL_SWITCH)) goto disconnect; params.count = csa_ie.count; params.chandef = csa_ie.chandef; switch (ifibss->chandef.width) { case NL80211_CHAN_WIDTH_20_NOHT: case NL80211_CHAN_WIDTH_20: case NL80211_CHAN_WIDTH_40: /* keep our current HT mode (HT20/HT40+/HT40-), even if * another mode has been announced. The mode is not adopted * within the beacon while doing CSA and we should therefore * keep the mode which we announce. */ ch_type = cfg80211_get_chandef_type(&ifibss->chandef); cfg80211_chandef_create(&params.chandef, params.chandef.chan, ch_type); break; case NL80211_CHAN_WIDTH_5: case NL80211_CHAN_WIDTH_10: if (params.chandef.width != ifibss->chandef.width) { sdata_info(sdata, "IBSS %pM received channel switch from incompatible channel width (%d MHz, width:%d, CF1/2: %d/%d MHz), disconnecting\n", ifibss->bssid, params.chandef.chan->center_freq, params.chandef.width, params.chandef.center_freq1, params.chandef.center_freq2); goto disconnect; } break; default: /* should not happen, conn_flags should prevent VHT modes. */ WARN_ON(1); goto disconnect; } if (!cfg80211_reg_can_beacon(sdata->local->hw.wiphy, &params.chandef, NL80211_IFTYPE_ADHOC)) { sdata_info(sdata, "IBSS %pM switches to unsupported channel (%d MHz, width:%d, CF1/2: %d/%d MHz), disconnecting\n", ifibss->bssid, params.chandef.chan->center_freq, params.chandef.width, params.chandef.center_freq1, params.chandef.center_freq2); goto disconnect; } err = cfg80211_chandef_dfs_required(sdata->local->hw.wiphy, &params.chandef, NL80211_IFTYPE_ADHOC); if (err < 0) goto disconnect; if (err > 0 && !ifibss->userspace_handles_dfs) { /* IBSS-DFS only allowed with a control program */ goto disconnect; } params.radar_required = err; if (cfg80211_chandef_identical(&params.chandef, &sdata->vif.bss_conf.chandef)) { ibss_dbg(sdata, "received csa with an identical chandef, ignoring\n"); return true; } /* all checks done, now perform the channel switch. */ ibss_dbg(sdata, "received channel switch announcement to go to channel %d MHz\n", params.chandef.chan->center_freq); params.block_tx = !!csa_ie.mode; if (ieee80211_channel_switch(sdata->local->hw.wiphy, sdata->dev, &params)) goto disconnect; ieee80211_ibss_csa_mark_radar(sdata); return true; disconnect: ibss_dbg(sdata, "Can't handle channel switch, disconnect\n"); wiphy_work_queue(sdata->local->hw.wiphy, &ifibss->csa_connection_drop_work); ieee80211_ibss_csa_mark_radar(sdata); return true; } static void ieee80211_rx_mgmt_spectrum_mgmt(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len, struct ieee80211_rx_status *rx_status, struct ieee802_11_elems *elems) { int required_len; if (len < IEEE80211_MIN_ACTION_SIZE + 1) return; /* CSA is the only action we handle for now */ if (mgmt->u.action.u.measurement.action_code != WLAN_ACTION_SPCT_CHL_SWITCH) return; required_len = IEEE80211_MIN_ACTION_SIZE + sizeof(mgmt->u.action.u.chan_switch); if (len < required_len) return; if (!sdata->vif.bss_conf.csa_active) ieee80211_ibss_process_chanswitch(sdata, elems, false); } static void ieee80211_rx_mgmt_deauth_ibss(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { u16 reason = le16_to_cpu(mgmt->u.deauth.reason_code); if (len < IEEE80211_DEAUTH_FRAME_LEN) return; ibss_dbg(sdata, "RX DeAuth SA=%pM DA=%pM\n", mgmt->sa, mgmt->da); ibss_dbg(sdata, "\tBSSID=%pM (reason: %d)\n", mgmt->bssid, reason); sta_info_destroy_addr(sdata, mgmt->sa); } static void ieee80211_rx_mgmt_auth_ibss(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { u16 auth_alg, auth_transaction; sdata_assert_lock(sdata); if (len < 24 + 6) return; auth_alg = le16_to_cpu(mgmt->u.auth.auth_alg); auth_transaction = le16_to_cpu(mgmt->u.auth.auth_transaction); ibss_dbg(sdata, "RX Auth SA=%pM DA=%pM\n", mgmt->sa, mgmt->da); ibss_dbg(sdata, "\tBSSID=%pM (auth_transaction=%d)\n", mgmt->bssid, auth_transaction); if (auth_alg != WLAN_AUTH_OPEN || auth_transaction != 1) return; /* * IEEE 802.11 standard does not require authentication in IBSS * networks and most implementations do not seem to use it. * However, try to reply to authentication attempts if someone * has actually implemented this. */ ieee80211_send_auth(sdata, 2, WLAN_AUTH_OPEN, 0, NULL, 0, mgmt->sa, sdata->u.ibss.bssid, NULL, 0, 0, 0); } static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len, struct ieee80211_rx_status *rx_status, struct ieee802_11_elems *elems, struct ieee80211_channel *channel) { struct sta_info *sta; enum nl80211_band band = rx_status->band; enum nl80211_bss_scan_width scan_width; struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; bool rates_updated = false; u32 supp_rates = 0; if (sdata->vif.type != NL80211_IFTYPE_ADHOC) return; if (!ether_addr_equal(mgmt->bssid, sdata->u.ibss.bssid)) return; sband = local->hw.wiphy->bands[band]; if (WARN_ON(!sband)) return; rcu_read_lock(); sta = sta_info_get(sdata, mgmt->sa); if (elems->supp_rates) { supp_rates = ieee80211_sta_get_rates(sdata, elems, band, NULL); if (sta) { u32 prev_rates; prev_rates = sta->sta.deflink.supp_rates[band]; /* make sure mandatory rates are always added */ scan_width = NL80211_BSS_CHAN_WIDTH_20; if (rx_status->bw == RATE_INFO_BW_5) scan_width = NL80211_BSS_CHAN_WIDTH_5; else if (rx_status->bw == RATE_INFO_BW_10) scan_width = NL80211_BSS_CHAN_WIDTH_10; sta->sta.deflink.supp_rates[band] = supp_rates | ieee80211_mandatory_rates(sband, scan_width); if (sta->sta.deflink.supp_rates[band] != prev_rates) { ibss_dbg(sdata, "updated supp_rates set for %pM based on beacon/probe_resp (0x%x -> 0x%x)\n", sta->sta.addr, prev_rates, sta->sta.deflink.supp_rates[band]); rates_updated = true; } } else { rcu_read_unlock(); sta = ieee80211_ibss_add_sta(sdata, mgmt->bssid, mgmt->sa, supp_rates); } } if (sta && !sta->sta.wme && (elems->wmm_info || elems->s1g_capab) && local->hw.queues >= IEEE80211_NUM_ACS) { sta->sta.wme = true; ieee80211_check_fast_xmit(sta); } if (sta && elems->ht_operation && elems->ht_cap_elem && sdata->u.ibss.chandef.width != NL80211_CHAN_WIDTH_20_NOHT && sdata->u.ibss.chandef.width != NL80211_CHAN_WIDTH_5 && sdata->u.ibss.chandef.width != NL80211_CHAN_WIDTH_10) { /* we both use HT */ struct ieee80211_ht_cap htcap_ie; struct cfg80211_chan_def chandef; enum ieee80211_sta_rx_bandwidth bw = sta->sta.deflink.bandwidth; cfg80211_chandef_create(&chandef, channel, NL80211_CHAN_NO_HT); ieee80211_chandef_ht_oper(elems->ht_operation, &chandef); memcpy(&htcap_ie, elems->ht_cap_elem, sizeof(htcap_ie)); rates_updated |= ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband, &htcap_ie, &sta->deflink); if (elems->vht_operation && elems->vht_cap_elem && sdata->u.ibss.chandef.width != NL80211_CHAN_WIDTH_20 && sdata->u.ibss.chandef.width != NL80211_CHAN_WIDTH_40) { /* we both use VHT */ struct ieee80211_vht_cap cap_ie; struct ieee80211_sta_vht_cap cap = sta->sta.deflink.vht_cap; u32 vht_cap_info = le32_to_cpu(elems->vht_cap_elem->vht_cap_info); ieee80211_chandef_vht_oper(&local->hw, vht_cap_info, elems->vht_operation, elems->ht_operation, &chandef); memcpy(&cap_ie, elems->vht_cap_elem, sizeof(cap_ie)); ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, &cap_ie, NULL, &sta->deflink); if (memcmp(&cap, &sta->sta.deflink.vht_cap, sizeof(cap))) rates_updated |= true; } if (bw != sta->sta.deflink.bandwidth) rates_updated |= true; if (!cfg80211_chandef_compatible(&sdata->u.ibss.chandef, &chandef)) WARN_ON_ONCE(1); } if (sta && rates_updated) { u32 changed = IEEE80211_RC_SUPP_RATES_CHANGED; u8 rx_nss = sta->sta.deflink.rx_nss; /* Force rx_nss recalculation */ sta->sta.deflink.rx_nss = 0; rate_control_rate_init(sta); if (sta->sta.deflink.rx_nss != rx_nss) changed |= IEEE80211_RC_NSS_CHANGED; drv_sta_rc_update(local, sdata, &sta->sta, changed); } rcu_read_unlock(); } static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len, struct ieee80211_rx_status *rx_status, struct ieee802_11_elems *elems) { struct ieee80211_local *local = sdata->local; struct cfg80211_bss *cbss; struct ieee80211_bss *bss; struct ieee80211_channel *channel; u64 beacon_timestamp, rx_timestamp; u32 supp_rates = 0; enum nl80211_band band = rx_status->band; channel = ieee80211_get_channel(local->hw.wiphy, rx_status->freq); if (!channel) return; ieee80211_update_sta_info(sdata, mgmt, len, rx_status, elems, channel); bss = ieee80211_bss_info_update(local, rx_status, mgmt, len, channel); if (!bss) return; cbss = container_of((void *)bss, struct cfg80211_bss, priv); /* same for beacon and probe response */ beacon_timestamp = le64_to_cpu(mgmt->u.beacon.timestamp); /* check if we need to merge IBSS */ /* not an IBSS */ if (!(cbss->capability & WLAN_CAPABILITY_IBSS)) goto put_bss; /* different channel */ if (sdata->u.ibss.fixed_channel && sdata->u.ibss.chandef.chan != cbss->channel) goto put_bss; /* different SSID */ if (elems->ssid_len != sdata->u.ibss.ssid_len || memcmp(elems->ssid, sdata->u.ibss.ssid, sdata->u.ibss.ssid_len)) goto put_bss; /* process channel switch */ if (sdata->vif.bss_conf.csa_active || ieee80211_ibss_process_chanswitch(sdata, elems, true)) goto put_bss; /* same BSSID */ if (ether_addr_equal(cbss->bssid, sdata->u.ibss.bssid)) goto put_bss; /* we use a fixed BSSID */ if (sdata->u.ibss.fixed_bssid) goto put_bss; if (ieee80211_have_rx_timestamp(rx_status)) { /* time when timestamp field was received */ rx_timestamp = ieee80211_calculate_rx_timestamp(local, rx_status, len + FCS_LEN, 24); } else { /* * second best option: get current TSF * (will return -1 if not supported) */ rx_timestamp = drv_get_tsf(local, sdata); } ibss_dbg(sdata, "RX beacon SA=%pM BSSID=%pM TSF=0x%llx\n", mgmt->sa, mgmt->bssid, (unsigned long long)rx_timestamp); ibss_dbg(sdata, "\tBCN=0x%llx diff=%lld @%lu\n", (unsigned long long)beacon_timestamp, (unsigned long long)(rx_timestamp - beacon_timestamp), jiffies); if (beacon_timestamp > rx_timestamp) { ibss_dbg(sdata, "beacon TSF higher than local TSF - IBSS merge with BSSID %pM\n", mgmt->bssid); ieee80211_sta_join_ibss(sdata, bss); supp_rates = ieee80211_sta_get_rates(sdata, elems, band, NULL); ieee80211_ibss_add_sta(sdata, mgmt->bssid, mgmt->sa, supp_rates); rcu_read_unlock(); } put_bss: ieee80211_rx_bss_put(local, bss); } void ieee80211_ibss_rx_no_sta(struct ieee80211_sub_if_data *sdata, const u8 *bssid, const u8 *addr, u32 supp_rates) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct ieee80211_local *local = sdata->local; struct sta_info *sta; struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_supported_band *sband; enum nl80211_bss_scan_width scan_width; int band; /* * XXX: Consider removing the least recently used entry and * allow new one to be added. */ if (local->num_sta >= IEEE80211_IBSS_MAX_STA_ENTRIES) { net_info_ratelimited("%s: No room for a new IBSS STA entry %pM\n", sdata->name, addr); return; } if (ifibss->state == IEEE80211_IBSS_MLME_SEARCH) return; if (!ether_addr_equal(bssid, sdata->u.ibss.bssid)) return; rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (WARN_ON_ONCE(!chanctx_conf)) { rcu_read_unlock(); return; } band = chanctx_conf->def.chan->band; scan_width = cfg80211_chandef_to_scan_width(&chanctx_conf->def); rcu_read_unlock(); sta = sta_info_alloc(sdata, addr, GFP_ATOMIC); if (!sta) return; /* make sure mandatory rates are always added */ sband = local->hw.wiphy->bands[band]; sta->sta.deflink.supp_rates[band] = supp_rates | ieee80211_mandatory_rates(sband, scan_width); spin_lock(&ifibss->incomplete_lock); list_add(&sta->list, &ifibss->incomplete_stations); spin_unlock(&ifibss->incomplete_lock); wiphy_work_queue(local->hw.wiphy, &sdata->work); } static void ieee80211_ibss_sta_expire(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct ieee80211_local *local = sdata->local; struct sta_info *sta, *tmp; unsigned long exp_time = IEEE80211_IBSS_INACTIVITY_LIMIT; unsigned long exp_rsn = IEEE80211_IBSS_RSN_INACTIVITY_LIMIT; mutex_lock(&local->sta_mtx); list_for_each_entry_safe(sta, tmp, &local->sta_list, list) { unsigned long last_active = ieee80211_sta_last_active(sta); if (sdata != sta->sdata) continue; if (time_is_before_jiffies(last_active + exp_time) || (time_is_before_jiffies(last_active + exp_rsn) && sta->sta_state != IEEE80211_STA_AUTHORIZED)) { u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; sta_dbg(sta->sdata, "expiring inactive %sSTA %pM\n", sta->sta_state != IEEE80211_STA_AUTHORIZED ? "not authorized " : "", sta->sta.addr); ieee80211_send_deauth_disassoc(sdata, sta->sta.addr, ifibss->bssid, IEEE80211_STYPE_DEAUTH, WLAN_REASON_DEAUTH_LEAVING, true, frame_buf); WARN_ON(__sta_info_destroy(sta)); } } mutex_unlock(&local->sta_mtx); } /* * This function is called with state == IEEE80211_IBSS_MLME_JOINED */ static void ieee80211_sta_merge_ibss(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; enum nl80211_bss_scan_width scan_width; sdata_assert_lock(sdata); mod_timer(&ifibss->timer, round_jiffies(jiffies + IEEE80211_IBSS_MERGE_INTERVAL)); ieee80211_ibss_sta_expire(sdata); if (time_before(jiffies, ifibss->last_scan_completed + IEEE80211_IBSS_MERGE_INTERVAL)) return; if (ieee80211_sta_active_ibss(sdata)) return; if (ifibss->fixed_channel) return; sdata_info(sdata, "No active IBSS STAs - trying to scan for other IBSS networks with same SSID (merge)\n"); scan_width = cfg80211_chandef_to_scan_width(&ifibss->chandef); ieee80211_request_ibss_scan(sdata, ifibss->ssid, ifibss->ssid_len, NULL, 0, scan_width); } static void ieee80211_sta_create_ibss(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; u8 bssid[ETH_ALEN]; u16 capability; int i; sdata_assert_lock(sdata); if (ifibss->fixed_bssid) { memcpy(bssid, ifibss->bssid, ETH_ALEN); } else { /* Generate random, not broadcast, locally administered BSSID. Mix in * own MAC address to make sure that devices that do not have proper * random number generator get different BSSID. */ get_random_bytes(bssid, ETH_ALEN); for (i = 0; i < ETH_ALEN; i++) bssid[i] ^= sdata->vif.addr[i]; bssid[0] &= ~0x01; bssid[0] |= 0x02; } sdata_info(sdata, "Creating new IBSS network, BSSID %pM\n", bssid); capability = WLAN_CAPABILITY_IBSS; if (ifibss->privacy) capability |= WLAN_CAPABILITY_PRIVACY; __ieee80211_sta_join_ibss(sdata, bssid, sdata->vif.bss_conf.beacon_int, &ifibss->chandef, ifibss->basic_rates, capability, 0, true); } static unsigned int ibss_setup_channels(struct wiphy *wiphy, struct ieee80211_channel **channels, unsigned int channels_max, u32 center_freq, u32 width) { struct ieee80211_channel *chan = NULL; unsigned int n_chan = 0; u32 start_freq, end_freq, freq; if (width <= 20) { start_freq = center_freq; end_freq = center_freq; } else { start_freq = center_freq - width / 2 + 10; end_freq = center_freq + width / 2 - 10; } for (freq = start_freq; freq <= end_freq; freq += 20) { chan = ieee80211_get_channel(wiphy, freq); if (!chan) continue; if (n_chan >= channels_max) return n_chan; channels[n_chan] = chan; n_chan++; } return n_chan; } static unsigned int ieee80211_ibss_setup_scan_channels(struct wiphy *wiphy, const struct cfg80211_chan_def *chandef, struct ieee80211_channel **channels, unsigned int channels_max) { unsigned int n_chan = 0; u32 width, cf1, cf2 = 0; switch (chandef->width) { case NL80211_CHAN_WIDTH_40: width = 40; break; case NL80211_CHAN_WIDTH_80P80: cf2 = chandef->center_freq2; fallthrough; case NL80211_CHAN_WIDTH_80: width = 80; break; case NL80211_CHAN_WIDTH_160: width = 160; break; default: width = 20; break; } cf1 = chandef->center_freq1; n_chan = ibss_setup_channels(wiphy, channels, channels_max, cf1, width); if (cf2) n_chan += ibss_setup_channels(wiphy, &channels[n_chan], channels_max - n_chan, cf2, width); return n_chan; } /* * This function is called with state == IEEE80211_IBSS_MLME_SEARCH */ static void ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct ieee80211_local *local = sdata->local; struct cfg80211_bss *cbss; struct ieee80211_channel *chan = NULL; const u8 *bssid = NULL; enum nl80211_bss_scan_width scan_width; int active_ibss; sdata_assert_lock(sdata); active_ibss = ieee80211_sta_active_ibss(sdata); ibss_dbg(sdata, "sta_find_ibss (active_ibss=%d)\n", active_ibss); if (active_ibss) return; if (ifibss->fixed_bssid) bssid = ifibss->bssid; if (ifibss->fixed_channel) chan = ifibss->chandef.chan; if (!is_zero_ether_addr(ifibss->bssid)) bssid = ifibss->bssid; cbss = cfg80211_get_bss(local->hw.wiphy, chan, bssid, ifibss->ssid, ifibss->ssid_len, IEEE80211_BSS_TYPE_IBSS, IEEE80211_PRIVACY(ifibss->privacy)); if (cbss) { struct ieee80211_bss *bss; bss = (void *)cbss->priv; ibss_dbg(sdata, "sta_find_ibss: selected %pM current %pM\n", cbss->bssid, ifibss->bssid); sdata_info(sdata, "Selected IBSS BSSID %pM based on configured SSID\n", cbss->bssid); ieee80211_sta_join_ibss(sdata, bss); ieee80211_rx_bss_put(local, bss); return; } /* if a fixed bssid and a fixed freq have been provided create the IBSS * directly and do not waste time scanning */ if (ifibss->fixed_bssid && ifibss->fixed_channel) { sdata_info(sdata, "Created IBSS using preconfigured BSSID %pM\n", bssid); ieee80211_sta_create_ibss(sdata); return; } ibss_dbg(sdata, "sta_find_ibss: did not try to join ibss\n"); /* Selected IBSS not found in current scan results - try to scan */ if (time_after(jiffies, ifibss->last_scan_completed + IEEE80211_SCAN_INTERVAL)) { struct ieee80211_channel *channels[8]; unsigned int num; sdata_info(sdata, "Trigger new scan to find an IBSS to join\n"); scan_width = cfg80211_chandef_to_scan_width(&ifibss->chandef); if (ifibss->fixed_channel) { num = ieee80211_ibss_setup_scan_channels(local->hw.wiphy, &ifibss->chandef, channels, ARRAY_SIZE(channels)); ieee80211_request_ibss_scan(sdata, ifibss->ssid, ifibss->ssid_len, channels, num, scan_width); } else { ieee80211_request_ibss_scan(sdata, ifibss->ssid, ifibss->ssid_len, NULL, 0, scan_width); } } else { int interval = IEEE80211_SCAN_INTERVAL; if (time_after(jiffies, ifibss->ibss_join_req + IEEE80211_IBSS_JOIN_TIMEOUT)) ieee80211_sta_create_ibss(sdata); mod_timer(&ifibss->timer, round_jiffies(jiffies + interval)); } } static void ieee80211_rx_mgmt_probe_req(struct ieee80211_sub_if_data *sdata, struct sk_buff *req) { struct ieee80211_mgmt *mgmt = (void *)req->data; struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct ieee80211_local *local = sdata->local; int tx_last_beacon, len = req->len; struct sk_buff *skb; struct beacon_data *presp; u8 *pos, *end; sdata_assert_lock(sdata); presp = sdata_dereference(ifibss->presp, sdata); if (ifibss->state != IEEE80211_IBSS_MLME_JOINED || len < 24 + 2 || !presp) return; tx_last_beacon = drv_tx_last_beacon(local); ibss_dbg(sdata, "RX ProbeReq SA=%pM DA=%pM\n", mgmt->sa, mgmt->da); ibss_dbg(sdata, "\tBSSID=%pM (tx_last_beacon=%d)\n", mgmt->bssid, tx_last_beacon); if (!tx_last_beacon && is_multicast_ether_addr(mgmt->da)) return; if (!ether_addr_equal(mgmt->bssid, ifibss->bssid) && !is_broadcast_ether_addr(mgmt->bssid)) return; end = ((u8 *) mgmt) + len; pos = mgmt->u.probe_req.variable; if (pos[0] != WLAN_EID_SSID || pos + 2 + pos[1] > end) { ibss_dbg(sdata, "Invalid SSID IE in ProbeReq from %pM\n", mgmt->sa); return; } if (pos[1] != 0 && (pos[1] != ifibss->ssid_len || memcmp(pos + 2, ifibss->ssid, ifibss->ssid_len))) { /* Ignore ProbeReq for foreign SSID */ return; } /* Reply with ProbeResp */ skb = dev_alloc_skb(local->tx_headroom + presp->head_len); if (!skb) return; skb_reserve(skb, local->tx_headroom); skb_put_data(skb, presp->head, presp->head_len); memcpy(((struct ieee80211_mgmt *) skb->data)->da, mgmt->sa, ETH_ALEN); ibss_dbg(sdata, "Sending ProbeResp to %pM\n", mgmt->sa); IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; /* avoid excessive retries for probe request to wildcard SSIDs */ if (pos[1] == 0) IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_NO_ACK; ieee80211_tx_skb(sdata, skb); } static void ieee80211_rx_mgmt_probe_beacon(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len, struct ieee80211_rx_status *rx_status) { size_t baselen; struct ieee802_11_elems *elems; BUILD_BUG_ON(offsetof(typeof(mgmt->u.probe_resp), variable) != offsetof(typeof(mgmt->u.beacon), variable)); /* * either beacon or probe_resp but the variable field is at the * same offset */ baselen = (u8 *) mgmt->u.probe_resp.variable - (u8 *) mgmt; if (baselen > len) return; elems = ieee802_11_parse_elems(mgmt->u.probe_resp.variable, len - baselen, false, NULL); if (elems) { ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, elems); kfree(elems); } } void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { struct ieee80211_rx_status *rx_status; struct ieee80211_mgmt *mgmt; u16 fc; struct ieee802_11_elems *elems; int ies_len; rx_status = IEEE80211_SKB_RXCB(skb); mgmt = (struct ieee80211_mgmt *) skb->data; fc = le16_to_cpu(mgmt->frame_control); sdata_lock(sdata); if (!sdata->u.ibss.ssid_len) goto mgmt_out; /* not ready to merge yet */ switch (fc & IEEE80211_FCTL_STYPE) { case IEEE80211_STYPE_PROBE_REQ: ieee80211_rx_mgmt_probe_req(sdata, skb); break; case IEEE80211_STYPE_PROBE_RESP: case IEEE80211_STYPE_BEACON: ieee80211_rx_mgmt_probe_beacon(sdata, mgmt, skb->len, rx_status); break; case IEEE80211_STYPE_AUTH: ieee80211_rx_mgmt_auth_ibss(sdata, mgmt, skb->len); break; case IEEE80211_STYPE_DEAUTH: ieee80211_rx_mgmt_deauth_ibss(sdata, mgmt, skb->len); break; case IEEE80211_STYPE_ACTION: switch (mgmt->u.action.category) { case WLAN_CATEGORY_SPECTRUM_MGMT: ies_len = skb->len - offsetof(struct ieee80211_mgmt, u.action.u.chan_switch.variable); if (ies_len < 0) break; elems = ieee802_11_parse_elems( mgmt->u.action.u.chan_switch.variable, ies_len, true, NULL); if (elems && !elems->parse_error) ieee80211_rx_mgmt_spectrum_mgmt(sdata, mgmt, skb->len, rx_status, elems); kfree(elems); break; } } mgmt_out: sdata_unlock(sdata); } void ieee80211_ibss_work(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct sta_info *sta; sdata_lock(sdata); /* * Work could be scheduled after scan or similar * when we aren't even joined (or trying) with a * network. */ if (!ifibss->ssid_len) goto out; spin_lock_bh(&ifibss->incomplete_lock); while (!list_empty(&ifibss->incomplete_stations)) { sta = list_first_entry(&ifibss->incomplete_stations, struct sta_info, list); list_del(&sta->list); spin_unlock_bh(&ifibss->incomplete_lock); ieee80211_ibss_finish_sta(sta); rcu_read_unlock(); spin_lock_bh(&ifibss->incomplete_lock); } spin_unlock_bh(&ifibss->incomplete_lock); switch (ifibss->state) { case IEEE80211_IBSS_MLME_SEARCH: ieee80211_sta_find_ibss(sdata); break; case IEEE80211_IBSS_MLME_JOINED: ieee80211_sta_merge_ibss(sdata); break; default: WARN_ON(1); break; } out: sdata_unlock(sdata); } static void ieee80211_ibss_timer(struct timer_list *t) { struct ieee80211_sub_if_data *sdata = from_timer(sdata, t, u.ibss.timer); wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work); } void ieee80211_ibss_setup_sdata(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; timer_setup(&ifibss->timer, ieee80211_ibss_timer, 0); INIT_LIST_HEAD(&ifibss->incomplete_stations); spin_lock_init(&ifibss->incomplete_lock); wiphy_work_init(&ifibss->csa_connection_drop_work, ieee80211_csa_connection_drop_work); } /* scan finished notification */ void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; mutex_lock(&local->iflist_mtx); list_for_each_entry(sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(sdata)) continue; if (sdata->vif.type != NL80211_IFTYPE_ADHOC) continue; sdata->u.ibss.last_scan_completed = jiffies; } mutex_unlock(&local->iflist_mtx); } int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata, struct cfg80211_ibss_params *params) { u64 changed = 0; u32 rate_flags; struct ieee80211_supported_band *sband; enum ieee80211_chanctx_mode chanmode; struct ieee80211_local *local = sdata->local; int radar_detect_width = 0; int i; int ret; if (params->chandef.chan->freq_offset) { /* this may work, but is untested */ return -EOPNOTSUPP; } ret = cfg80211_chandef_dfs_required(local->hw.wiphy, &params->chandef, sdata->wdev.iftype); if (ret < 0) return ret; if (ret > 0) { if (!params->userspace_handles_dfs) return -EINVAL; radar_detect_width = BIT(params->chandef.width); } chanmode = (params->channel_fixed && !ret) ? IEEE80211_CHANCTX_SHARED : IEEE80211_CHANCTX_EXCLUSIVE; mutex_lock(&local->chanctx_mtx); ret = ieee80211_check_combinations(sdata, &params->chandef, chanmode, radar_detect_width); mutex_unlock(&local->chanctx_mtx); if (ret < 0) return ret; if (params->bssid) { memcpy(sdata->u.ibss.bssid, params->bssid, ETH_ALEN); sdata->u.ibss.fixed_bssid = true; } else sdata->u.ibss.fixed_bssid = false; sdata->u.ibss.privacy = params->privacy; sdata->u.ibss.control_port = params->control_port; sdata->u.ibss.userspace_handles_dfs = params->userspace_handles_dfs; sdata->u.ibss.basic_rates = params->basic_rates; sdata->u.ibss.last_scan_completed = jiffies; /* fix basic_rates if channel does not support these rates */ rate_flags = ieee80211_chandef_rate_flags(&params->chandef); sband = local->hw.wiphy->bands[params->chandef.chan->band]; for (i = 0; i < sband->n_bitrates; i++) { if ((rate_flags & sband->bitrates[i].flags) != rate_flags) sdata->u.ibss.basic_rates &= ~BIT(i); } memcpy(sdata->vif.bss_conf.mcast_rate, params->mcast_rate, sizeof(params->mcast_rate)); sdata->vif.bss_conf.beacon_int = params->beacon_interval; sdata->u.ibss.chandef = params->chandef; sdata->u.ibss.fixed_channel = params->channel_fixed; if (params->ie) { sdata->u.ibss.ie = kmemdup(params->ie, params->ie_len, GFP_KERNEL); if (sdata->u.ibss.ie) sdata->u.ibss.ie_len = params->ie_len; } sdata->u.ibss.state = IEEE80211_IBSS_MLME_SEARCH; sdata->u.ibss.ibss_join_req = jiffies; memcpy(sdata->u.ibss.ssid, params->ssid, params->ssid_len); sdata->u.ibss.ssid_len = params->ssid_len; memcpy(&sdata->u.ibss.ht_capa, &params->ht_capa, sizeof(sdata->u.ibss.ht_capa)); memcpy(&sdata->u.ibss.ht_capa_mask, &params->ht_capa_mask, sizeof(sdata->u.ibss.ht_capa_mask)); /* * 802.11n-2009 9.13.3.1: In an IBSS, the HT Protection field is * reserved, but an HT STA shall protect HT transmissions as though * the HT Protection field were set to non-HT mixed mode. * * In an IBSS, the RIFS Mode field of the HT Operation element is * also reserved, but an HT STA shall operate as though this field * were set to 1. */ sdata->vif.bss_conf.ht_operation_mode |= IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED | IEEE80211_HT_PARAM_RIFS_MODE; changed |= BSS_CHANGED_HT | BSS_CHANGED_MCAST_RATE; ieee80211_link_info_change_notify(sdata, &sdata->deflink, changed); sdata->deflink.smps_mode = IEEE80211_SMPS_OFF; sdata->deflink.needed_rx_chains = local->rx_chains; sdata->control_port_over_nl80211 = params->control_port_over_nl80211; wiphy_work_queue(local->hw.wiphy, &sdata->work); return 0; } int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; ieee80211_ibss_disconnect(sdata); ifibss->ssid_len = 0; eth_zero_addr(ifibss->bssid); /* remove beacon */ kfree(sdata->u.ibss.ie); sdata->u.ibss.ie = NULL; sdata->u.ibss.ie_len = 0; /* on the next join, re-program HT parameters */ memset(&ifibss->ht_capa, 0, sizeof(ifibss->ht_capa)); memset(&ifibss->ht_capa_mask, 0, sizeof(ifibss->ht_capa_mask)); synchronize_rcu(); skb_queue_purge(&sdata->skb_queue); del_timer_sync(&sdata->u.ibss.timer); return 0; }
131 131 131 131 131 131 131 131 131 131 131 131 131 131 131 131 131 131 131 131 131 131 131 131 131 131 131 131 131 131 131 131 131 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 // SPDX-License-Identifier: GPL-2.0-or-later /* audit.c -- Auditing support * Gateway between the kernel (e.g., selinux) and the user-space audit daemon. * System-call specific features have moved to auditsc.c * * Copyright 2003-2007 Red Hat Inc., Durham, North Carolina. * All Rights Reserved. * * Written by Rickard E. (Rik) Faith <faith@redhat.com> * * Goals: 1) Integrate fully with Security Modules. * 2) Minimal run-time overhead: * a) Minimal when syscall auditing is disabled (audit_enable=0). * b) Small when syscall auditing is enabled and no audit record * is generated (defer as much work as possible to record * generation time): * i) context is allocated, * ii) names from getname are stored without a copy, and * iii) inode information stored from path_lookup. * 3) Ability to disable syscall auditing at boot time (audit=0). * 4) Usable by other parts of the kernel (if audit_log* is called, * then a syscall record will be generated automatically for the * current syscall). * 5) Netlink interface to user-space. * 6) Support low-overhead kernel-based filtering to minimize the * information that must be passed to user-space. * * Audit userspace, documentation, tests, and bug/issue trackers: * https://github.com/linux-audit */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/file.h> #include <linux/init.h> #include <linux/types.h> #include <linux/atomic.h> #include <linux/mm.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/err.h> #include <linux/kthread.h> #include <linux/kernel.h> #include <linux/syscalls.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/mutex.h> #include <linux/gfp.h> #include <linux/pid.h> #include <linux/audit.h> #include <net/sock.h> #include <net/netlink.h> #include <linux/skbuff.h> #include <linux/security.h> #include <linux/freezer.h> #include <linux/pid_namespace.h> #include <net/netns/generic.h> #include "audit.h" /* No auditing will take place until audit_initialized == AUDIT_INITIALIZED. * (Initialization happens after skb_init is called.) */ #define AUDIT_DISABLED -1 #define AUDIT_UNINITIALIZED 0 #define AUDIT_INITIALIZED 1 static int audit_initialized = AUDIT_UNINITIALIZED; u32 audit_enabled = AUDIT_OFF; bool audit_ever_enabled = !!AUDIT_OFF; EXPORT_SYMBOL_GPL(audit_enabled); /* Default state when kernel boots without any parameters. */ static u32 audit_default = AUDIT_OFF; /* If auditing cannot proceed, audit_failure selects what happens. */ static u32 audit_failure = AUDIT_FAIL_PRINTK; /* private audit network namespace index */ static unsigned int audit_net_id; /** * struct audit_net - audit private network namespace data * @sk: communication socket */ struct audit_net { struct sock *sk; }; /** * struct auditd_connection - kernel/auditd connection state * @pid: auditd PID * @portid: netlink portid * @net: the associated network namespace * @rcu: RCU head * * Description: * This struct is RCU protected; you must either hold the RCU lock for reading * or the associated spinlock for writing. */ struct auditd_connection { struct pid *pid; u32 portid; struct net *net; struct rcu_head rcu; }; static struct auditd_connection __rcu *auditd_conn; static DEFINE_SPINLOCK(auditd_conn_lock); /* If audit_rate_limit is non-zero, limit the rate of sending audit records * to that number per second. This prevents DoS attacks, but results in * audit records being dropped. */ static u32 audit_rate_limit; /* Number of outstanding audit_buffers allowed. * When set to zero, this means unlimited. */ static u32 audit_backlog_limit = 64; #define AUDIT_BACKLOG_WAIT_TIME (60 * HZ) static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; /* The identity of the user shutting down the audit system. */ static kuid_t audit_sig_uid = INVALID_UID; static pid_t audit_sig_pid = -1; static u32 audit_sig_sid; /* Records can be lost in several ways: 0) [suppressed in audit_alloc] 1) out of memory in audit_log_start [kmalloc of struct audit_buffer] 2) out of memory in audit_log_move [alloc_skb] 3) suppressed due to audit_rate_limit 4) suppressed due to audit_backlog_limit */ static atomic_t audit_lost = ATOMIC_INIT(0); /* Monotonically increasing sum of time the kernel has spent * waiting while the backlog limit is exceeded. */ static atomic_t audit_backlog_wait_time_actual = ATOMIC_INIT(0); /* Hash for inode-based rules */ struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; static struct kmem_cache *audit_buffer_cache; /* queue msgs to send via kauditd_task */ static struct sk_buff_head audit_queue; /* queue msgs due to temporary unicast send problems */ static struct sk_buff_head audit_retry_queue; /* queue msgs waiting for new auditd connection */ static struct sk_buff_head audit_hold_queue; /* queue servicing thread */ static struct task_struct *kauditd_task; static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); /* waitqueue for callers who are blocked on the audit backlog */ static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); static struct audit_features af = {.vers = AUDIT_FEATURE_VERSION, .mask = -1, .features = 0, .lock = 0,}; static char *audit_feature_names[2] = { "only_unset_loginuid", "loginuid_immutable", }; /** * struct audit_ctl_mutex - serialize requests from userspace * @lock: the mutex used for locking * @owner: the task which owns the lock * * Description: * This is the lock struct used to ensure we only process userspace requests * in an orderly fashion. We can't simply use a mutex/lock here because we * need to track lock ownership so we don't end up blocking the lock owner in * audit_log_start() or similar. */ static struct audit_ctl_mutex { struct mutex lock; void *owner; } audit_cmd_mutex; /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting * audit records. Since printk uses a 1024 byte buffer, this buffer * should be at least that large. */ #define AUDIT_BUFSIZ 1024 /* The audit_buffer is used when formatting an audit record. The caller * locks briefly to get the record off the freelist or to allocate the * buffer, and locks briefly to send the buffer to the netlink layer or * to place it on a transmit queue. Multiple audit_buffers can be in * use simultaneously. */ struct audit_buffer { struct sk_buff *skb; /* formatted skb ready to send */ struct audit_context *ctx; /* NULL or associated context */ gfp_t gfp_mask; }; struct audit_reply { __u32 portid; struct net *net; struct sk_buff *skb; }; /** * auditd_test_task - Check to see if a given task is an audit daemon * @task: the task to check * * Description: * Return 1 if the task is a registered audit daemon, 0 otherwise. */ int auditd_test_task(struct task_struct *task) { int rc; struct auditd_connection *ac; rcu_read_lock(); ac = rcu_dereference(auditd_conn); rc = (ac && ac->pid == task_tgid(task) ? 1 : 0); rcu_read_unlock(); return rc; } /** * audit_ctl_lock - Take the audit control lock */ void audit_ctl_lock(void) { mutex_lock(&audit_cmd_mutex.lock); audit_cmd_mutex.owner = current; } /** * audit_ctl_unlock - Drop the audit control lock */ void audit_ctl_unlock(void) { audit_cmd_mutex.owner = NULL; mutex_unlock(&audit_cmd_mutex.lock); } /** * audit_ctl_owner_current - Test to see if the current task owns the lock * * Description: * Return true if the current task owns the audit control lock, false if it * doesn't own the lock. */ static bool audit_ctl_owner_current(void) { return (current == audit_cmd_mutex.owner); } /** * auditd_pid_vnr - Return the auditd PID relative to the namespace * * Description: * Returns the PID in relation to the namespace, 0 on failure. */ static pid_t auditd_pid_vnr(void) { pid_t pid; const struct auditd_connection *ac; rcu_read_lock(); ac = rcu_dereference(auditd_conn); if (!ac || !ac->pid) pid = 0; else pid = pid_vnr(ac->pid); rcu_read_unlock(); return pid; } /** * audit_get_sk - Return the audit socket for the given network namespace * @net: the destination network namespace * * Description: * Returns the sock pointer if valid, NULL otherwise. The caller must ensure * that a reference is held for the network namespace while the sock is in use. */ static struct sock *audit_get_sk(const struct net *net) { struct audit_net *aunet; if (!net) return NULL; aunet = net_generic(net, audit_net_id); return aunet->sk; } void audit_panic(const char *message) { switch (audit_failure) { case AUDIT_FAIL_SILENT: break; case AUDIT_FAIL_PRINTK: if (printk_ratelimit()) pr_err("%s\n", message); break; case AUDIT_FAIL_PANIC: panic("audit: %s\n", message); break; } } static inline int audit_rate_check(void) { static unsigned long last_check = 0; static int messages = 0; static DEFINE_SPINLOCK(lock); unsigned long flags; unsigned long now; int retval = 0; if (!audit_rate_limit) return 1; spin_lock_irqsave(&lock, flags); if (++messages < audit_rate_limit) { retval = 1; } else { now = jiffies; if (time_after(now, last_check + HZ)) { last_check = now; messages = 0; retval = 1; } } spin_unlock_irqrestore(&lock, flags); return retval; } /** * audit_log_lost - conditionally log lost audit message event * @message: the message stating reason for lost audit message * * Emit at least 1 message per second, even if audit_rate_check is * throttling. * Always increment the lost messages counter. */ void audit_log_lost(const char *message) { static unsigned long last_msg = 0; static DEFINE_SPINLOCK(lock); unsigned long flags; unsigned long now; int print; atomic_inc(&audit_lost); print = (audit_failure == AUDIT_FAIL_PANIC || !audit_rate_limit); if (!print) { spin_lock_irqsave(&lock, flags); now = jiffies; if (time_after(now, last_msg + HZ)) { print = 1; last_msg = now; } spin_unlock_irqrestore(&lock, flags); } if (print) { if (printk_ratelimit()) pr_warn("audit_lost=%u audit_rate_limit=%u audit_backlog_limit=%u\n", atomic_read(&audit_lost), audit_rate_limit, audit_backlog_limit); audit_panic(message); } } static int audit_log_config_change(char *function_name, u32 new, u32 old, int allow_changes) { struct audit_buffer *ab; int rc = 0; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return rc; audit_log_format(ab, "op=set %s=%u old=%u ", function_name, new, old); audit_log_session_info(ab); rc = audit_log_task_context(ab); if (rc) allow_changes = 0; /* Something weird, deny request */ audit_log_format(ab, " res=%d", allow_changes); audit_log_end(ab); return rc; } static int audit_do_config_change(char *function_name, u32 *to_change, u32 new) { int allow_changes, rc = 0; u32 old = *to_change; /* check if we are locked */ if (audit_enabled == AUDIT_LOCKED) allow_changes = 0; else allow_changes = 1; if (audit_enabled != AUDIT_OFF) { rc = audit_log_config_change(function_name, new, old, allow_changes); if (rc) allow_changes = 0; } /* If we are allowed, make the change */ if (allow_changes == 1) *to_change = new; /* Not allowed, update reason */ else if (rc == 0) rc = -EPERM; return rc; } static int audit_set_rate_limit(u32 limit) { return audit_do_config_change("audit_rate_limit", &audit_rate_limit, limit); } static int audit_set_backlog_limit(u32 limit) { return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, limit); } static int audit_set_backlog_wait_time(u32 timeout) { return audit_do_config_change("audit_backlog_wait_time", &audit_backlog_wait_time, timeout); } static int audit_set_enabled(u32 state) { int rc; if (state > AUDIT_LOCKED) return -EINVAL; rc = audit_do_config_change("audit_enabled", &audit_enabled, state); if (!rc) audit_ever_enabled |= !!state; return rc; } static int audit_set_failure(u32 state) { if (state != AUDIT_FAIL_SILENT && state != AUDIT_FAIL_PRINTK && state != AUDIT_FAIL_PANIC) return -EINVAL; return audit_do_config_change("audit_failure", &audit_failure, state); } /** * auditd_conn_free - RCU helper to release an auditd connection struct * @rcu: RCU head * * Description: * Drop any references inside the auditd connection tracking struct and free * the memory. */ static void auditd_conn_free(struct rcu_head *rcu) { struct auditd_connection *ac; ac = container_of(rcu, struct auditd_connection, rcu); put_pid(ac->pid); put_net(ac->net); kfree(ac); } /** * auditd_set - Set/Reset the auditd connection state * @pid: auditd PID * @portid: auditd netlink portid * @net: auditd network namespace pointer * * Description: * This function will obtain and drop network namespace references as * necessary. Returns zero on success, negative values on failure. */ static int auditd_set(struct pid *pid, u32 portid, struct net *net) { unsigned long flags; struct auditd_connection *ac_old, *ac_new; if (!pid || !net) return -EINVAL; ac_new = kzalloc(sizeof(*ac_new), GFP_KERNEL); if (!ac_new) return -ENOMEM; ac_new->pid = get_pid(pid); ac_new->portid = portid; ac_new->net = get_net(net); spin_lock_irqsave(&auditd_conn_lock, flags); ac_old = rcu_dereference_protected(auditd_conn, lockdep_is_held(&auditd_conn_lock)); rcu_assign_pointer(auditd_conn, ac_new); spin_unlock_irqrestore(&auditd_conn_lock, flags); if (ac_old) call_rcu(&ac_old->rcu, auditd_conn_free); return 0; } /** * kauditd_printk_skb - Print the audit record to the ring buffer * @skb: audit record * * Whatever the reason, this packet may not make it to the auditd connection * so write it via printk so the information isn't completely lost. */ static void kauditd_printk_skb(struct sk_buff *skb) { struct nlmsghdr *nlh = nlmsg_hdr(skb); char *data = nlmsg_data(nlh); if (nlh->nlmsg_type != AUDIT_EOE && printk_ratelimit()) pr_notice("type=%d %s\n", nlh->nlmsg_type, data); } /** * kauditd_rehold_skb - Handle a audit record send failure in the hold queue * @skb: audit record * @error: error code (unused) * * Description: * This should only be used by the kauditd_thread when it fails to flush the * hold queue. */ static void kauditd_rehold_skb(struct sk_buff *skb, __always_unused int error) { /* put the record back in the queue */ skb_queue_tail(&audit_hold_queue, skb); } /** * kauditd_hold_skb - Queue an audit record, waiting for auditd * @skb: audit record * @error: error code * * Description: * Queue the audit record, waiting for an instance of auditd. When this * function is called we haven't given up yet on sending the record, but things * are not looking good. The first thing we want to do is try to write the * record via printk and then see if we want to try and hold on to the record * and queue it, if we have room. If we want to hold on to the record, but we * don't have room, record a record lost message. */ static void kauditd_hold_skb(struct sk_buff *skb, int error) { /* at this point it is uncertain if we will ever send this to auditd so * try to send the message via printk before we go any further */ kauditd_printk_skb(skb); /* can we just silently drop the message? */ if (!audit_default) goto drop; /* the hold queue is only for when the daemon goes away completely, * not -EAGAIN failures; if we are in a -EAGAIN state requeue the * record on the retry queue unless it's full, in which case drop it */ if (error == -EAGAIN) { if (!audit_backlog_limit || skb_queue_len(&audit_retry_queue) < audit_backlog_limit) { skb_queue_tail(&audit_retry_queue, skb); return; } audit_log_lost("kauditd retry queue overflow"); goto drop; } /* if we have room in the hold queue, queue the message */ if (!audit_backlog_limit || skb_queue_len(&audit_hold_queue) < audit_backlog_limit) { skb_queue_tail(&audit_hold_queue, skb); return; } /* we have no other options - drop the message */ audit_log_lost("kauditd hold queue overflow"); drop: kfree_skb(skb); } /** * kauditd_retry_skb - Queue an audit record, attempt to send again to auditd * @skb: audit record * @error: error code (unused) * * Description: * Not as serious as kauditd_hold_skb() as we still have a connected auditd, * but for some reason we are having problems sending it audit records so * queue the given record and attempt to resend. */ static void kauditd_retry_skb(struct sk_buff *skb, __always_unused int error) { if (!audit_backlog_limit || skb_queue_len(&audit_retry_queue) < audit_backlog_limit) { skb_queue_tail(&audit_retry_queue, skb); return; } /* we have to drop the record, send it via printk as a last effort */ kauditd_printk_skb(skb); audit_log_lost("kauditd retry queue overflow"); kfree_skb(skb); } /** * auditd_reset - Disconnect the auditd connection * @ac: auditd connection state * * Description: * Break the auditd/kauditd connection and move all the queued records into the * hold queue in case auditd reconnects. It is important to note that the @ac * pointer should never be dereferenced inside this function as it may be NULL * or invalid, you can only compare the memory address! If @ac is NULL then * the connection will always be reset. */ static void auditd_reset(const struct auditd_connection *ac) { unsigned long flags; struct sk_buff *skb; struct auditd_connection *ac_old; /* if it isn't already broken, break the connection */ spin_lock_irqsave(&auditd_conn_lock, flags); ac_old = rcu_dereference_protected(auditd_conn, lockdep_is_held(&auditd_conn_lock)); if (ac && ac != ac_old) { /* someone already registered a new auditd connection */ spin_unlock_irqrestore(&auditd_conn_lock, flags); return; } rcu_assign_pointer(auditd_conn, NULL); spin_unlock_irqrestore(&auditd_conn_lock, flags); if (ac_old) call_rcu(&ac_old->rcu, auditd_conn_free); /* flush the retry queue to the hold queue, but don't touch the main * queue since we need to process that normally for multicast */ while ((skb = skb_dequeue(&audit_retry_queue))) kauditd_hold_skb(skb, -ECONNREFUSED); } /** * auditd_send_unicast_skb - Send a record via unicast to auditd * @skb: audit record * * Description: * Send a skb to the audit daemon, returns positive/zero values on success and * negative values on failure; in all cases the skb will be consumed by this * function. If the send results in -ECONNREFUSED the connection with auditd * will be reset. This function may sleep so callers should not hold any locks * where this would cause a problem. */ static int auditd_send_unicast_skb(struct sk_buff *skb) { int rc; u32 portid; struct net *net; struct sock *sk; struct auditd_connection *ac; /* NOTE: we can't call netlink_unicast while in the RCU section so * take a reference to the network namespace and grab local * copies of the namespace, the sock, and the portid; the * namespace and sock aren't going to go away while we hold a * reference and if the portid does become invalid after the RCU * section netlink_unicast() should safely return an error */ rcu_read_lock(); ac = rcu_dereference(auditd_conn); if (!ac) { rcu_read_unlock(); kfree_skb(skb); rc = -ECONNREFUSED; goto err; } net = get_net(ac->net); sk = audit_get_sk(net); portid = ac->portid; rcu_read_unlock(); rc = netlink_unicast(sk, skb, portid, 0); put_net(net); if (rc < 0) goto err; return rc; err: if (ac && rc == -ECONNREFUSED) auditd_reset(ac); return rc; } /** * kauditd_send_queue - Helper for kauditd_thread to flush skb queues * @sk: the sending sock * @portid: the netlink destination * @queue: the skb queue to process * @retry_limit: limit on number of netlink unicast failures * @skb_hook: per-skb hook for additional processing * @err_hook: hook called if the skb fails the netlink unicast send * * Description: * Run through the given queue and attempt to send the audit records to auditd, * returns zero on success, negative values on failure. It is up to the caller * to ensure that the @sk is valid for the duration of this function. * */ static int kauditd_send_queue(struct sock *sk, u32 portid, struct sk_buff_head *queue, unsigned int retry_limit, void (*skb_hook)(struct sk_buff *skb), void (*err_hook)(struct sk_buff *skb, int error)) { int rc = 0; struct sk_buff *skb = NULL; struct sk_buff *skb_tail; unsigned int failed = 0; /* NOTE: kauditd_thread takes care of all our locking, we just use * the netlink info passed to us (e.g. sk and portid) */ skb_tail = skb_peek_tail(queue); while ((skb != skb_tail) && (skb = skb_dequeue(queue))) { /* call the skb_hook for each skb we touch */ if (skb_hook) (*skb_hook)(skb); /* can we send to anyone via unicast? */ if (!sk) { if (err_hook) (*err_hook)(skb, -ECONNREFUSED); continue; } retry: /* grab an extra skb reference in case of error */ skb_get(skb); rc = netlink_unicast(sk, skb, portid, 0); if (rc < 0) { /* send failed - try a few times unless fatal error */ if (++failed >= retry_limit || rc == -ECONNREFUSED || rc == -EPERM) { sk = NULL; if (err_hook) (*err_hook)(skb, rc); if (rc == -EAGAIN) rc = 0; /* continue to drain the queue */ continue; } else goto retry; } else { /* skb sent - drop the extra reference and continue */ consume_skb(skb); failed = 0; } } return (rc >= 0 ? 0 : rc); } /* * kauditd_send_multicast_skb - Send a record to any multicast listeners * @skb: audit record * * Description: * Write a multicast message to anyone listening in the initial network * namespace. This function doesn't consume an skb as might be expected since * it has to copy it anyways. */ static void kauditd_send_multicast_skb(struct sk_buff *skb) { struct sk_buff *copy; struct sock *sock = audit_get_sk(&init_net); struct nlmsghdr *nlh; /* NOTE: we are not taking an additional reference for init_net since * we don't have to worry about it going away */ if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG)) return; /* * The seemingly wasteful skb_copy() rather than bumping the refcount * using skb_get() is necessary because non-standard mods are made to * the skb by the original kaudit unicast socket send routine. The * existing auditd daemon assumes this breakage. Fixing this would * require co-ordinating a change in the established protocol between * the kaudit kernel subsystem and the auditd userspace code. There is * no reason for new multicast clients to continue with this * non-compliance. */ copy = skb_copy(skb, GFP_KERNEL); if (!copy) return; nlh = nlmsg_hdr(copy); nlh->nlmsg_len = skb->len; nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL); } /** * kauditd_thread - Worker thread to send audit records to userspace * @dummy: unused */ static int kauditd_thread(void *dummy) { int rc; u32 portid = 0; struct net *net = NULL; struct sock *sk = NULL; struct auditd_connection *ac; #define UNICAST_RETRIES 5 set_freezable(); while (!kthread_should_stop()) { /* NOTE: see the lock comments in auditd_send_unicast_skb() */ rcu_read_lock(); ac = rcu_dereference(auditd_conn); if (!ac) { rcu_read_unlock(); goto main_queue; } net = get_net(ac->net); sk = audit_get_sk(net); portid = ac->portid; rcu_read_unlock(); /* attempt to flush the hold queue */ rc = kauditd_send_queue(sk, portid, &audit_hold_queue, UNICAST_RETRIES, NULL, kauditd_rehold_skb); if (rc < 0) { sk = NULL; auditd_reset(ac); goto main_queue; } /* attempt to flush the retry queue */ rc = kauditd_send_queue(sk, portid, &audit_retry_queue, UNICAST_RETRIES, NULL, kauditd_hold_skb); if (rc < 0) { sk = NULL; auditd_reset(ac); goto main_queue; } main_queue: /* process the main queue - do the multicast send and attempt * unicast, dump failed record sends to the retry queue; if * sk == NULL due to previous failures we will just do the * multicast send and move the record to the hold queue */ rc = kauditd_send_queue(sk, portid, &audit_queue, 1, kauditd_send_multicast_skb, (sk ? kauditd_retry_skb : kauditd_hold_skb)); if (ac && rc < 0) auditd_reset(ac); sk = NULL; /* drop our netns reference, no auditd sends past this line */ if (net) { put_net(net); net = NULL; } /* we have processed all the queues so wake everyone */ wake_up(&audit_backlog_wait); /* NOTE: we want to wake up if there is anything on the queue, * regardless of if an auditd is connected, as we need to * do the multicast send and rotate records from the * main queue to the retry/hold queues */ wait_event_freezable(kauditd_wait, (skb_queue_len(&audit_queue) ? 1 : 0)); } return 0; } int audit_send_list_thread(void *_dest) { struct audit_netlink_list *dest = _dest; struct sk_buff *skb; struct sock *sk = audit_get_sk(dest->net); /* wait for parent to finish and send an ACK */ audit_ctl_lock(); audit_ctl_unlock(); while ((skb = __skb_dequeue(&dest->q)) != NULL) netlink_unicast(sk, skb, dest->portid, 0); put_net(dest->net); kfree(dest); return 0; } struct sk_buff *audit_make_reply(int seq, int type, int done, int multi, const void *payload, int size) { struct sk_buff *skb; struct nlmsghdr *nlh; void *data; int flags = multi ? NLM_F_MULTI : 0; int t = done ? NLMSG_DONE : type; skb = nlmsg_new(size, GFP_KERNEL); if (!skb) return NULL; nlh = nlmsg_put(skb, 0, seq, t, size, flags); if (!nlh) goto out_kfree_skb; data = nlmsg_data(nlh); memcpy(data, payload, size); return skb; out_kfree_skb: kfree_skb(skb); return NULL; } static void audit_free_reply(struct audit_reply *reply) { if (!reply) return; kfree_skb(reply->skb); if (reply->net) put_net(reply->net); kfree(reply); } static int audit_send_reply_thread(void *arg) { struct audit_reply *reply = (struct audit_reply *)arg; audit_ctl_lock(); audit_ctl_unlock(); /* Ignore failure. It'll only happen if the sender goes away, because our timeout is set to infinite. */ netlink_unicast(audit_get_sk(reply->net), reply->skb, reply->portid, 0); reply->skb = NULL; audit_free_reply(reply); return 0; } /** * audit_send_reply - send an audit reply message via netlink * @request_skb: skb of request we are replying to (used to target the reply) * @seq: sequence number * @type: audit message type * @done: done (last) flag * @multi: multi-part message flag * @payload: payload data * @size: payload size * * Allocates a skb, builds the netlink message, and sends it to the port id. */ static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done, int multi, const void *payload, int size) { struct task_struct *tsk; struct audit_reply *reply; reply = kzalloc(sizeof(*reply), GFP_KERNEL); if (!reply) return; reply->skb = audit_make_reply(seq, type, done, multi, payload, size); if (!reply->skb) goto err; reply->net = get_net(sock_net(NETLINK_CB(request_skb).sk)); reply->portid = NETLINK_CB(request_skb).portid; tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply"); if (IS_ERR(tsk)) goto err; return; err: audit_free_reply(reply); } /* * Check for appropriate CAP_AUDIT_ capabilities on incoming audit * control messages. */ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) { int err = 0; /* Only support initial user namespace for now. */ /* * We return ECONNREFUSED because it tricks userspace into thinking * that audit was not configured into the kernel. Lots of users * configure their PAM stack (because that's what the distro does) * to reject login if unable to send messages to audit. If we return * ECONNREFUSED the PAM stack thinks the kernel does not have audit * configured in and will let login proceed. If we return EPERM * userspace will reject all logins. This should be removed when we * support non init namespaces!! */ if (current_user_ns() != &init_user_ns) return -ECONNREFUSED; switch (msg_type) { case AUDIT_LIST: case AUDIT_ADD: case AUDIT_DEL: return -EOPNOTSUPP; case AUDIT_GET: case AUDIT_SET: case AUDIT_GET_FEATURE: case AUDIT_SET_FEATURE: case AUDIT_LIST_RULES: case AUDIT_ADD_RULE: case AUDIT_DEL_RULE: case AUDIT_SIGNAL_INFO: case AUDIT_TTY_GET: case AUDIT_TTY_SET: case AUDIT_TRIM: case AUDIT_MAKE_EQUIV: /* Only support auditd and auditctl in initial pid namespace * for now. */ if (task_active_pid_ns(current) != &init_pid_ns) return -EPERM; if (!netlink_capable(skb, CAP_AUDIT_CONTROL)) err = -EPERM; break; case AUDIT_USER: case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: if (!netlink_capable(skb, CAP_AUDIT_WRITE)) err = -EPERM; break; default: /* bad msg */ err = -EINVAL; } return err; } static void audit_log_common_recv_msg(struct audit_context *context, struct audit_buffer **ab, u16 msg_type) { uid_t uid = from_kuid(&init_user_ns, current_uid()); pid_t pid = task_tgid_nr(current); if (!audit_enabled && msg_type != AUDIT_USER_AVC) { *ab = NULL; return; } *ab = audit_log_start(context, GFP_KERNEL, msg_type); if (unlikely(!*ab)) return; audit_log_format(*ab, "pid=%d uid=%u ", pid, uid); audit_log_session_info(*ab); audit_log_task_context(*ab); } static inline void audit_log_user_recv_msg(struct audit_buffer **ab, u16 msg_type) { audit_log_common_recv_msg(NULL, ab, msg_type); } static int is_audit_feature_set(int i) { return af.features & AUDIT_FEATURE_TO_MASK(i); } static int audit_get_feature(struct sk_buff *skb) { u32 seq; seq = nlmsg_hdr(skb)->nlmsg_seq; audit_send_reply(skb, seq, AUDIT_GET_FEATURE, 0, 0, &af, sizeof(af)); return 0; } static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature, u32 old_lock, u32 new_lock, int res) { struct audit_buffer *ab; if (audit_enabled == AUDIT_OFF) return; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_FEATURE_CHANGE); if (!ab) return; audit_log_task_info(ab); audit_log_format(ab, " feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d", audit_feature_names[which], !!old_feature, !!new_feature, !!old_lock, !!new_lock, res); audit_log_end(ab); } static int audit_set_feature(struct audit_features *uaf) { int i; BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > ARRAY_SIZE(audit_feature_names)); /* if there is ever a version 2 we should handle that here */ for (i = 0; i <= AUDIT_LAST_FEATURE; i++) { u32 feature = AUDIT_FEATURE_TO_MASK(i); u32 old_feature, new_feature, old_lock, new_lock; /* if we are not changing this feature, move along */ if (!(feature & uaf->mask)) continue; old_feature = af.features & feature; new_feature = uaf->features & feature; new_lock = (uaf->lock | af.lock) & feature; old_lock = af.lock & feature; /* are we changing a locked feature? */ if (old_lock && (new_feature != old_feature)) { audit_log_feature_change(i, old_feature, new_feature, old_lock, new_lock, 0); return -EPERM; } } /* nothing invalid, do the changes */ for (i = 0; i <= AUDIT_LAST_FEATURE; i++) { u32 feature = AUDIT_FEATURE_TO_MASK(i); u32 old_feature, new_feature, old_lock, new_lock; /* if we are not changing this feature, move along */ if (!(feature & uaf->mask)) continue; old_feature = af.features & feature; new_feature = uaf->features & feature; old_lock = af.lock & feature; new_lock = (uaf->lock | af.lock) & feature; if (new_feature != old_feature) audit_log_feature_change(i, old_feature, new_feature, old_lock, new_lock, 1); if (new_feature) af.features |= feature; else af.features &= ~feature; af.lock |= new_lock; } return 0; } static int audit_replace(struct pid *pid) { pid_t pvnr; struct sk_buff *skb; pvnr = pid_vnr(pid); skb = audit_make_reply(0, AUDIT_REPLACE, 0, 0, &pvnr, sizeof(pvnr)); if (!skb) return -ENOMEM; return auditd_send_unicast_skb(skb); } static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { u32 seq; void *data; int data_len; int err; struct audit_buffer *ab; u16 msg_type = nlh->nlmsg_type; struct audit_sig_info *sig_data; char *ctx = NULL; u32 len; err = audit_netlink_ok(skb, msg_type); if (err) return err; seq = nlh->nlmsg_seq; data = nlmsg_data(nlh); data_len = nlmsg_len(nlh); switch (msg_type) { case AUDIT_GET: { struct audit_status s; memset(&s, 0, sizeof(s)); s.enabled = audit_enabled; s.failure = audit_failure; /* NOTE: use pid_vnr() so the PID is relative to the current * namespace */ s.pid = auditd_pid_vnr(); s.rate_limit = audit_rate_limit; s.backlog_limit = audit_backlog_limit; s.lost = atomic_read(&audit_lost); s.backlog = skb_queue_len(&audit_queue); s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL; s.backlog_wait_time = audit_backlog_wait_time; s.backlog_wait_time_actual = atomic_read(&audit_backlog_wait_time_actual); audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_SET: { struct audit_status s; memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ memcpy(&s, data, min_t(size_t, sizeof(s), data_len)); if (s.mask & AUDIT_STATUS_ENABLED) { err = audit_set_enabled(s.enabled); if (err < 0) return err; } if (s.mask & AUDIT_STATUS_FAILURE) { err = audit_set_failure(s.failure); if (err < 0) return err; } if (s.mask & AUDIT_STATUS_PID) { /* NOTE: we are using the vnr PID functions below * because the s.pid value is relative to the * namespace of the caller; at present this * doesn't matter much since you can really only * run auditd from the initial pid namespace, but * something to keep in mind if this changes */ pid_t new_pid = s.pid; pid_t auditd_pid; struct pid *req_pid = task_tgid(current); /* Sanity check - PID values must match. Setting * pid to 0 is how auditd ends auditing. */ if (new_pid && (new_pid != pid_vnr(req_pid))) return -EINVAL; /* test the auditd connection */ audit_replace(req_pid); auditd_pid = auditd_pid_vnr(); if (auditd_pid) { /* replacing a healthy auditd is not allowed */ if (new_pid) { audit_log_config_change("audit_pid", new_pid, auditd_pid, 0); return -EEXIST; } /* only current auditd can unregister itself */ if (pid_vnr(req_pid) != auditd_pid) { audit_log_config_change("audit_pid", new_pid, auditd_pid, 0); return -EACCES; } } if (new_pid) { /* register a new auditd connection */ err = auditd_set(req_pid, NETLINK_CB(skb).portid, sock_net(NETLINK_CB(skb).sk)); if (audit_enabled != AUDIT_OFF) audit_log_config_change("audit_pid", new_pid, auditd_pid, err ? 0 : 1); if (err) return err; /* try to process any backlog */ wake_up_interruptible(&kauditd_wait); } else { if (audit_enabled != AUDIT_OFF) audit_log_config_change("audit_pid", new_pid, auditd_pid, 1); /* unregister the auditd connection */ auditd_reset(NULL); } } if (s.mask & AUDIT_STATUS_RATE_LIMIT) { err = audit_set_rate_limit(s.rate_limit); if (err < 0) return err; } if (s.mask & AUDIT_STATUS_BACKLOG_LIMIT) { err = audit_set_backlog_limit(s.backlog_limit); if (err < 0) return err; } if (s.mask & AUDIT_STATUS_BACKLOG_WAIT_TIME) { if (sizeof(s) > (size_t)nlh->nlmsg_len) return -EINVAL; if (s.backlog_wait_time > 10*AUDIT_BACKLOG_WAIT_TIME) return -EINVAL; err = audit_set_backlog_wait_time(s.backlog_wait_time); if (err < 0) return err; } if (s.mask == AUDIT_STATUS_LOST) { u32 lost = atomic_xchg(&audit_lost, 0); audit_log_config_change("lost", 0, lost, 1); return lost; } if (s.mask == AUDIT_STATUS_BACKLOG_WAIT_TIME_ACTUAL) { u32 actual = atomic_xchg(&audit_backlog_wait_time_actual, 0); audit_log_config_change("backlog_wait_time_actual", 0, actual, 1); return actual; } break; } case AUDIT_GET_FEATURE: err = audit_get_feature(skb); if (err) return err; break; case AUDIT_SET_FEATURE: if (data_len < sizeof(struct audit_features)) return -EINVAL; err = audit_set_feature(data); if (err) return err; break; case AUDIT_USER: case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: if (!audit_enabled && msg_type != AUDIT_USER_AVC) return 0; /* exit early if there isn't at least one character to print */ if (data_len < 2) return -EINVAL; err = audit_filter(msg_type, AUDIT_FILTER_USER); if (err == 1) { /* match or error */ char *str = data; err = 0; if (msg_type == AUDIT_USER_TTY) { err = tty_audit_push(); if (err) break; } audit_log_user_recv_msg(&ab, msg_type); if (msg_type != AUDIT_USER_TTY) { /* ensure NULL termination */ str[data_len - 1] = '\0'; audit_log_format(ab, " msg='%.*s'", AUDIT_MESSAGE_TEXT_MAX, str); } else { audit_log_format(ab, " data="); if (str[data_len - 1] == '\0') data_len--; audit_log_n_untrustedstring(ab, str, data_len); } audit_log_end(ab); } break; case AUDIT_ADD_RULE: case AUDIT_DEL_RULE: if (data_len < sizeof(struct audit_rule_data)) return -EINVAL; if (audit_enabled == AUDIT_LOCKED) { audit_log_common_recv_msg(audit_context(), &ab, AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=%s audit_enabled=%d res=0", msg_type == AUDIT_ADD_RULE ? "add_rule" : "remove_rule", audit_enabled); audit_log_end(ab); return -EPERM; } err = audit_rule_change(msg_type, seq, data, data_len); break; case AUDIT_LIST_RULES: err = audit_list_rules_send(skb, seq); break; case AUDIT_TRIM: audit_trim_trees(); audit_log_common_recv_msg(audit_context(), &ab, AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=trim res=1"); audit_log_end(ab); break; case AUDIT_MAKE_EQUIV: { void *bufp = data; u32 sizes[2]; size_t msglen = data_len; char *old, *new; err = -EINVAL; if (msglen < 2 * sizeof(u32)) break; memcpy(sizes, bufp, 2 * sizeof(u32)); bufp += 2 * sizeof(u32); msglen -= 2 * sizeof(u32); old = audit_unpack_string(&bufp, &msglen, sizes[0]); if (IS_ERR(old)) { err = PTR_ERR(old); break; } new = audit_unpack_string(&bufp, &msglen, sizes[1]); if (IS_ERR(new)) { err = PTR_ERR(new); kfree(old); break; } /* OK, here comes... */ err = audit_tag_tree(old, new); audit_log_common_recv_msg(audit_context(), &ab, AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=make_equiv old="); audit_log_untrustedstring(ab, old); audit_log_format(ab, " new="); audit_log_untrustedstring(ab, new); audit_log_format(ab, " res=%d", !err); audit_log_end(ab); kfree(old); kfree(new); break; } case AUDIT_SIGNAL_INFO: len = 0; if (audit_sig_sid) { err = security_secid_to_secctx(audit_sig_sid, &ctx, &len); if (err) return err; } sig_data = kmalloc(struct_size(sig_data, ctx, len), GFP_KERNEL); if (!sig_data) { if (audit_sig_sid) security_release_secctx(ctx, len); return -ENOMEM; } sig_data->uid = from_kuid(&init_user_ns, audit_sig_uid); sig_data->pid = audit_sig_pid; if (audit_sig_sid) { memcpy(sig_data->ctx, ctx, len); security_release_secctx(ctx, len); } audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0, sig_data, struct_size(sig_data, ctx, len)); kfree(sig_data); break; case AUDIT_TTY_GET: { struct audit_tty_status s; unsigned int t; t = READ_ONCE(current->signal->audit_tty); s.enabled = t & AUDIT_TTY_ENABLE; s.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD); audit_send_reply(skb, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_TTY_SET: { struct audit_tty_status s, old; struct audit_buffer *ab; unsigned int t; memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ memcpy(&s, data, min_t(size_t, sizeof(s), data_len)); /* check if new data is valid */ if ((s.enabled != 0 && s.enabled != 1) || (s.log_passwd != 0 && s.log_passwd != 1)) err = -EINVAL; if (err) t = READ_ONCE(current->signal->audit_tty); else { t = s.enabled | (-s.log_passwd & AUDIT_TTY_LOG_PASSWD); t = xchg(&current->signal->audit_tty, t); } old.enabled = t & AUDIT_TTY_ENABLE; old.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD); audit_log_common_recv_msg(audit_context(), &ab, AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=tty_set old-enabled=%d new-enabled=%d" " old-log_passwd=%d new-log_passwd=%d res=%d", old.enabled, s.enabled, old.log_passwd, s.log_passwd, !err); audit_log_end(ab); break; } default: err = -EINVAL; break; } return err < 0 ? err : 0; } /** * audit_receive - receive messages from a netlink control socket * @skb: the message buffer * * Parse the provided skb and deal with any messages that may be present, * malformed skbs are discarded. */ static void audit_receive(struct sk_buff *skb) { struct nlmsghdr *nlh; /* * len MUST be signed for nlmsg_next to be able to dec it below 0 * if the nlmsg_len was not aligned */ int len; int err; nlh = nlmsg_hdr(skb); len = skb->len; audit_ctl_lock(); while (nlmsg_ok(nlh, len)) { err = audit_receive_msg(skb, nlh); /* if err or if this message says it wants a response */ if (err || (nlh->nlmsg_flags & NLM_F_ACK)) netlink_ack(skb, nlh, err, NULL); nlh = nlmsg_next(nlh, &len); } audit_ctl_unlock(); /* can't block with the ctrl lock, so penalize the sender now */ if (audit_backlog_limit && (skb_queue_len(&audit_queue) > audit_backlog_limit)) { DECLARE_WAITQUEUE(wait, current); /* wake kauditd to try and flush the queue */ wake_up_interruptible(&kauditd_wait); add_wait_queue_exclusive(&audit_backlog_wait, &wait); set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(audit_backlog_wait_time); remove_wait_queue(&audit_backlog_wait, &wait); } } /* Log information about who is connecting to the audit multicast socket */ static void audit_log_multicast(int group, const char *op, int err) { const struct cred *cred; struct tty_struct *tty; char comm[sizeof(current->comm)]; struct audit_buffer *ab; if (!audit_enabled) return; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_EVENT_LISTENER); if (!ab) return; cred = current_cred(); tty = audit_get_tty(); audit_log_format(ab, "pid=%u uid=%u auid=%u tty=%s ses=%u", task_pid_nr(current), from_kuid(&init_user_ns, cred->uid), from_kuid(&init_user_ns, audit_get_loginuid(current)), tty ? tty_name(tty) : "(none)", audit_get_sessionid(current)); audit_put_tty(tty); audit_log_task_context(ab); /* subj= */ audit_log_format(ab, " comm="); audit_log_untrustedstring(ab, get_task_comm(comm, current)); audit_log_d_path_exe(ab, current->mm); /* exe= */ audit_log_format(ab, " nl-mcgrp=%d op=%s res=%d", group, op, !err); audit_log_end(ab); } /* Run custom bind function on netlink socket group connect or bind requests. */ static int audit_multicast_bind(struct net *net, int group) { int err = 0; if (!capable(CAP_AUDIT_READ)) err = -EPERM; audit_log_multicast(group, "connect", err); return err; } static void audit_multicast_unbind(struct net *net, int group) { audit_log_multicast(group, "disconnect", 0); } static int __net_init audit_net_init(struct net *net) { struct netlink_kernel_cfg cfg = { .input = audit_receive, .bind = audit_multicast_bind, .unbind = audit_multicast_unbind, .flags = NL_CFG_F_NONROOT_RECV, .groups = AUDIT_NLGRP_MAX, }; struct audit_net *aunet = net_generic(net, audit_net_id); aunet->sk = netlink_kernel_create(net, NETLINK_AUDIT, &cfg); if (aunet->sk == NULL) { audit_panic("cannot initialize netlink socket in namespace"); return -ENOMEM; } /* limit the timeout in case auditd is blocked/stopped */ aunet->sk->sk_sndtimeo = HZ / 10; return 0; } static void __net_exit audit_net_exit(struct net *net) { struct audit_net *aunet = net_generic(net, audit_net_id); /* NOTE: you would think that we would want to check the auditd * connection and potentially reset it here if it lives in this * namespace, but since the auditd connection tracking struct holds a * reference to this namespace (see auditd_set()) we are only ever * going to get here after that connection has been released */ netlink_kernel_release(aunet->sk); } static struct pernet_operations audit_net_ops __net_initdata = { .init = audit_net_init, .exit = audit_net_exit, .id = &audit_net_id, .size = sizeof(struct audit_net), }; /* Initialize audit support at boot time. */ static int __init audit_init(void) { int i; if (audit_initialized == AUDIT_DISABLED) return 0; audit_buffer_cache = kmem_cache_create("audit_buffer", sizeof(struct audit_buffer), 0, SLAB_PANIC, NULL); skb_queue_head_init(&audit_queue); skb_queue_head_init(&audit_retry_queue); skb_queue_head_init(&audit_hold_queue); for (i = 0; i < AUDIT_INODE_BUCKETS; i++) INIT_LIST_HEAD(&audit_inode_hash[i]); mutex_init(&audit_cmd_mutex.lock); audit_cmd_mutex.owner = NULL; pr_info("initializing netlink subsys (%s)\n", audit_default ? "enabled" : "disabled"); register_pernet_subsys(&audit_net_ops); audit_initialized = AUDIT_INITIALIZED; kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); if (IS_ERR(kauditd_task)) { int err = PTR_ERR(kauditd_task); panic("audit: failed to start the kauditd thread (%d)\n", err); } audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "state=initialized audit_enabled=%u res=1", audit_enabled); return 0; } postcore_initcall(audit_init); /* * Process kernel command-line parameter at boot time. * audit={0|off} or audit={1|on}. */ static int __init audit_enable(char *str) { if (!strcasecmp(str, "off") || !strcmp(str, "0")) audit_default = AUDIT_OFF; else if (!strcasecmp(str, "on") || !strcmp(str, "1")) audit_default = AUDIT_ON; else { pr_err("audit: invalid 'audit' parameter value (%s)\n", str); audit_default = AUDIT_ON; } if (audit_default == AUDIT_OFF) audit_initialized = AUDIT_DISABLED; if (audit_set_enabled(audit_default)) pr_err("audit: error setting audit state (%d)\n", audit_default); pr_info("%s\n", audit_default ? "enabled (after initialization)" : "disabled (until reboot)"); return 1; } __setup("audit=", audit_enable); /* Process kernel command-line parameter at boot time. * audit_backlog_limit=<n> */ static int __init audit_backlog_limit_set(char *str) { u32 audit_backlog_limit_arg; pr_info("audit_backlog_limit: "); if (kstrtouint(str, 0, &audit_backlog_limit_arg)) { pr_cont("using default of %u, unable to parse %s\n", audit_backlog_limit, str); return 1; } audit_backlog_limit = audit_backlog_limit_arg; pr_cont("%d\n", audit_backlog_limit); return 1; } __setup("audit_backlog_limit=", audit_backlog_limit_set); static void audit_buffer_free(struct audit_buffer *ab) { if (!ab) return; kfree_skb(ab->skb); kmem_cache_free(audit_buffer_cache, ab); } static struct audit_buffer *audit_buffer_alloc(struct audit_context *ctx, gfp_t gfp_mask, int type) { struct audit_buffer *ab; ab = kmem_cache_alloc(audit_buffer_cache, gfp_mask); if (!ab) return NULL; ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask); if (!ab->skb) goto err; if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0)) goto err; ab->ctx = ctx; ab->gfp_mask = gfp_mask; return ab; err: audit_buffer_free(ab); return NULL; } /** * audit_serial - compute a serial number for the audit record * * Compute a serial number for the audit record. Audit records are * written to user-space as soon as they are generated, so a complete * audit record may be written in several pieces. The timestamp of the * record and this serial number are used by the user-space tools to * determine which pieces belong to the same audit record. The * (timestamp,serial) tuple is unique for each syscall and is live from * syscall entry to syscall exit. * * NOTE: Another possibility is to store the formatted records off the * audit context (for those records that have a context), and emit them * all at syscall exit. However, this could delay the reporting of * significant errors until syscall exit (or never, if the system * halts). */ unsigned int audit_serial(void) { static atomic_t serial = ATOMIC_INIT(0); return atomic_inc_return(&serial); } static inline void audit_get_stamp(struct audit_context *ctx, struct timespec64 *t, unsigned int *serial) { if (!ctx || !auditsc_get_stamp(ctx, t, serial)) { ktime_get_coarse_real_ts64(t); *serial = audit_serial(); } } /** * audit_log_start - obtain an audit buffer * @ctx: audit_context (may be NULL) * @gfp_mask: type of allocation * @type: audit message type * * Returns audit_buffer pointer on success or NULL on error. * * Obtain an audit buffer. This routine does locking to obtain the * audit buffer, but then no locking is required for calls to * audit_log_*format. If the task (ctx) is a task that is currently in a * syscall, then the syscall is marked as auditable and an audit record * will be written at syscall exit. If there is no associated task, then * task context (ctx) should be NULL. */ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type) { struct audit_buffer *ab; struct timespec64 t; unsigned int serial; if (audit_initialized != AUDIT_INITIALIZED) return NULL; if (unlikely(!audit_filter(type, AUDIT_FILTER_EXCLUDE))) return NULL; /* NOTE: don't ever fail/sleep on these two conditions: * 1. auditd generated record - since we need auditd to drain the * queue; also, when we are checking for auditd, compare PIDs using * task_tgid_vnr() since auditd_pid is set in audit_receive_msg() * using a PID anchored in the caller's namespace * 2. generator holding the audit_cmd_mutex - we don't want to block * while holding the mutex, although we do penalize the sender * later in audit_receive() when it is safe to block */ if (!(auditd_test_task(current) || audit_ctl_owner_current())) { long stime = audit_backlog_wait_time; while (audit_backlog_limit && (skb_queue_len(&audit_queue) > audit_backlog_limit)) { /* wake kauditd to try and flush the queue */ wake_up_interruptible(&kauditd_wait); /* sleep if we are allowed and we haven't exhausted our * backlog wait limit */ if (gfpflags_allow_blocking(gfp_mask) && (stime > 0)) { long rtime = stime; DECLARE_WAITQUEUE(wait, current); add_wait_queue_exclusive(&audit_backlog_wait, &wait); set_current_state(TASK_UNINTERRUPTIBLE); stime = schedule_timeout(rtime); atomic_add(rtime - stime, &audit_backlog_wait_time_actual); remove_wait_queue(&audit_backlog_wait, &wait); } else { if (audit_rate_check() && printk_ratelimit()) pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n", skb_queue_len(&audit_queue), audit_backlog_limit); audit_log_lost("backlog limit exceeded"); return NULL; } } } ab = audit_buffer_alloc(ctx, gfp_mask, type); if (!ab) { audit_log_lost("out of memory in audit_log_start"); return NULL; } audit_get_stamp(ab->ctx, &t, &serial); /* cancel dummy context to enable supporting records */ if (ctx) ctx->dummy = 0; audit_log_format(ab, "audit(%llu.%03lu:%u): ", (unsigned long long)t.tv_sec, t.tv_nsec/1000000, serial); return ab; } /** * audit_expand - expand skb in the audit buffer * @ab: audit_buffer * @extra: space to add at tail of the skb * * Returns 0 (no space) on failed expansion, or available space if * successful. */ static inline int audit_expand(struct audit_buffer *ab, int extra) { struct sk_buff *skb = ab->skb; int oldtail = skb_tailroom(skb); int ret = pskb_expand_head(skb, 0, extra, ab->gfp_mask); int newtail = skb_tailroom(skb); if (ret < 0) { audit_log_lost("out of memory in audit_expand"); return 0; } skb->truesize += newtail - oldtail; return newtail; } /* * Format an audit message into the audit buffer. If there isn't enough * room in the audit buffer, more room will be allocated and vsnprint * will be called a second time. Currently, we assume that a printk * can't format message larger than 1024 bytes, so we don't either. */ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt, va_list args) { int len, avail; struct sk_buff *skb; va_list args2; if (!ab) return; BUG_ON(!ab->skb); skb = ab->skb; avail = skb_tailroom(skb); if (avail == 0) { avail = audit_expand(ab, AUDIT_BUFSIZ); if (!avail) goto out; } va_copy(args2, args); len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args); if (len >= avail) { /* The printk buffer is 1024 bytes long, so if we get * here and AUDIT_BUFSIZ is at least 1024, then we can * log everything that printk could have logged. */ avail = audit_expand(ab, max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); if (!avail) goto out_va_end; len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2); } if (len > 0) skb_put(skb, len); out_va_end: va_end(args2); out: return; } /** * audit_log_format - format a message into the audit buffer. * @ab: audit_buffer * @fmt: format string * @...: optional parameters matching @fmt string * * All the work is done in audit_log_vformat. */ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) { va_list args; if (!ab) return; va_start(args, fmt); audit_log_vformat(ab, fmt, args); va_end(args); } /** * audit_log_n_hex - convert a buffer to hex and append it to the audit skb * @ab: the audit_buffer * @buf: buffer to convert to hex * @len: length of @buf to be converted * * No return value; failure to expand is silently ignored. * * This function will take the passed buf and convert it into a string of * ascii hex digits. The new string is placed onto the skb. */ void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len) { int i, avail, new_len; unsigned char *ptr; struct sk_buff *skb; if (!ab) return; BUG_ON(!ab->skb); skb = ab->skb; avail = skb_tailroom(skb); new_len = len<<1; if (new_len >= avail) { /* Round the buffer request up to the next multiple */ new_len = AUDIT_BUFSIZ*(((new_len-avail)/AUDIT_BUFSIZ) + 1); avail = audit_expand(ab, new_len); if (!avail) return; } ptr = skb_tail_pointer(skb); for (i = 0; i < len; i++) ptr = hex_byte_pack_upper(ptr, buf[i]); *ptr = 0; skb_put(skb, len << 1); /* new string is twice the old string */ } /* * Format a string of no more than slen characters into the audit buffer, * enclosed in quote marks. */ void audit_log_n_string(struct audit_buffer *ab, const char *string, size_t slen) { int avail, new_len; unsigned char *ptr; struct sk_buff *skb; if (!ab) return; BUG_ON(!ab->skb); skb = ab->skb; avail = skb_tailroom(skb); new_len = slen + 3; /* enclosing quotes + null terminator */ if (new_len > avail) { avail = audit_expand(ab, new_len); if (!avail) return; } ptr = skb_tail_pointer(skb); *ptr++ = '"'; memcpy(ptr, string, slen); ptr += slen; *ptr++ = '"'; *ptr = 0; skb_put(skb, slen + 2); /* don't include null terminator */ } /** * audit_string_contains_control - does a string need to be logged in hex * @string: string to be checked * @len: max length of the string to check */ bool audit_string_contains_control(const char *string, size_t len) { const unsigned char *p; for (p = string; p < (const unsigned char *)string + len; p++) { if (*p == '"' || *p < 0x21 || *p > 0x7e) return true; } return false; } /** * audit_log_n_untrustedstring - log a string that may contain random characters * @ab: audit_buffer * @len: length of string (not including trailing null) * @string: string to be logged * * This code will escape a string that is passed to it if the string * contains a control character, unprintable character, double quote mark, * or a space. Unescaped strings will start and end with a double quote mark. * Strings that are escaped are printed in hex (2 digits per char). * * The caller specifies the number of characters in the string to log, which may * or may not be the entire string. */ void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string, size_t len) { if (audit_string_contains_control(string, len)) audit_log_n_hex(ab, string, len); else audit_log_n_string(ab, string, len); } /** * audit_log_untrustedstring - log a string that may contain random characters * @ab: audit_buffer * @string: string to be logged * * Same as audit_log_n_untrustedstring(), except that strlen is used to * determine string length. */ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) { audit_log_n_untrustedstring(ab, string, strlen(string)); } /* This is a helper-function to print the escaped d_path */ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, const struct path *path) { char *p, *pathname; if (prefix) audit_log_format(ab, "%s", prefix); /* We will allow 11 spaces for ' (deleted)' to be appended */ pathname = kmalloc(PATH_MAX+11, ab->gfp_mask); if (!pathname) { audit_log_format(ab, "\"<no_memory>\""); return; } p = d_path(path, pathname, PATH_MAX+11); if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */ /* FIXME: can we save some information here? */ audit_log_format(ab, "\"<too_long>\""); } else audit_log_untrustedstring(ab, p); kfree(pathname); } void audit_log_session_info(struct audit_buffer *ab) { unsigned int sessionid = audit_get_sessionid(current); uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current)); audit_log_format(ab, "auid=%u ses=%u", auid, sessionid); } void audit_log_key(struct audit_buffer *ab, char *key) { audit_log_format(ab, " key="); if (key) audit_log_untrustedstring(ab, key); else audit_log_format(ab, "(null)"); } int audit_log_task_context(struct audit_buffer *ab) { char *ctx = NULL; unsigned len; int error; u32 sid; security_current_getsecid_subj(&sid); if (!sid) return 0; error = security_secid_to_secctx(sid, &ctx, &len); if (error) { if (error != -EINVAL) goto error_path; return 0; } audit_log_format(ab, " subj=%s", ctx); security_release_secctx(ctx, len); return 0; error_path: audit_panic("error in audit_log_task_context"); return error; } EXPORT_SYMBOL(audit_log_task_context); void audit_log_d_path_exe(struct audit_buffer *ab, struct mm_struct *mm) { struct file *exe_file; if (!mm) goto out_null; exe_file = get_mm_exe_file(mm); if (!exe_file) goto out_null; audit_log_d_path(ab, " exe=", &exe_file->f_path); fput(exe_file); return; out_null: audit_log_format(ab, " exe=(null)"); } struct tty_struct *audit_get_tty(void) { struct tty_struct *tty = NULL; unsigned long flags; spin_lock_irqsave(&current->sighand->siglock, flags); if (current->signal) tty = tty_kref_get(current->signal->tty); spin_unlock_irqrestore(&current->sighand->siglock, flags); return tty; } void audit_put_tty(struct tty_struct *tty) { tty_kref_put(tty); } void audit_log_task_info(struct audit_buffer *ab) { const struct cred *cred; char comm[sizeof(current->comm)]; struct tty_struct *tty; if (!ab) return; cred = current_cred(); tty = audit_get_tty(); audit_log_format(ab, " ppid=%d pid=%d auid=%u uid=%u gid=%u" " euid=%u suid=%u fsuid=%u" " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", task_ppid_nr(current), task_tgid_nr(current), from_kuid(&init_user_ns, audit_get_loginuid(current)), from_kuid(&init_user_ns, cred->uid), from_kgid(&init_user_ns, cred->gid), from_kuid(&init_user_ns, cred->euid), from_kuid(&init_user_ns, cred->suid), from_kuid(&init_user_ns, cred->fsuid), from_kgid(&init_user_ns, cred->egid), from_kgid(&init_user_ns, cred->sgid), from_kgid(&init_user_ns, cred->fsgid), tty ? tty_name(tty) : "(none)", audit_get_sessionid(current)); audit_put_tty(tty); audit_log_format(ab, " comm="); audit_log_untrustedstring(ab, get_task_comm(comm, current)); audit_log_d_path_exe(ab, current->mm); audit_log_task_context(ab); } EXPORT_SYMBOL(audit_log_task_info); /** * audit_log_path_denied - report a path restriction denial * @type: audit message type (AUDIT_ANOM_LINK, AUDIT_ANOM_CREAT, etc) * @operation: specific operation name */ void audit_log_path_denied(int type, const char *operation) { struct audit_buffer *ab; if (!audit_enabled || audit_dummy_context()) return; /* Generate log with subject, operation, outcome. */ ab = audit_log_start(audit_context(), GFP_KERNEL, type); if (!ab) return; audit_log_format(ab, "op=%s", operation); audit_log_task_info(ab); audit_log_format(ab, " res=0"); audit_log_end(ab); } /* global counter which is incremented every time something logs in */ static atomic_t session_id = ATOMIC_INIT(0); static int audit_set_loginuid_perm(kuid_t loginuid) { /* if we are unset, we don't need privs */ if (!audit_loginuid_set(current)) return 0; /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/ if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE)) return -EPERM; /* it is set, you need permission */ if (!capable(CAP_AUDIT_CONTROL)) return -EPERM; /* reject if this is not an unset and we don't allow that */ if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID) && uid_valid(loginuid)) return -EPERM; return 0; } static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, unsigned int oldsessionid, unsigned int sessionid, int rc) { struct audit_buffer *ab; uid_t uid, oldloginuid, loginuid; struct tty_struct *tty; if (!audit_enabled) return; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_LOGIN); if (!ab) return; uid = from_kuid(&init_user_ns, task_uid(current)); oldloginuid = from_kuid(&init_user_ns, koldloginuid); loginuid = from_kuid(&init_user_ns, kloginuid); tty = audit_get_tty(); audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid); audit_log_task_context(ab); audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d", oldloginuid, loginuid, tty ? tty_name(tty) : "(none)", oldsessionid, sessionid, !rc); audit_put_tty(tty); audit_log_end(ab); } /** * audit_set_loginuid - set current task's loginuid * @loginuid: loginuid value * * Returns 0. * * Called (set) from fs/proc/base.c::proc_loginuid_write(). */ int audit_set_loginuid(kuid_t loginuid) { unsigned int oldsessionid, sessionid = AUDIT_SID_UNSET; kuid_t oldloginuid; int rc; oldloginuid = audit_get_loginuid(current); oldsessionid = audit_get_sessionid(current); rc = audit_set_loginuid_perm(loginuid); if (rc) goto out; /* are we setting or clearing? */ if (uid_valid(loginuid)) { sessionid = (unsigned int)atomic_inc_return(&session_id); if (unlikely(sessionid == AUDIT_SID_UNSET)) sessionid = (unsigned int)atomic_inc_return(&session_id); } current->sessionid = sessionid; current->loginuid = loginuid; out: audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc); return rc; } /** * audit_signal_info - record signal info for shutting down audit subsystem * @sig: signal value * @t: task being signaled * * If the audit subsystem is being terminated, record the task (pid) * and uid that is doing that. */ int audit_signal_info(int sig, struct task_struct *t) { kuid_t uid = current_uid(), auid; if (auditd_test_task(t) && (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2)) { audit_sig_pid = task_tgid_nr(current); auid = audit_get_loginuid(current); if (uid_valid(auid)) audit_sig_uid = auid; else audit_sig_uid = uid; security_current_getsecid_subj(&audit_sig_sid); } return audit_signal_info_syscall(t); } /** * audit_log_end - end one audit record * @ab: the audit_buffer * * We can not do a netlink send inside an irq context because it blocks (last * arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed on a * queue and a kthread is scheduled to remove them from the queue outside the * irq context. May be called in any context. */ void audit_log_end(struct audit_buffer *ab) { struct sk_buff *skb; struct nlmsghdr *nlh; if (!ab) return; if (audit_rate_check()) { skb = ab->skb; ab->skb = NULL; /* setup the netlink header, see the comments in * kauditd_send_multicast_skb() for length quirks */ nlh = nlmsg_hdr(skb); nlh->nlmsg_len = skb->len - NLMSG_HDRLEN; /* queue the netlink packet and poke the kauditd thread */ skb_queue_tail(&audit_queue, skb); wake_up_interruptible(&kauditd_wait); } else audit_log_lost("rate limit exceeded"); audit_buffer_free(ab); } /** * audit_log - Log an audit record * @ctx: audit context * @gfp_mask: type of allocation * @type: audit message type * @fmt: format string to use * @...: variable parameters matching the format string * * This is a convenience function that calls audit_log_start, * audit_log_vformat, and audit_log_end. It may be called * in any context. */ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, const char *fmt, ...) { struct audit_buffer *ab; va_list args; ab = audit_log_start(ctx, gfp_mask, type); if (ab) { va_start(args, fmt); audit_log_vformat(ab, fmt, args); va_end(args); audit_log_end(ab); } } EXPORT_SYMBOL(audit_log_start); EXPORT_SYMBOL(audit_log_end); EXPORT_SYMBOL(audit_log_format); EXPORT_SYMBOL(audit_log);
32 32 14 64 64 64 64 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 #ifndef __NET_SCHED_CODEL_IMPL_H #define __NET_SCHED_CODEL_IMPL_H /* * Codel - The Controlled-Delay Active Queue Management algorithm * * Copyright (C) 2011-2012 Kathleen Nichols <nichols@pollere.com> * Copyright (C) 2011-2012 Van Jacobson <van@pollere.net> * Copyright (C) 2012 Michael D. Taht <dave.taht@bufferbloat.net> * Copyright (C) 2012,2015 Eric Dumazet <edumazet@google.com> * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote products * derived from this software without specific prior written permission. * * Alternatively, provided that this notice is retained in full, this * software may be distributed under the terms of the GNU General * Public License ("GPL") version 2, in which case the provisions of the * GPL apply INSTEAD OF those given above. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * */ /* Controlling Queue Delay (CoDel) algorithm * ========================================= * Source : Kathleen Nichols and Van Jacobson * http://queue.acm.org/detail.cfm?id=2209336 * * Implemented on linux by Dave Taht and Eric Dumazet */ #include <net/inet_ecn.h> static void codel_params_init(struct codel_params *params) { params->interval = MS2TIME(100); params->target = MS2TIME(5); params->ce_threshold = CODEL_DISABLED_THRESHOLD; params->ce_threshold_mask = 0; params->ce_threshold_selector = 0; params->ecn = false; } static void codel_vars_init(struct codel_vars *vars) { memset(vars, 0, sizeof(*vars)); } static void codel_stats_init(struct codel_stats *stats) { stats->maxpacket = 0; } /* * http://en.wikipedia.org/wiki/Methods_of_computing_square_roots#Iterative_methods_for_reciprocal_square_roots * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2) * * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32 */ static void codel_Newton_step(struct codel_vars *vars) { u32 invsqrt = ((u32)vars->rec_inv_sqrt) << REC_INV_SQRT_SHIFT; u32 invsqrt2 = ((u64)invsqrt * invsqrt) >> 32; u64 val = (3LL << 32) - ((u64)vars->count * invsqrt2); val >>= 2; /* avoid overflow in following multiply */ val = (val * invsqrt) >> (32 - 2 + 1); vars->rec_inv_sqrt = val >> REC_INV_SQRT_SHIFT; } /* * CoDel control_law is t + interval/sqrt(count) * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid * both sqrt() and divide operation. */ static codel_time_t codel_control_law(codel_time_t t, codel_time_t interval, u32 rec_inv_sqrt) { return t + reciprocal_scale(interval, rec_inv_sqrt << REC_INV_SQRT_SHIFT); } static bool codel_should_drop(const struct sk_buff *skb, void *ctx, struct codel_vars *vars, struct codel_params *params, struct codel_stats *stats, codel_skb_len_t skb_len_func, codel_skb_time_t skb_time_func, u32 *backlog, codel_time_t now) { bool ok_to_drop; u32 skb_len; if (!skb) { vars->first_above_time = 0; return false; } skb_len = skb_len_func(skb); vars->ldelay = now - skb_time_func(skb); if (unlikely(skb_len > stats->maxpacket)) stats->maxpacket = skb_len; if (codel_time_before(vars->ldelay, params->target) || *backlog <= params->mtu) { /* went below - stay below for at least interval */ vars->first_above_time = 0; return false; } ok_to_drop = false; if (vars->first_above_time == 0) { /* just went above from below. If we stay above * for at least interval we'll say it's ok to drop */ vars->first_above_time = now + params->interval; } else if (codel_time_after(now, vars->first_above_time)) { ok_to_drop = true; } return ok_to_drop; } static struct sk_buff *codel_dequeue(void *ctx, u32 *backlog, struct codel_params *params, struct codel_vars *vars, struct codel_stats *stats, codel_skb_len_t skb_len_func, codel_skb_time_t skb_time_func, codel_skb_drop_t drop_func, codel_skb_dequeue_t dequeue_func) { struct sk_buff *skb = dequeue_func(vars, ctx); codel_time_t now; bool drop; if (!skb) { vars->dropping = false; return skb; } now = codel_get_time(); drop = codel_should_drop(skb, ctx, vars, params, stats, skb_len_func, skb_time_func, backlog, now); if (vars->dropping) { if (!drop) { /* sojourn time below target - leave dropping state */ vars->dropping = false; } else if (codel_time_after_eq(now, vars->drop_next)) { /* It's time for the next drop. Drop the current * packet and dequeue the next. The dequeue might * take us out of dropping state. * If not, schedule the next drop. * A large backlog might result in drop rates so high * that the next drop should happen now, * hence the while loop. */ while (vars->dropping && codel_time_after_eq(now, vars->drop_next)) { vars->count++; /* dont care of possible wrap * since there is no more divide */ codel_Newton_step(vars); if (params->ecn && INET_ECN_set_ce(skb)) { stats->ecn_mark++; vars->drop_next = codel_control_law(vars->drop_next, params->interval, vars->rec_inv_sqrt); goto end; } stats->drop_len += skb_len_func(skb); drop_func(skb, ctx); stats->drop_count++; skb = dequeue_func(vars, ctx); if (!codel_should_drop(skb, ctx, vars, params, stats, skb_len_func, skb_time_func, backlog, now)) { /* leave dropping state */ vars->dropping = false; } else { /* and schedule the next drop */ vars->drop_next = codel_control_law(vars->drop_next, params->interval, vars->rec_inv_sqrt); } } } } else if (drop) { u32 delta; if (params->ecn && INET_ECN_set_ce(skb)) { stats->ecn_mark++; } else { stats->drop_len += skb_len_func(skb); drop_func(skb, ctx); stats->drop_count++; skb = dequeue_func(vars, ctx); drop = codel_should_drop(skb, ctx, vars, params, stats, skb_len_func, skb_time_func, backlog, now); } vars->dropping = true; /* if min went above target close to when we last went below it * assume that the drop rate that controlled the queue on the * last cycle is a good starting point to control it now. */ delta = vars->count - vars->lastcount; if (delta > 1 && codel_time_before(now - vars->drop_next, 16 * params->interval)) { vars->count = delta; /* we dont care if rec_inv_sqrt approximation * is not very precise : * Next Newton steps will correct it quadratically. */ codel_Newton_step(vars); } else { vars->count = 1; vars->rec_inv_sqrt = ~0U >> REC_INV_SQRT_SHIFT; } vars->lastcount = vars->count; vars->drop_next = codel_control_law(now, params->interval, vars->rec_inv_sqrt); } end: if (skb && codel_time_after(vars->ldelay, params->ce_threshold)) { bool set_ce = true; if (params->ce_threshold_mask) { int dsfield = skb_get_dsfield(skb); set_ce = (dsfield >= 0 && (((u8)dsfield & params->ce_threshold_mask) == params->ce_threshold_selector)); } if (set_ce && INET_ECN_set_ce(skb)) stats->ce_mark++; } return skb; } #endif
25 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 // SPDX-License-Identifier: GPL-2.0 /* User-mappable watch queue * * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * See Documentation/core-api/watch_queue.rst */ #ifndef _LINUX_WATCH_QUEUE_H #define _LINUX_WATCH_QUEUE_H #include <uapi/linux/watch_queue.h> #include <linux/kref.h> #include <linux/rcupdate.h> #ifdef CONFIG_WATCH_QUEUE struct cred; struct watch_type_filter { enum watch_notification_type type; __u32 subtype_filter[1]; /* Bitmask of subtypes to filter on */ __u32 info_filter; /* Filter on watch_notification::info */ __u32 info_mask; /* Mask of relevant bits in info_filter */ }; struct watch_filter { union { struct rcu_head rcu; /* Bitmask of accepted types */ DECLARE_BITMAP(type_filter, WATCH_TYPE__NR); }; u32 nr_filters; /* Number of filters */ struct watch_type_filter filters[]; }; struct watch_queue { struct rcu_head rcu; struct watch_filter __rcu *filter; struct pipe_inode_info *pipe; /* Pipe we use as a buffer, NULL if queue closed */ struct hlist_head watches; /* Contributory watches */ struct page **notes; /* Preallocated notifications */ unsigned long *notes_bitmap; /* Allocation bitmap for notes */ struct kref usage; /* Object usage count */ spinlock_t lock; unsigned int nr_notes; /* Number of notes */ unsigned int nr_pages; /* Number of pages in notes[] */ }; /* * Representation of a watch on an object. */ struct watch { union { struct rcu_head rcu; u32 info_id; /* ID to be OR'd in to info field */ }; struct watch_queue __rcu *queue; /* Queue to post events to */ struct hlist_node queue_node; /* Link in queue->watches */ struct watch_list __rcu *watch_list; struct hlist_node list_node; /* Link in watch_list->watchers */ const struct cred *cred; /* Creds of the owner of the watch */ void *private; /* Private data for the watched object */ u64 id; /* Internal identifier */ struct kref usage; /* Object usage count */ }; /* * List of watches on an object. */ struct watch_list { struct rcu_head rcu; struct hlist_head watchers; void (*release_watch)(struct watch *); spinlock_t lock; }; extern void __post_watch_notification(struct watch_list *, struct watch_notification *, const struct cred *, u64); extern struct watch_queue *get_watch_queue(int); extern void put_watch_queue(struct watch_queue *); extern void init_watch(struct watch *, struct watch_queue *); extern int add_watch_to_object(struct watch *, struct watch_list *); extern int remove_watch_from_object(struct watch_list *, struct watch_queue *, u64, bool); extern long watch_queue_set_size(struct pipe_inode_info *, unsigned int); extern long watch_queue_set_filter(struct pipe_inode_info *, struct watch_notification_filter __user *); extern int watch_queue_init(struct pipe_inode_info *); extern void watch_queue_clear(struct watch_queue *); static inline void init_watch_list(struct watch_list *wlist, void (*release_watch)(struct watch *)) { INIT_HLIST_HEAD(&wlist->watchers); spin_lock_init(&wlist->lock); wlist->release_watch = release_watch; } static inline void post_watch_notification(struct watch_list *wlist, struct watch_notification *n, const struct cred *cred, u64 id) { if (unlikely(wlist)) __post_watch_notification(wlist, n, cred, id); } static inline void remove_watch_list(struct watch_list *wlist, u64 id) { if (wlist) { remove_watch_from_object(wlist, NULL, id, true); kfree_rcu(wlist, rcu); } } /** * watch_sizeof - Calculate the information part of the size of a watch record, * given the structure size. */ #define watch_sizeof(STRUCT) (sizeof(STRUCT) << WATCH_INFO_LENGTH__SHIFT) #else static inline int watch_queue_init(struct pipe_inode_info *pipe) { return -ENOPKG; } #endif #endif /* _LINUX_WATCH_QUEUE_H */
16180 16178 16186 256 16182 4075 4076 142 142 135 1 135 130 126 127 57 135 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 // SPDX-License-Identifier: GPL-2.0 #include <linux/bitops.h> #include <linux/fault-inject-usercopy.h> #include <linux/instrumented.h> #include <linux/uaccess.h> #include <linux/nospec.h> /* out-of-line parts */ #ifndef INLINE_COPY_FROM_USER unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n) { unsigned long res = n; might_fault(); if (!should_fail_usercopy() && likely(access_ok(from, n))) { /* * Ensure that bad access_ok() speculation will not * lead to nasty side effects *after* the copy is * finished: */ barrier_nospec(); instrument_copy_from_user_before(to, from, n); res = raw_copy_from_user(to, from, n); instrument_copy_from_user_after(to, from, n, res); } if (unlikely(res)) memset(to + (n - res), 0, res); return res; } EXPORT_SYMBOL(_copy_from_user); #endif #ifndef INLINE_COPY_TO_USER unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); if (should_fail_usercopy()) return n; if (likely(access_ok(to, n))) { instrument_copy_to_user(to, from, n); n = raw_copy_to_user(to, from, n); } return n; } EXPORT_SYMBOL(_copy_to_user); #endif /** * check_zeroed_user: check if a userspace buffer only contains zero bytes * @from: Source address, in userspace. * @size: Size of buffer. * * This is effectively shorthand for "memchr_inv(from, 0, size) == NULL" for * userspace addresses (and is more efficient because we don't care where the * first non-zero byte is). * * Returns: * * 0: There were non-zero bytes present in the buffer. * * 1: The buffer was full of zero bytes. * * -EFAULT: access to userspace failed. */ int check_zeroed_user(const void __user *from, size_t size) { unsigned long val; uintptr_t align = (uintptr_t) from % sizeof(unsigned long); if (unlikely(size == 0)) return 1; from -= align; size += align; if (!user_read_access_begin(from, size)) return -EFAULT; unsafe_get_user(val, (unsigned long __user *) from, err_fault); if (align) val &= ~aligned_byte_mask(align); while (size > sizeof(unsigned long)) { if (unlikely(val)) goto done; from += sizeof(unsigned long); size -= sizeof(unsigned long); unsafe_get_user(val, (unsigned long __user *) from, err_fault); } if (size < sizeof(unsigned long)) val &= aligned_byte_mask(size); done: user_read_access_end(); return (val == 0); err_fault: user_read_access_end(); return -EFAULT; } EXPORT_SYMBOL(check_zeroed_user);
73 73 73 73 72 73 73 73 73 73 72 73 73 73 73 73 88 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 // SPDX-License-Identifier: GPL-2.0 /* * Implementation of HKDF ("HMAC-based Extract-and-Expand Key Derivation * Function"), aka RFC 5869. See also the original paper (Krawczyk 2010): * "Cryptographic Extraction and Key Derivation: The HKDF Scheme". * * This is used to derive keys from the fscrypt master keys. * * Copyright 2019 Google LLC */ #include <crypto/hash.h> #include <crypto/sha2.h> #include "fscrypt_private.h" /* * HKDF supports any unkeyed cryptographic hash algorithm, but fscrypt uses * SHA-512 because it is well-established, secure, and reasonably efficient. * * HKDF-SHA256 was also considered, as its 256-bit security strength would be * sufficient here. A 512-bit security strength is "nice to have", though. * Also, on 64-bit CPUs, SHA-512 is usually just as fast as SHA-256. In the * common case of deriving an AES-256-XTS key (512 bits), that can result in * HKDF-SHA512 being much faster than HKDF-SHA256, as the longer digest size of * SHA-512 causes HKDF-Expand to only need to do one iteration rather than two. */ #define HKDF_HMAC_ALG "hmac(sha512)" #define HKDF_HASHLEN SHA512_DIGEST_SIZE /* * HKDF consists of two steps: * * 1. HKDF-Extract: extract a pseudorandom key of length HKDF_HASHLEN bytes from * the input keying material and optional salt. * 2. HKDF-Expand: expand the pseudorandom key into output keying material of * any length, parameterized by an application-specific info string. * * HKDF-Extract can be skipped if the input is already a pseudorandom key of * length HKDF_HASHLEN bytes. However, cipher modes other than AES-256-XTS take * shorter keys, and we don't want to force users of those modes to provide * unnecessarily long master keys. Thus fscrypt still does HKDF-Extract. No * salt is used, since fscrypt master keys should already be pseudorandom and * there's no way to persist a random salt per master key from kernel mode. */ /* HKDF-Extract (RFC 5869 section 2.2), unsalted */ static int hkdf_extract(struct crypto_shash *hmac_tfm, const u8 *ikm, unsigned int ikmlen, u8 prk[HKDF_HASHLEN]) { static const u8 default_salt[HKDF_HASHLEN]; int err; err = crypto_shash_setkey(hmac_tfm, default_salt, HKDF_HASHLEN); if (err) return err; return crypto_shash_tfm_digest(hmac_tfm, ikm, ikmlen, prk); } /* * Compute HKDF-Extract using the given master key as the input keying material, * and prepare an HMAC transform object keyed by the resulting pseudorandom key. * * Afterwards, the keyed HMAC transform object can be used for HKDF-Expand many * times without having to recompute HKDF-Extract each time. */ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, unsigned int master_key_size) { struct crypto_shash *hmac_tfm; u8 prk[HKDF_HASHLEN]; int err; hmac_tfm = crypto_alloc_shash(HKDF_HMAC_ALG, 0, 0); if (IS_ERR(hmac_tfm)) { fscrypt_err(NULL, "Error allocating " HKDF_HMAC_ALG ": %ld", PTR_ERR(hmac_tfm)); return PTR_ERR(hmac_tfm); } if (WARN_ON_ONCE(crypto_shash_digestsize(hmac_tfm) != sizeof(prk))) { err = -EINVAL; goto err_free_tfm; } err = hkdf_extract(hmac_tfm, master_key, master_key_size, prk); if (err) goto err_free_tfm; err = crypto_shash_setkey(hmac_tfm, prk, sizeof(prk)); if (err) goto err_free_tfm; hkdf->hmac_tfm = hmac_tfm; goto out; err_free_tfm: crypto_free_shash(hmac_tfm); out: memzero_explicit(prk, sizeof(prk)); return err; } /* * HKDF-Expand (RFC 5869 section 2.3). This expands the pseudorandom key, which * was already keyed into 'hkdf->hmac_tfm' by fscrypt_init_hkdf(), into 'okmlen' * bytes of output keying material parameterized by the application-specific * 'info' of length 'infolen' bytes, prefixed by "fscrypt\0" and the 'context' * byte. This is thread-safe and may be called by multiple threads in parallel. * * ('context' isn't part of the HKDF specification; it's just a prefix fscrypt * adds to its application-specific info strings to guarantee that it doesn't * accidentally repeat an info string when using HKDF for different purposes.) */ int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, const u8 *info, unsigned int infolen, u8 *okm, unsigned int okmlen) { SHASH_DESC_ON_STACK(desc, hkdf->hmac_tfm); u8 prefix[9]; unsigned int i; int err; const u8 *prev = NULL; u8 counter = 1; u8 tmp[HKDF_HASHLEN]; if (WARN_ON_ONCE(okmlen > 255 * HKDF_HASHLEN)) return -EINVAL; desc->tfm = hkdf->hmac_tfm; memcpy(prefix, "fscrypt\0", 8); prefix[8] = context; for (i = 0; i < okmlen; i += HKDF_HASHLEN) { err = crypto_shash_init(desc); if (err) goto out; if (prev) { err = crypto_shash_update(desc, prev, HKDF_HASHLEN); if (err) goto out; } err = crypto_shash_update(desc, prefix, sizeof(prefix)); if (err) goto out; err = crypto_shash_update(desc, info, infolen); if (err) goto out; BUILD_BUG_ON(sizeof(counter) != 1); if (okmlen - i < HKDF_HASHLEN) { err = crypto_shash_finup(desc, &counter, 1, tmp); if (err) goto out; memcpy(&okm[i], tmp, okmlen - i); memzero_explicit(tmp, sizeof(tmp)); } else { err = crypto_shash_finup(desc, &counter, 1, &okm[i]); if (err) goto out; } counter++; prev = &okm[i]; } err = 0; out: if (unlikely(err)) memzero_explicit(okm, okmlen); /* so caller doesn't need to */ shash_desc_zero(desc); return err; } void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf) { crypto_free_shash(hkdf->hmac_tfm); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Stubs for the Network PHY library */ #include <linux/rtnetlink.h> struct kernel_hwtstamp_config; struct netlink_ext_ack; struct phy_device; #if IS_ENABLED(CONFIG_PHYLIB) extern const struct phylib_stubs *phylib_stubs; struct phylib_stubs { int (*hwtstamp_get)(struct phy_device *phydev, struct kernel_hwtstamp_config *config); int (*hwtstamp_set)(struct phy_device *phydev, struct kernel_hwtstamp_config *config, struct netlink_ext_ack *extack); }; static inline int phy_hwtstamp_get(struct phy_device *phydev, struct kernel_hwtstamp_config *config) { /* phylib_register_stubs() and phylib_unregister_stubs() * also run under rtnl_lock(). */ ASSERT_RTNL(); if (!phylib_stubs) return -EOPNOTSUPP; return phylib_stubs->hwtstamp_get(phydev, config); } static inline int phy_hwtstamp_set(struct phy_device *phydev, struct kernel_hwtstamp_config *config, struct netlink_ext_ack *extack) { /* phylib_register_stubs() and phylib_unregister_stubs() * also run under rtnl_lock(). */ ASSERT_RTNL(); if (!phylib_stubs) return -EOPNOTSUPP; return phylib_stubs->hwtstamp_set(phydev, config, extack); } #else static inline int phy_hwtstamp_get(struct phy_device *phydev, struct kernel_hwtstamp_config *config) { return -EOPNOTSUPP; } static inline int phy_hwtstamp_set(struct phy_device *phydev, struct kernel_hwtstamp_config *config, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } #endif
404 152 152 29 242 47 49 47 29 29 50 28 51 51 49 162 3 57 362 3 183 472 429 231 228 228 228 2 2 2 118 120 7 217 217 215 215 215 215 215 1 215 215 215 215 217 217 217 214 199 217 217 4 217 217 217 217 228 228 228 228 7 7 7 7 7 67 53 56 56 56 56 56 56 56 56 56 228 227 228 67 215 215 215 215 215 215 228 56 215 215 215 215 215 230 230 30 200 200 200 230 230 230 230 30 157 126 126 126 126 126 135 157 152 152 152 55 152 160 160 160 160 93 304 136 304 411 48 411 17 17 15 15 17 17 17 17 17 152 160 32 32 103 152 121 152 160 135 135 135 230 211 256 228 228 228 228 218 218 218 218 218 218 218 218 218 217 218 218 145 218 217 253 218 145 217 217 217 217 217 217 2 2 253 218 30 30 1 253 218 253 218 228 228 228 228 228 228 228 228 228 228 253 253 228 228 228 39 218 218 218 218 253 228 228 228 228 233 232 231 231 231 16 230 208 230 3 230 230 25 25 25 22 25 48 48 48 48 3 3 48 25 25 160 159 160 160 57 160 160 160 160 160 159 160 25 25 25 25 25 17 25 1 25 24 25 19 46 46 46 42 42 42 31 31 24 23 23 46 41 44 44 31 20 20 44 22 6 19 17 43 24 33 42 2 2 1 1 1 1 1 4 4 5 2 2 1 1 1 2 2 2 1 1 1 2 13 13 13 11 12 7 2 4 10 11 32 10 8 7 7 10 1 1 1 1 1 1 3 2 1 1 4 287 287 10 12 20 20 482 483 483 485 56 47 47 8 7 6 55 152 152 152 152 152 152 46 157 157 157 157 157 157 157 157 157 217 29 29 3 28 287 286 188 188 286 286 223 223 223 223 223 86 86 48 48 48 48 48 46 46 3 1 51 51 51 51 18 18 18 51 18 18 18 1 51 18 51 51 18 18 18 18 18 18 18 51 51 18 18 51 51 3 51 18 51 18 18 50 50 51 1 30 30 30 30 30 30 30 30 31 31 28 30 31 31 29 2 152 152 152 152 101 152 152 152 138 152 152 152 152 152 152 152 152 152 135 135 135 135 135 135 135 1 135 135 1 1 57 135 135 103 135 135 135 135 135 136 136 135 135 135 135 3 96 93 93 17 17 17 17 1184 44 73 73 73 44 44 43 73 12 74 69 3149 3149 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 /* * Generic process-grouping system. * * Based originally on the cpuset system, extracted by Paul Menage * Copyright (C) 2006 Google, Inc * * Notifications support * Copyright (C) 2009 Nokia Corporation * Author: Kirill A. Shutemov * * Copyright notices from the original cpuset code: * -------------------------------------------------- * Copyright (C) 2003 BULL SA. * Copyright (C) 2004-2006 Silicon Graphics, Inc. * * Portions derived from Patrick Mochel's sysfs code. * sysfs is Copyright (c) 2001-3 Patrick Mochel * * 2003-10-10 Written by Simon Derr. * 2003-10-22 Updates by Stephen Hemminger. * 2004 May-July Rework by Paul Jackson. * --------------------------------------------------- * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of the Linux * distribution for more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "cgroup-internal.h" #include <linux/bpf-cgroup.h> #include <linux/cred.h> #include <linux/errno.h> #include <linux/init_task.h> #include <linux/kernel.h> #include <linux/magic.h> #include <linux/mutex.h> #include <linux/mount.h> #include <linux/pagemap.h> #include <linux/proc_fs.h> #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/percpu-rwsem.h> #include <linux/string.h> #include <linux/hashtable.h> #include <linux/idr.h> #include <linux/kthread.h> #include <linux/atomic.h> #include <linux/cpuset.h> #include <linux/proc_ns.h> #include <linux/nsproxy.h> #include <linux/file.h> #include <linux/fs_parser.h> #include <linux/sched/cputime.h> #include <linux/sched/deadline.h> #include <linux/psi.h> #include <net/sock.h> #define CREATE_TRACE_POINTS #include <trace/events/cgroup.h> #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ MAX_CFTYPE_NAME + 2) /* let's not notify more than 100 times per second */ #define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100) /* * To avoid confusing the compiler (and generating warnings) with code * that attempts to access what would be a 0-element array (i.e. sized * to a potentially empty array when CGROUP_SUBSYS_COUNT == 0), this * constant expression can be added. */ #define CGROUP_HAS_SUBSYS_CONFIG (CGROUP_SUBSYS_COUNT > 0) /* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. * * css_set_lock protects task->cgroups pointer, the list of css_set * objects, and the chain of tasks off each css_set. * * These locks are exported if CONFIG_PROVE_RCU so that accessors in * cgroup.h can use them for lockdep annotations. */ DEFINE_MUTEX(cgroup_mutex); DEFINE_SPINLOCK(css_set_lock); #ifdef CONFIG_PROVE_RCU EXPORT_SYMBOL_GPL(cgroup_mutex); EXPORT_SYMBOL_GPL(css_set_lock); #endif DEFINE_SPINLOCK(trace_cgroup_path_lock); char trace_cgroup_path[TRACE_CGROUP_PATH_LEN]; static bool cgroup_debug __read_mostly; /* * Protects cgroup_idr and css_idr so that IDs can be released without * grabbing cgroup_mutex. */ static DEFINE_SPINLOCK(cgroup_idr_lock); /* * Protects cgroup_file->kn for !self csses. It synchronizes notifications * against file removal/re-creation across css hiding. */ static DEFINE_SPINLOCK(cgroup_file_kn_lock); DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem); #define cgroup_assert_mutex_or_rcu_locked() \ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ !lockdep_is_held(&cgroup_mutex), \ "cgroup_mutex or RCU read lock required"); /* * cgroup destruction makes heavy use of work items and there can be a lot * of concurrent destructions. Use a separate workqueue so that cgroup * destruction work items don't end up filling up max_active of system_wq * which may lead to deadlock. */ static struct workqueue_struct *cgroup_destroy_wq; /* generate an array of cgroup subsystem pointers */ #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, struct cgroup_subsys *cgroup_subsys[] = { #include <linux/cgroup_subsys.h> }; #undef SUBSYS /* array of cgroup subsystem names */ #define SUBSYS(_x) [_x ## _cgrp_id] = #_x, static const char *cgroup_subsys_name[] = { #include <linux/cgroup_subsys.h> }; #undef SUBSYS /* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */ #define SUBSYS(_x) \ DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key); \ DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key); \ EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key); \ EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key); #include <linux/cgroup_subsys.h> #undef SUBSYS #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key, static struct static_key_true *cgroup_subsys_enabled_key[] = { #include <linux/cgroup_subsys.h> }; #undef SUBSYS #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key, static struct static_key_true *cgroup_subsys_on_dfl_key[] = { #include <linux/cgroup_subsys.h> }; #undef SUBSYS static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu); /* the default hierarchy */ struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu }; EXPORT_SYMBOL_GPL(cgrp_dfl_root); /* * The default hierarchy always exists but is hidden until mounted for the * first time. This is for backward compatibility. */ static bool cgrp_dfl_visible; /* some controllers are not supported in the default hierarchy */ static u16 cgrp_dfl_inhibit_ss_mask; /* some controllers are implicitly enabled on the default hierarchy */ static u16 cgrp_dfl_implicit_ss_mask; /* some controllers can be threaded on the default hierarchy */ static u16 cgrp_dfl_threaded_ss_mask; /* The list of hierarchy roots */ LIST_HEAD(cgroup_roots); static int cgroup_root_count; /* hierarchy ID allocation and mapping, protected by cgroup_mutex */ static DEFINE_IDR(cgroup_hierarchy_idr); /* * Assign a monotonically increasing serial number to csses. It guarantees * cgroups with bigger numbers are newer than those with smaller numbers. * Also, as csses are always appended to the parent's ->children list, it * guarantees that sibling csses are always sorted in the ascending serial * number order on the list. Protected by cgroup_mutex. */ static u64 css_serial_nr_next = 1; /* * These bitmasks identify subsystems with specific features to avoid * having to do iterative checks repeatedly. */ static u16 have_fork_callback __read_mostly; static u16 have_exit_callback __read_mostly; static u16 have_release_callback __read_mostly; static u16 have_canfork_callback __read_mostly; /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { .ns.count = REFCOUNT_INIT(2), .user_ns = &init_user_ns, .ns.ops = &cgroupns_operations, .ns.inum = PROC_CGROUP_INIT_INO, .root_cset = &init_css_set, }; static struct file_system_type cgroup2_fs_type; static struct cftype cgroup_base_files[]; static struct cftype cgroup_psi_files[]; /* cgroup optional features */ enum cgroup_opt_features { #ifdef CONFIG_PSI OPT_FEATURE_PRESSURE, #endif OPT_FEATURE_COUNT }; static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = { #ifdef CONFIG_PSI "pressure", #endif }; static u16 cgroup_feature_disable_mask __read_mostly; static int cgroup_apply_control(struct cgroup *cgrp); static void cgroup_finalize_control(struct cgroup *cgrp, int ret); static void css_task_iter_skip(struct css_task_iter *it, struct task_struct *task); static int cgroup_destroy_locked(struct cgroup *cgrp); static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, struct cgroup_subsys *ss); static void css_release(struct percpu_ref *ref); static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype cfts[], bool is_add); #ifdef CONFIG_DEBUG_CGROUP_REF #define CGROUP_REF_FN_ATTRS noinline #define CGROUP_REF_EXPORT(fn) EXPORT_SYMBOL_GPL(fn); #include <linux/cgroup_refcnt.h> #endif /** * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID * @ssid: subsys ID of interest * * cgroup_subsys_enabled() can only be used with literal subsys names which * is fine for individual subsystems but unsuitable for cgroup core. This * is slower static_key_enabled() based test indexed by @ssid. */ bool cgroup_ssid_enabled(int ssid) { if (!CGROUP_HAS_SUBSYS_CONFIG) return false; return static_key_enabled(cgroup_subsys_enabled_key[ssid]); } /** * cgroup_on_dfl - test whether a cgroup is on the default hierarchy * @cgrp: the cgroup of interest * * The default hierarchy is the v2 interface of cgroup and this function * can be used to test whether a cgroup is on the default hierarchy for * cases where a subsystem should behave differently depending on the * interface version. * * List of changed behaviors: * * - Mount options "noprefix", "xattr", "clone_children", "release_agent" * and "name" are disallowed. * * - When mounting an existing superblock, mount options should match. * * - rename(2) is disallowed. * * - "tasks" is removed. Everything should be at process granularity. Use * "cgroup.procs" instead. * * - "cgroup.procs" is not sorted. pids will be unique unless they got * recycled in-between reads. * * - "release_agent" and "notify_on_release" are removed. Replacement * notification mechanism will be implemented. * * - "cgroup.clone_children" is removed. * * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup * and its descendants contain no task; otherwise, 1. The file also * generates kernfs notification which can be monitored through poll and * [di]notify when the value of the file changes. * * - cpuset: tasks will be kept in empty cpusets when hotplug happens and * take masks of ancestors with non-empty cpus/mems, instead of being * moved to an ancestor. * * - cpuset: a task can be moved into an empty cpuset, and again it takes * masks of ancestors. * * - blkcg: blk-throttle becomes properly hierarchical. */ bool cgroup_on_dfl(const struct cgroup *cgrp) { return cgrp->root == &cgrp_dfl_root; } /* IDR wrappers which synchronize using cgroup_idr_lock */ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp_mask) { int ret; idr_preload(gfp_mask); spin_lock_bh(&cgroup_idr_lock); ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM); spin_unlock_bh(&cgroup_idr_lock); idr_preload_end(); return ret; } static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id) { void *ret; spin_lock_bh(&cgroup_idr_lock); ret = idr_replace(idr, ptr, id); spin_unlock_bh(&cgroup_idr_lock); return ret; } static void cgroup_idr_remove(struct idr *idr, int id) { spin_lock_bh(&cgroup_idr_lock); idr_remove(idr, id); spin_unlock_bh(&cgroup_idr_lock); } static bool cgroup_has_tasks(struct cgroup *cgrp) { return cgrp->nr_populated_csets; } static bool cgroup_is_threaded(struct cgroup *cgrp) { return cgrp->dom_cgrp != cgrp; } /* can @cgrp host both domain and threaded children? */ static bool cgroup_is_mixable(struct cgroup *cgrp) { /* * Root isn't under domain level resource control exempting it from * the no-internal-process constraint, so it can serve as a thread * root and a parent of resource domains at the same time. */ return !cgroup_parent(cgrp); } /* can @cgrp become a thread root? Should always be true for a thread root */ static bool cgroup_can_be_thread_root(struct cgroup *cgrp) { /* mixables don't care */ if (cgroup_is_mixable(cgrp)) return true; /* domain roots can't be nested under threaded */ if (cgroup_is_threaded(cgrp)) return false; /* can only have either domain or threaded children */ if (cgrp->nr_populated_domain_children) return false; /* and no domain controllers can be enabled */ if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask) return false; return true; } /* is @cgrp root of a threaded subtree? */ static bool cgroup_is_thread_root(struct cgroup *cgrp) { /* thread root should be a domain */ if (cgroup_is_threaded(cgrp)) return false; /* a domain w/ threaded children is a thread root */ if (cgrp->nr_threaded_children) return true; /* * A domain which has tasks and explicit threaded controllers * enabled is a thread root. */ if (cgroup_has_tasks(cgrp) && (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask)) return true; return false; } /* a domain which isn't connected to the root w/o brekage can't be used */ static bool cgroup_is_valid_domain(struct cgroup *cgrp) { /* the cgroup itself can be a thread root */ if (cgroup_is_threaded(cgrp)) return false; /* but the ancestors can't be unless mixable */ while ((cgrp = cgroup_parent(cgrp))) { if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp)) return false; if (cgroup_is_threaded(cgrp)) return false; } return true; } /* subsystems visibly enabled on a cgroup */ static u16 cgroup_control(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); u16 root_ss_mask = cgrp->root->subsys_mask; if (parent) { u16 ss_mask = parent->subtree_control; /* threaded cgroups can only have threaded controllers */ if (cgroup_is_threaded(cgrp)) ss_mask &= cgrp_dfl_threaded_ss_mask; return ss_mask; } if (cgroup_on_dfl(cgrp)) root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask | cgrp_dfl_implicit_ss_mask); return root_ss_mask; } /* subsystems enabled on a cgroup */ static u16 cgroup_ss_mask(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); if (parent) { u16 ss_mask = parent->subtree_ss_mask; /* threaded cgroups can only have threaded controllers */ if (cgroup_is_threaded(cgrp)) ss_mask &= cgrp_dfl_threaded_ss_mask; return ss_mask; } return cgrp->root->subsys_mask; } /** * cgroup_css - obtain a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest (%NULL returns @cgrp->self) * * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This * function must be called either under cgroup_mutex or rcu_read_lock() and * the caller is responsible for pinning the returned css if it wants to * keep accessing it outside the said locks. This function may return * %NULL if @cgrp doesn't have @subsys_id enabled. */ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { if (CGROUP_HAS_SUBSYS_CONFIG && ss) return rcu_dereference_check(cgrp->subsys[ss->id], lockdep_is_held(&cgroup_mutex)); else return &cgrp->self; } /** * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss * @cgrp: the cgroup of interest * @ss: the subsystem of interest (%NULL returns @cgrp->self) * * Similar to cgroup_css() but returns the effective css, which is defined * as the matching css of the nearest ancestor including self which has @ss * enabled. If @ss is associated with the hierarchy @cgrp is on, this * function is guaranteed to return non-NULL css. */ static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp, struct cgroup_subsys *ss) { lockdep_assert_held(&cgroup_mutex); if (!ss) return &cgrp->self; /* * This function is used while updating css associations and thus * can't test the csses directly. Test ss_mask. */ while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) { cgrp = cgroup_parent(cgrp); if (!cgrp) return NULL; } return cgroup_css(cgrp, ss); } /** * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest * * Find and get the effective css of @cgrp for @ss. The effective css is * defined as the matching css of the nearest ancestor including self which * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on, * the root css is returned, so this function always returns a valid css. * * The returned css is not guaranteed to be online, and therefore it is the * callers responsibility to try get a reference for it. */ struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; if (!CGROUP_HAS_SUBSYS_CONFIG) return NULL; do { css = cgroup_css(cgrp, ss); if (css) return css; cgrp = cgroup_parent(cgrp); } while (cgrp); return init_css_set.subsys[ss->id]; } /** * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest * * Find and get the effective css of @cgrp for @ss. The effective css is * defined as the matching css of the nearest ancestor including self which * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on, * the root css is returned, so this function always returns a valid css. * The returned css must be put using css_put(). */ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; if (!CGROUP_HAS_SUBSYS_CONFIG) return NULL; rcu_read_lock(); do { css = cgroup_css(cgrp, ss); if (css && css_tryget_online(css)) goto out_unlock; cgrp = cgroup_parent(cgrp); } while (cgrp); css = init_css_set.subsys[ss->id]; css_get(css); out_unlock: rcu_read_unlock(); return css; } EXPORT_SYMBOL_GPL(cgroup_get_e_css); static void cgroup_get_live(struct cgroup *cgrp) { WARN_ON_ONCE(cgroup_is_dead(cgrp)); cgroup_get(cgrp); } /** * __cgroup_task_count - count the number of tasks in a cgroup. The caller * is responsible for taking the css_set_lock. * @cgrp: the cgroup in question */ int __cgroup_task_count(const struct cgroup *cgrp) { int count = 0; struct cgrp_cset_link *link; lockdep_assert_held(&css_set_lock); list_for_each_entry(link, &cgrp->cset_links, cset_link) count += link->cset->nr_tasks; return count; } /** * cgroup_task_count - count the number of tasks in a cgroup. * @cgrp: the cgroup in question */ int cgroup_task_count(const struct cgroup *cgrp) { int count; spin_lock_irq(&css_set_lock); count = __cgroup_task_count(cgrp); spin_unlock_irq(&css_set_lock); return count; } struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { struct cgroup *cgrp = of->kn->parent->priv; struct cftype *cft = of_cft(of); /* * This is open and unprotected implementation of cgroup_css(). * seq_css() is only called from a kernfs file operation which has * an active reference on the file. Because all the subsystem * files are drained before a css is disassociated with a cgroup, * the matching css from the cgroup's subsys table is guaranteed to * be and stay valid until the enclosing operation is complete. */ if (CGROUP_HAS_SUBSYS_CONFIG && cft->ss) return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); else return &cgrp->self; } EXPORT_SYMBOL_GPL(of_css); /** * for_each_css - iterate all css's of a cgroup * @css: the iteration cursor * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end * @cgrp: the target cgroup to iterate css's of * * Should be called under cgroup_mutex. */ #define for_each_css(css, ssid, cgrp) \ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ if (!((css) = rcu_dereference_check( \ (cgrp)->subsys[(ssid)], \ lockdep_is_held(&cgroup_mutex)))) { } \ else /** * do_each_subsys_mask - filter for_each_subsys with a bitmask * @ss: the iteration cursor * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end * @ss_mask: the bitmask * * The block will only run for cases where the ssid-th bit (1 << ssid) of * @ss_mask is set. */ #define do_each_subsys_mask(ss, ssid, ss_mask) do { \ unsigned long __ss_mask = (ss_mask); \ if (!CGROUP_HAS_SUBSYS_CONFIG) { \ (ssid) = 0; \ break; \ } \ for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) { \ (ss) = cgroup_subsys[ssid]; \ { #define while_each_subsys_mask() \ } \ } \ } while (false) /* iterate over child cgrps, lock should be held throughout iteration */ #define cgroup_for_each_live_child(child, cgrp) \ list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ if (({ lockdep_assert_held(&cgroup_mutex); \ cgroup_is_dead(child); })) \ ; \ else /* walk live descendants in pre order */ #define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \ css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \ if (({ lockdep_assert_held(&cgroup_mutex); \ (dsct) = (d_css)->cgroup; \ cgroup_is_dead(dsct); })) \ ; \ else /* walk live descendants in postorder */ #define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \ css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \ if (({ lockdep_assert_held(&cgroup_mutex); \ (dsct) = (d_css)->cgroup; \ cgroup_is_dead(dsct); })) \ ; \ else /* * The default css_set - used by init and its children prior to any * hierarchies being mounted. It contains a pointer to the root state * for each subsystem. Also used to anchor the list of css_sets. Not * reference-counted, to improve performance when child cgroups * haven't been created. */ struct css_set init_css_set = { .refcount = REFCOUNT_INIT(1), .dom_cset = &init_css_set, .tasks = LIST_HEAD_INIT(init_css_set.tasks), .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks), .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets), .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), .mg_src_preload_node = LIST_HEAD_INIT(init_css_set.mg_src_preload_node), .mg_dst_preload_node = LIST_HEAD_INIT(init_css_set.mg_dst_preload_node), .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), /* * The following field is re-initialized when this cset gets linked * in cgroup_init(). However, let's initialize the field * statically too so that the default cgroup can be accessed safely * early during boot. */ .dfl_cgrp = &cgrp_dfl_root.cgrp, }; static int css_set_count = 1; /* 1 for init_css_set */ static bool css_set_threaded(struct css_set *cset) { return cset->dom_cset != cset; } /** * css_set_populated - does a css_set contain any tasks? * @cset: target css_set * * css_set_populated() should be the same as !!cset->nr_tasks at steady * state. However, css_set_populated() can be called while a task is being * added to or removed from the linked list before the nr_tasks is * properly updated. Hence, we can't just look at ->nr_tasks here. */ static bool css_set_populated(struct css_set *cset) { lockdep_assert_held(&css_set_lock); return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks); } /** * cgroup_update_populated - update the populated count of a cgroup * @cgrp: the target cgroup * @populated: inc or dec populated count * * One of the css_sets associated with @cgrp is either getting its first * task or losing the last. Update @cgrp->nr_populated_* accordingly. The * count is propagated towards root so that a given cgroup's * nr_populated_children is zero iff none of its descendants contain any * tasks. * * @cgrp's interface file "cgroup.populated" is zero if both * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and * 1 otherwise. When the sum changes from or to zero, userland is notified * that the content of the interface file has changed. This can be used to * detect when @cgrp and its descendants become populated or empty. */ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) { struct cgroup *child = NULL; int adj = populated ? 1 : -1; lockdep_assert_held(&css_set_lock); do { bool was_populated = cgroup_is_populated(cgrp); if (!child) { cgrp->nr_populated_csets += adj; } else { if (cgroup_is_threaded(child)) cgrp->nr_populated_threaded_children += adj; else cgrp->nr_populated_domain_children += adj; } if (was_populated == cgroup_is_populated(cgrp)) break; cgroup1_check_for_release(cgrp); TRACE_CGROUP_PATH(notify_populated, cgrp, cgroup_is_populated(cgrp)); cgroup_file_notify(&cgrp->events_file); child = cgrp; cgrp = cgroup_parent(cgrp); } while (cgrp); } /** * css_set_update_populated - update populated state of a css_set * @cset: target css_set * @populated: whether @cset is populated or depopulated * * @cset is either getting the first task or losing the last. Update the * populated counters of all associated cgroups accordingly. */ static void css_set_update_populated(struct css_set *cset, bool populated) { struct cgrp_cset_link *link; lockdep_assert_held(&css_set_lock); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) cgroup_update_populated(link->cgrp, populated); } /* * @task is leaving, advance task iterators which are pointing to it so * that they can resume at the next position. Advancing an iterator might * remove it from the list, use safe walk. See css_task_iter_skip() for * details. */ static void css_set_skip_task_iters(struct css_set *cset, struct task_struct *task) { struct css_task_iter *it, *pos; list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node) css_task_iter_skip(it, task); } /** * css_set_move_task - move a task from one css_set to another * @task: task being moved * @from_cset: css_set @task currently belongs to (may be NULL) * @to_cset: new css_set @task is being moved to (may be NULL) * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks * * Move @task from @from_cset to @to_cset. If @task didn't belong to any * css_set, @from_cset can be NULL. If @task is being disassociated * instead of moved, @to_cset can be NULL. * * This function automatically handles populated counter updates and * css_task_iter adjustments but the caller is responsible for managing * @from_cset and @to_cset's reference counts. */ static void css_set_move_task(struct task_struct *task, struct css_set *from_cset, struct css_set *to_cset, bool use_mg_tasks) { lockdep_assert_held(&css_set_lock); if (to_cset && !css_set_populated(to_cset)) css_set_update_populated(to_cset, true); if (from_cset) { WARN_ON_ONCE(list_empty(&task->cg_list)); css_set_skip_task_iters(from_cset, task); list_del_init(&task->cg_list); if (!css_set_populated(from_cset)) css_set_update_populated(from_cset, false); } else { WARN_ON_ONCE(!list_empty(&task->cg_list)); } if (to_cset) { /* * We are synchronized through cgroup_threadgroup_rwsem * against PF_EXITING setting such that we can't race * against cgroup_exit()/cgroup_free() dropping the css_set. */ WARN_ON_ONCE(task->flags & PF_EXITING); cgroup_move_task(task, to_cset); list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks : &to_cset->tasks); } } /* * hash table for cgroup groups. This improves the performance to find * an existing css_set. This hash doesn't (currently) take into * account cgroups in empty hierarchies. */ #define CSS_SET_HASH_BITS 7 static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS); static unsigned long css_set_hash(struct cgroup_subsys_state **css) { unsigned long key = 0UL; struct cgroup_subsys *ss; int i; for_each_subsys(ss, i) key += (unsigned long)css[i]; key = (key >> 16) ^ key; return key; } void put_css_set_locked(struct css_set *cset) { struct cgrp_cset_link *link, *tmp_link; struct cgroup_subsys *ss; int ssid; lockdep_assert_held(&css_set_lock); if (!refcount_dec_and_test(&cset->refcount)) return; WARN_ON_ONCE(!list_empty(&cset->threaded_csets)); /* This css_set is dead. Unlink it and release cgroup and css refs */ for_each_subsys(ss, ssid) { list_del(&cset->e_cset_node[ssid]); css_put(cset->subsys[ssid]); } hash_del(&cset->hlist); css_set_count--; list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) { list_del(&link->cset_link); list_del(&link->cgrp_link); if (cgroup_parent(link->cgrp)) cgroup_put(link->cgrp); kfree(link); } if (css_set_threaded(cset)) { list_del(&cset->threaded_csets_node); put_css_set_locked(cset->dom_cset); } kfree_rcu(cset, rcu_head); } /** * compare_css_sets - helper function for find_existing_css_set(). * @cset: candidate css_set being tested * @old_cset: existing css_set for a task * @new_cgrp: cgroup that's being entered by the task * @template: desired set of css pointers in css_set (pre-calculated) * * Returns true if "cset" matches "old_cset" except for the hierarchy * which "new_cgrp" belongs to, for which it should match "new_cgrp". */ static bool compare_css_sets(struct css_set *cset, struct css_set *old_cset, struct cgroup *new_cgrp, struct cgroup_subsys_state *template[]) { struct cgroup *new_dfl_cgrp; struct list_head *l1, *l2; /* * On the default hierarchy, there can be csets which are * associated with the same set of cgroups but different csses. * Let's first ensure that csses match. */ if (memcmp(template, cset->subsys, sizeof(cset->subsys))) return false; /* @cset's domain should match the default cgroup's */ if (cgroup_on_dfl(new_cgrp)) new_dfl_cgrp = new_cgrp; else new_dfl_cgrp = old_cset->dfl_cgrp; if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp) return false; /* * Compare cgroup pointers in order to distinguish between * different cgroups in hierarchies. As different cgroups may * share the same effective css, this comparison is always * necessary. */ l1 = &cset->cgrp_links; l2 = &old_cset->cgrp_links; while (1) { struct cgrp_cset_link *link1, *link2; struct cgroup *cgrp1, *cgrp2; l1 = l1->next; l2 = l2->next; /* See if we reached the end - both lists are equal length. */ if (l1 == &cset->cgrp_links) { BUG_ON(l2 != &old_cset->cgrp_links); break; } else { BUG_ON(l2 == &old_cset->cgrp_links); } /* Locate the cgroups associated with these links. */ link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link); link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link); cgrp1 = link1->cgrp; cgrp2 = link2->cgrp; /* Hierarchies should be linked in the same order. */ BUG_ON(cgrp1->root != cgrp2->root); /* * If this hierarchy is the hierarchy of the cgroup * that's changing, then we need to check that this * css_set points to the new cgroup; if it's any other * hierarchy, then this css_set should point to the * same cgroup as the old css_set. */ if (cgrp1->root == new_cgrp->root) { if (cgrp1 != new_cgrp) return false; } else { if (cgrp1 != cgrp2) return false; } } return true; } /** * find_existing_css_set - init css array and find the matching css_set * @old_cset: the css_set that we're using before the cgroup transition * @cgrp: the cgroup that we're moving into * @template: out param for the new set of csses, should be clear on entry */ static struct css_set *find_existing_css_set(struct css_set *old_cset, struct cgroup *cgrp, struct cgroup_subsys_state **template) { struct cgroup_root *root = cgrp->root; struct cgroup_subsys *ss; struct css_set *cset; unsigned long key; int i; /* * Build the set of subsystem state objects that we want to see in the * new css_set. While subsystems can change globally, the entries here * won't change, so no need for locking. */ for_each_subsys(ss, i) { if (root->subsys_mask & (1UL << i)) { /* * @ss is in this hierarchy, so we want the * effective css from @cgrp. */ template[i] = cgroup_e_css_by_mask(cgrp, ss); } else { /* * @ss is not in this hierarchy, so we don't want * to change the css. */ template[i] = old_cset->subsys[i]; } } key = css_set_hash(template); hash_for_each_possible(css_set_table, cset, hlist, key) { if (!compare_css_sets(cset, old_cset, cgrp, template)) continue; /* This css_set matches what we need */ return cset; } /* No existing cgroup group matched */ return NULL; } static void free_cgrp_cset_links(struct list_head *links_to_free) { struct cgrp_cset_link *link, *tmp_link; list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) { list_del(&link->cset_link); kfree(link); } } /** * allocate_cgrp_cset_links - allocate cgrp_cset_links * @count: the number of links to allocate * @tmp_links: list_head the allocated links are put on * * Allocate @count cgrp_cset_link structures and chain them on @tmp_links * through ->cset_link. Returns 0 on success or -errno. */ static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links) { struct cgrp_cset_link *link; int i; INIT_LIST_HEAD(tmp_links); for (i = 0; i < count; i++) { link = kzalloc(sizeof(*link), GFP_KERNEL); if (!link) { free_cgrp_cset_links(tmp_links); return -ENOMEM; } list_add(&link->cset_link, tmp_links); } return 0; } /** * link_css_set - a helper function to link a css_set to a cgroup * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links() * @cset: the css_set to be linked * @cgrp: the destination cgroup */ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, struct cgroup *cgrp) { struct cgrp_cset_link *link; BUG_ON(list_empty(tmp_links)); if (cgroup_on_dfl(cgrp)) cset->dfl_cgrp = cgrp; link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); link->cset = cset; link->cgrp = cgrp; /* * Always add links to the tail of the lists so that the lists are * in chronological order. */ list_move_tail(&link->cset_link, &cgrp->cset_links); list_add_tail(&link->cgrp_link, &cset->cgrp_links); if (cgroup_parent(cgrp)) cgroup_get_live(cgrp); } /** * find_css_set - return a new css_set with one cgroup updated * @old_cset: the baseline css_set * @cgrp: the cgroup to be updated * * Return a new css_set that's equivalent to @old_cset, but with @cgrp * substituted into the appropriate hierarchy. */ static struct css_set *find_css_set(struct css_set *old_cset, struct cgroup *cgrp) { struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { }; struct css_set *cset; struct list_head tmp_links; struct cgrp_cset_link *link; struct cgroup_subsys *ss; unsigned long key; int ssid; lockdep_assert_held(&cgroup_mutex); /* First see if we already have a cgroup group that matches * the desired set */ spin_lock_irq(&css_set_lock); cset = find_existing_css_set(old_cset, cgrp, template); if (cset) get_css_set(cset); spin_unlock_irq(&css_set_lock); if (cset) return cset; cset = kzalloc(sizeof(*cset), GFP_KERNEL); if (!cset) return NULL; /* Allocate all the cgrp_cset_link objects that we'll need */ if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) { kfree(cset); return NULL; } refcount_set(&cset->refcount, 1); cset->dom_cset = cset; INIT_LIST_HEAD(&cset->tasks); INIT_LIST_HEAD(&cset->mg_tasks); INIT_LIST_HEAD(&cset->dying_tasks); INIT_LIST_HEAD(&cset->task_iters); INIT_LIST_HEAD(&cset->threaded_csets); INIT_HLIST_NODE(&cset->hlist); INIT_LIST_HEAD(&cset->cgrp_links); INIT_LIST_HEAD(&cset->mg_src_preload_node); INIT_LIST_HEAD(&cset->mg_dst_preload_node); INIT_LIST_HEAD(&cset->mg_node); /* Copy the set of subsystem state objects generated in * find_existing_css_set() */ memcpy(cset->subsys, template, sizeof(cset->subsys)); spin_lock_irq(&css_set_lock); /* Add reference counts and links from the new css_set. */ list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; if (c->root == cgrp->root) c = cgrp; link_css_set(&tmp_links, cset, c); } BUG_ON(!list_empty(&tmp_links)); css_set_count++; /* Add @cset to the hash table */ key = css_set_hash(cset->subsys); hash_add(css_set_table, &cset->hlist, key); for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cset->subsys[ssid]; list_add_tail(&cset->e_cset_node[ssid], &css->cgroup->e_csets[ssid]); css_get(css); } spin_unlock_irq(&css_set_lock); /* * If @cset should be threaded, look up the matching dom_cset and * link them up. We first fully initialize @cset then look for the * dom_cset. It's simpler this way and safe as @cset is guaranteed * to stay empty until we return. */ if (cgroup_is_threaded(cset->dfl_cgrp)) { struct css_set *dcset; dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp); if (!dcset) { put_css_set(cset); return NULL; } spin_lock_irq(&css_set_lock); cset->dom_cset = dcset; list_add_tail(&cset->threaded_csets_node, &dcset->threaded_csets); spin_unlock_irq(&css_set_lock); } return cset; } struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) { struct cgroup *root_cgrp = kernfs_root_to_node(kf_root)->priv; return root_cgrp->root; } void cgroup_favor_dynmods(struct cgroup_root *root, bool favor) { bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS; /* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */ if (favor && !favoring) { rcu_sync_enter(&cgroup_threadgroup_rwsem.rss); root->flags |= CGRP_ROOT_FAVOR_DYNMODS; } else if (!favor && favoring) { rcu_sync_exit(&cgroup_threadgroup_rwsem.rss); root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS; } } static int cgroup_init_root_id(struct cgroup_root *root) { int id; lockdep_assert_held(&cgroup_mutex); id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL); if (id < 0) return id; root->hierarchy_id = id; return 0; } static void cgroup_exit_root_id(struct cgroup_root *root) { lockdep_assert_held(&cgroup_mutex); idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); } void cgroup_free_root(struct cgroup_root *root) { kfree(root); } static void cgroup_destroy_root(struct cgroup_root *root) { struct cgroup *cgrp = &root->cgrp; struct cgrp_cset_link *link, *tmp_link; trace_cgroup_destroy_root(root); cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); BUG_ON(atomic_read(&root->nr_cgrps)); BUG_ON(!list_empty(&cgrp->self.children)); /* Rebind all subsystems back to the default hierarchy */ WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask)); /* * Release all the links from cset_links to this hierarchy's * root cgroup */ spin_lock_irq(&css_set_lock); list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { list_del(&link->cset_link); list_del(&link->cgrp_link); kfree(link); } spin_unlock_irq(&css_set_lock); if (!list_empty(&root->root_list)) { list_del(&root->root_list); cgroup_root_count--; } cgroup_favor_dynmods(root, false); cgroup_exit_root_id(root); cgroup_unlock(); cgroup_rstat_exit(cgrp); kernfs_destroy_root(root->kf_root); cgroup_free_root(root); } /* * Returned cgroup is without refcount but it's valid as long as cset pins it. */ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root) { struct cgroup *res_cgroup = NULL; if (cset == &init_css_set) { res_cgroup = &root->cgrp; } else if (root == &cgrp_dfl_root) { res_cgroup = cset->dfl_cgrp; } else { struct cgrp_cset_link *link; lockdep_assert_held(&css_set_lock); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; if (c->root == root) { res_cgroup = c; break; } } } BUG_ON(!res_cgroup); return res_cgroup; } /* * look up cgroup associated with current task's cgroup namespace on the * specified hierarchy */ static struct cgroup * current_cgns_cgroup_from_root(struct cgroup_root *root) { struct cgroup *res = NULL; struct css_set *cset; lockdep_assert_held(&css_set_lock); rcu_read_lock(); cset = current->nsproxy->cgroup_ns->root_cset; res = __cset_cgroup_from_root(cset, root); rcu_read_unlock(); return res; } /* * Look up cgroup associated with current task's cgroup namespace on the default * hierarchy. * * Unlike current_cgns_cgroup_from_root(), this doesn't need locks: * - Internal rcu_read_lock is unnecessary because we don't dereference any rcu * pointers. * - css_set_lock is not needed because we just read cset->dfl_cgrp. * - As a bonus returned cgrp is pinned with the current because it cannot * switch cgroup_ns asynchronously. */ static struct cgroup *current_cgns_cgroup_dfl(void) { struct css_set *cset; if (current->nsproxy) { cset = current->nsproxy->cgroup_ns->root_cset; return __cset_cgroup_from_root(cset, &cgrp_dfl_root); } else { /* * NOTE: This function may be called from bpf_cgroup_from_id() * on a task which has already passed exit_task_namespaces() and * nsproxy == NULL. Fall back to cgrp_dfl_root which will make all * cgroups visible for lookups. */ return &cgrp_dfl_root.cgrp; } } /* look up cgroup associated with given css_set on the specified hierarchy */ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root) { lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&css_set_lock); return __cset_cgroup_from_root(cset, root); } /* * Return the cgroup for "task" from the given hierarchy. Must be * called with cgroup_mutex and css_set_lock held. */ struct cgroup *task_cgroup_from_root(struct task_struct *task, struct cgroup_root *root) { /* * No need to lock the task - since we hold css_set_lock the * task can't change groups. */ return cset_cgroup_from_root(task_css_set(task), root); } /* * A task must hold cgroup_mutex to modify cgroups. * * Any task can increment and decrement the count field without lock. * So in general, code holding cgroup_mutex can't rely on the count * field not changing. However, if the count goes to zero, then only * cgroup_attach_task() can increment it again. Because a count of zero * means that no tasks are currently attached, therefore there is no * way a task attached to that cgroup can fork (the other way to * increment the count). So code holding cgroup_mutex can safely * assume that if the count is zero, it will stay zero. Similarly, if * a task holds cgroup_mutex on a cgroup with zero count, it * knows that the cgroup won't be removed, as cgroup_rmdir() * needs that mutex. * * A cgroup can only be deleted if both its 'count' of using tasks * is zero, and its list of 'children' cgroups is empty. Since all * tasks in the system use _some_ cgroup, and since there is always at * least one task in the system (init, pid == 1), therefore, root cgroup * always has either children cgroups and/or using tasks. So we don't * need a special hack to ensure that root cgroup cannot be deleted. * * P.S. One more locking exception. RCU is used to guard the * update of a tasks cgroup pointer by cgroup_attach_task() */ static struct kernfs_syscall_ops cgroup_kf_syscall_ops; static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, char *buf) { struct cgroup_subsys *ss = cft->ss; if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { const char *dbg = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : ""; snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s", dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, cft->name); } else { strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX); } return buf; } /** * cgroup_file_mode - deduce file mode of a control file * @cft: the control file in question * * S_IRUGO for read, S_IWUSR for write. */ static umode_t cgroup_file_mode(const struct cftype *cft) { umode_t mode = 0; if (cft->read_u64 || cft->read_s64 || cft->seq_show) mode |= S_IRUGO; if (cft->write_u64 || cft->write_s64 || cft->write) { if (cft->flags & CFTYPE_WORLD_WRITABLE) mode |= S_IWUGO; else mode |= S_IWUSR; } return mode; } /** * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask * @subtree_control: the new subtree_control mask to consider * @this_ss_mask: available subsystems * * On the default hierarchy, a subsystem may request other subsystems to be * enabled together through its ->depends_on mask. In such cases, more * subsystems than specified in "cgroup.subtree_control" may be enabled. * * This function calculates which subsystems need to be enabled if * @subtree_control is to be applied while restricted to @this_ss_mask. */ static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask) { u16 cur_ss_mask = subtree_control; struct cgroup_subsys *ss; int ssid; lockdep_assert_held(&cgroup_mutex); cur_ss_mask |= cgrp_dfl_implicit_ss_mask; while (true) { u16 new_ss_mask = cur_ss_mask; do_each_subsys_mask(ss, ssid, cur_ss_mask) { new_ss_mask |= ss->depends_on; } while_each_subsys_mask(); /* * Mask out subsystems which aren't available. This can * happen only if some depended-upon subsystems were bound * to non-default hierarchies. */ new_ss_mask &= this_ss_mask; if (new_ss_mask == cur_ss_mask) break; cur_ss_mask = new_ss_mask; } return cur_ss_mask; } /** * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods * @kn: the kernfs_node being serviced * * This helper undoes cgroup_kn_lock_live() and should be invoked before * the method finishes if locking succeeded. Note that once this function * returns the cgroup returned by cgroup_kn_lock_live() may become * inaccessible any time. If the caller intends to continue to access the * cgroup, it should pin it before invoking this function. */ void cgroup_kn_unlock(struct kernfs_node *kn) { struct cgroup *cgrp; if (kernfs_type(kn) == KERNFS_DIR) cgrp = kn->priv; else cgrp = kn->parent->priv; cgroup_unlock(); kernfs_unbreak_active_protection(kn); cgroup_put(cgrp); } /** * cgroup_kn_lock_live - locking helper for cgroup kernfs methods * @kn: the kernfs_node being serviced * @drain_offline: perform offline draining on the cgroup * * This helper is to be used by a cgroup kernfs method currently servicing * @kn. It breaks the active protection, performs cgroup locking and * verifies that the associated cgroup is alive. Returns the cgroup if * alive; otherwise, %NULL. A successful return should be undone by a * matching cgroup_kn_unlock() invocation. If @drain_offline is %true, the * cgroup is drained of offlining csses before return. * * Any cgroup kernfs method implementation which requires locking the * associated cgroup should use this helper. It avoids nesting cgroup * locking under kernfs active protection and allows all kernfs operations * including self-removal. */ struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline) { struct cgroup *cgrp; if (kernfs_type(kn) == KERNFS_DIR) cgrp = kn->priv; else cgrp = kn->parent->priv; /* * We're gonna grab cgroup_mutex which nests outside kernfs * active_ref. cgroup liveliness check alone provides enough * protection against removal. Ensure @cgrp stays accessible and * break the active_ref protection. */ if (!cgroup_tryget(cgrp)) return NULL; kernfs_break_active_protection(kn); if (drain_offline) cgroup_lock_and_drain_offline(cgrp); else cgroup_lock(); if (!cgroup_is_dead(cgrp)) return cgrp; cgroup_kn_unlock(kn); return NULL; } static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) { char name[CGROUP_FILE_NAME_MAX]; lockdep_assert_held(&cgroup_mutex); if (cft->file_offset) { struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss); struct cgroup_file *cfile = (void *)css + cft->file_offset; spin_lock_irq(&cgroup_file_kn_lock); cfile->kn = NULL; spin_unlock_irq(&cgroup_file_kn_lock); del_timer_sync(&cfile->notify_timer); } kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); } /** * css_clear_dir - remove subsys files in a cgroup directory * @css: target css */ static void css_clear_dir(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; struct cftype *cfts; if (!(css->flags & CSS_VISIBLE)) return; css->flags &= ~CSS_VISIBLE; if (!css->ss) { if (cgroup_on_dfl(cgrp)) { cgroup_addrm_files(css, cgrp, cgroup_base_files, false); if (cgroup_psi_enabled()) cgroup_addrm_files(css, cgrp, cgroup_psi_files, false); } else { cgroup_addrm_files(css, cgrp, cgroup1_base_files, false); } } else { list_for_each_entry(cfts, &css->ss->cfts, node) cgroup_addrm_files(css, cgrp, cfts, false); } } /** * css_populate_dir - create subsys files in a cgroup directory * @css: target css * * On failure, no file is added. */ static int css_populate_dir(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; struct cftype *cfts, *failed_cfts; int ret; if (css->flags & CSS_VISIBLE) return 0; if (!css->ss) { if (cgroup_on_dfl(cgrp)) { ret = cgroup_addrm_files(&cgrp->self, cgrp, cgroup_base_files, true); if (ret < 0) return ret; if (cgroup_psi_enabled()) { ret = cgroup_addrm_files(&cgrp->self, cgrp, cgroup_psi_files, true); if (ret < 0) return ret; } } else { cgroup_addrm_files(css, cgrp, cgroup1_base_files, true); } } else { list_for_each_entry(cfts, &css->ss->cfts, node) { ret = cgroup_addrm_files(css, cgrp, cfts, true); if (ret < 0) { failed_cfts = cfts; goto err; } } } css->flags |= CSS_VISIBLE; return 0; err: list_for_each_entry(cfts, &css->ss->cfts, node) { if (cfts == failed_cfts) break; cgroup_addrm_files(css, cgrp, cfts, false); } return ret; } int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) { struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; int ssid, ret; u16 dfl_disable_ss_mask = 0; lockdep_assert_held(&cgroup_mutex); do_each_subsys_mask(ss, ssid, ss_mask) { /* * If @ss has non-root csses attached to it, can't move. * If @ss is an implicit controller, it is exempt from this * rule and can be stolen. */ if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) && !ss->implicit_on_dfl) return -EBUSY; /* can't move between two non-dummy roots either */ if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root) return -EBUSY; /* * Collect ssid's that need to be disabled from default * hierarchy. */ if (ss->root == &cgrp_dfl_root) dfl_disable_ss_mask |= 1 << ssid; } while_each_subsys_mask(); if (dfl_disable_ss_mask) { struct cgroup *scgrp = &cgrp_dfl_root.cgrp; /* * Controllers from default hierarchy that need to be rebound * are all disabled together in one go. */ cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask; WARN_ON(cgroup_apply_control(scgrp)); cgroup_finalize_control(scgrp, 0); } do_each_subsys_mask(ss, ssid, ss_mask) { struct cgroup_root *src_root = ss->root; struct cgroup *scgrp = &src_root->cgrp; struct cgroup_subsys_state *css = cgroup_css(scgrp, ss); struct css_set *cset, *cset_pos; struct css_task_iter *it; WARN_ON(!css || cgroup_css(dcgrp, ss)); if (src_root != &cgrp_dfl_root) { /* disable from the source */ src_root->subsys_mask &= ~(1 << ssid); WARN_ON(cgroup_apply_control(scgrp)); cgroup_finalize_control(scgrp, 0); } /* rebind */ RCU_INIT_POINTER(scgrp->subsys[ssid], NULL); rcu_assign_pointer(dcgrp->subsys[ssid], css); ss->root = dst_root; css->cgroup = dcgrp; spin_lock_irq(&css_set_lock); WARN_ON(!list_empty(&dcgrp->e_csets[ss->id])); list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id], e_cset_node[ss->id]) { list_move_tail(&cset->e_cset_node[ss->id], &dcgrp->e_csets[ss->id]); /* * all css_sets of scgrp together in same order to dcgrp, * patch in-flight iterators to preserve correct iteration. * since the iterator is always advanced right away and * finished when it->cset_pos meets it->cset_head, so only * update it->cset_head is enough here. */ list_for_each_entry(it, &cset->task_iters, iters_node) if (it->cset_head == &scgrp->e_csets[ss->id]) it->cset_head = &dcgrp->e_csets[ss->id]; } spin_unlock_irq(&css_set_lock); if (ss->css_rstat_flush) { list_del_rcu(&css->rstat_css_node); synchronize_rcu(); list_add_rcu(&css->rstat_css_node, &dcgrp->rstat_css_list); } /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; if (dst_root == &cgrp_dfl_root) { static_branch_enable(cgroup_subsys_on_dfl_key[ssid]); } else { dcgrp->subtree_control |= 1 << ssid; static_branch_disable(cgroup_subsys_on_dfl_key[ssid]); } ret = cgroup_apply_control(dcgrp); if (ret) pr_warn("partial failure to rebind %s controller (err=%d)\n", ss->name, ret); if (ss->bind) ss->bind(css); } while_each_subsys_mask(); kernfs_activate(dcgrp->kn); return 0; } int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, struct kernfs_root *kf_root) { int len = 0; char *buf = NULL; struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root); struct cgroup *ns_cgroup; buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) return -ENOMEM; spin_lock_irq(&css_set_lock); ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot); len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX); spin_unlock_irq(&css_set_lock); if (len >= PATH_MAX) len = -ERANGE; else if (len > 0) { seq_escape(sf, buf, " \t\n\\"); len = 0; } kfree(buf); return len; } enum cgroup2_param { Opt_nsdelegate, Opt_favordynmods, Opt_memory_localevents, Opt_memory_recursiveprot, nr__cgroup2_params }; static const struct fs_parameter_spec cgroup2_fs_parameters[] = { fsparam_flag("nsdelegate", Opt_nsdelegate), fsparam_flag("favordynmods", Opt_favordynmods), fsparam_flag("memory_localevents", Opt_memory_localevents), fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot), {} }; static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); struct fs_parse_result result; int opt; opt = fs_parse(fc, cgroup2_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_nsdelegate: ctx->flags |= CGRP_ROOT_NS_DELEGATE; return 0; case Opt_favordynmods: ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; return 0; case Opt_memory_localevents: ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; return 0; case Opt_memory_recursiveprot: ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT; return 0; } return -EINVAL; } static void apply_cgroup_root_flags(unsigned int root_flags) { if (current->nsproxy->cgroup_ns == &init_cgroup_ns) { if (root_flags & CGRP_ROOT_NS_DELEGATE) cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE; else cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE; cgroup_favor_dynmods(&cgrp_dfl_root, root_flags & CGRP_ROOT_FAVOR_DYNMODS); if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; else cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS; if (root_flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT) cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT; else cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT; } } static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root) { if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) seq_puts(seq, ",nsdelegate"); if (cgrp_dfl_root.flags & CGRP_ROOT_FAVOR_DYNMODS) seq_puts(seq, ",favordynmods"); if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) seq_puts(seq, ",memory_localevents"); if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT) seq_puts(seq, ",memory_recursiveprot"); return 0; } static int cgroup_reconfigure(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); apply_cgroup_root_flags(ctx->flags); return 0; } static void init_cgroup_housekeeping(struct cgroup *cgrp) { struct cgroup_subsys *ss; int ssid; INIT_LIST_HEAD(&cgrp->self.sibling); INIT_LIST_HEAD(&cgrp->self.children); INIT_LIST_HEAD(&cgrp->cset_links); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); cgrp->self.cgroup = cgrp; cgrp->self.flags |= CSS_ONLINE; cgrp->dom_cgrp = cgrp; cgrp->max_descendants = INT_MAX; cgrp->max_depth = INT_MAX; INIT_LIST_HEAD(&cgrp->rstat_css_list); prev_cputime_init(&cgrp->prev_cputime); for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); init_waitqueue_head(&cgrp->offline_waitq); INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent); } void init_cgroup_root(struct cgroup_fs_context *ctx) { struct cgroup_root *root = ctx->root; struct cgroup *cgrp = &root->cgrp; INIT_LIST_HEAD(&root->root_list); atomic_set(&root->nr_cgrps, 1); cgrp->root = root; init_cgroup_housekeeping(cgrp); /* DYNMODS must be modified through cgroup_favor_dynmods() */ root->flags = ctx->flags & ~CGRP_ROOT_FAVOR_DYNMODS; if (ctx->release_agent) strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX); if (ctx->name) strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN); if (ctx->cpuset_clone_children) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; struct kernfs_syscall_ops *kf_sops; struct css_set *cset; int i, ret; lockdep_assert_held(&cgroup_mutex); ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, GFP_KERNEL); if (ret) goto out; /* * We're accessing css_set_count without locking css_set_lock here, * but that's OK - it can only be increased by someone holding * cgroup_lock, and that's us. Later rebinding may disable * controllers on the default hierarchy and thus create new csets, * which can't be more than the existing ones. Allocate 2x. */ ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links); if (ret) goto cancel_ref; ret = cgroup_init_root_id(root); if (ret) goto cancel_ref; kf_sops = root == &cgrp_dfl_root ? &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops; root->kf_root = kernfs_create_root(kf_sops, KERNFS_ROOT_CREATE_DEACTIVATED | KERNFS_ROOT_SUPPORT_EXPORTOP | KERNFS_ROOT_SUPPORT_USER_XATTR, root_cgrp); if (IS_ERR(root->kf_root)) { ret = PTR_ERR(root->kf_root); goto exit_root_id; } root_cgrp->kn = kernfs_root_to_node(root->kf_root); WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1); root_cgrp->ancestors[0] = root_cgrp; ret = css_populate_dir(&root_cgrp->self); if (ret) goto destroy_root; ret = cgroup_rstat_init(root_cgrp); if (ret) goto destroy_root; ret = rebind_subsystems(root, ss_mask); if (ret) goto exit_stats; ret = cgroup_bpf_inherit(root_cgrp); WARN_ON_ONCE(ret); trace_cgroup_setup_root(root); /* * There must be no failure case after here, since rebinding takes * care of subsystems' refcounts, which are explicitly dropped in * the failure exit path. */ list_add(&root->root_list, &cgroup_roots); cgroup_root_count++; /* * Link the root cgroup in this hierarchy into all the css_set * objects. */ spin_lock_irq(&css_set_lock); hash_for_each(css_set_table, i, cset, hlist) { link_css_set(&tmp_links, cset, root_cgrp); if (css_set_populated(cset)) cgroup_update_populated(root_cgrp, true); } spin_unlock_irq(&css_set_lock); BUG_ON(!list_empty(&root_cgrp->self.children)); BUG_ON(atomic_read(&root->nr_cgrps) != 1); ret = 0; goto out; exit_stats: cgroup_rstat_exit(root_cgrp); destroy_root: kernfs_destroy_root(root->kf_root); root->kf_root = NULL; exit_root_id: cgroup_exit_root_id(root); cancel_ref: percpu_ref_exit(&root_cgrp->self.refcnt); out: free_cgrp_cset_links(&tmp_links); return ret; } int cgroup_do_get_tree(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); int ret; ctx->kfc.root = ctx->root->kf_root; if (fc->fs_type == &cgroup2_fs_type) ctx->kfc.magic = CGROUP2_SUPER_MAGIC; else ctx->kfc.magic = CGROUP_SUPER_MAGIC; ret = kernfs_get_tree(fc); /* * In non-init cgroup namespace, instead of root cgroup's dentry, * we return the dentry corresponding to the cgroupns->root_cgrp. */ if (!ret && ctx->ns != &init_cgroup_ns) { struct dentry *nsdentry; struct super_block *sb = fc->root->d_sb; struct cgroup *cgrp; cgroup_lock(); spin_lock_irq(&css_set_lock); cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root); spin_unlock_irq(&css_set_lock); cgroup_unlock(); nsdentry = kernfs_node_dentry(cgrp->kn, sb); dput(fc->root); if (IS_ERR(nsdentry)) { deactivate_locked_super(sb); ret = PTR_ERR(nsdentry); nsdentry = NULL; } fc->root = nsdentry; } if (!ctx->kfc.new_sb_created) cgroup_put(&ctx->root->cgrp); return ret; } /* * Destroy a cgroup filesystem context. */ static void cgroup_fs_context_free(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); kfree(ctx->name); kfree(ctx->release_agent); put_cgroup_ns(ctx->ns); kernfs_free_fs_context(fc); kfree(ctx); } static int cgroup_get_tree(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); int ret; WRITE_ONCE(cgrp_dfl_visible, true); cgroup_get_live(&cgrp_dfl_root.cgrp); ctx->root = &cgrp_dfl_root; ret = cgroup_do_get_tree(fc); if (!ret) apply_cgroup_root_flags(ctx->flags); return ret; } static const struct fs_context_operations cgroup_fs_context_ops = { .free = cgroup_fs_context_free, .parse_param = cgroup2_parse_param, .get_tree = cgroup_get_tree, .reconfigure = cgroup_reconfigure, }; static const struct fs_context_operations cgroup1_fs_context_ops = { .free = cgroup_fs_context_free, .parse_param = cgroup1_parse_param, .get_tree = cgroup1_get_tree, .reconfigure = cgroup1_reconfigure, }; /* * Initialise the cgroup filesystem creation/reconfiguration context. Notably, * we select the namespace we're going to use. */ static int cgroup_init_fs_context(struct fs_context *fc) { struct cgroup_fs_context *ctx; ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL); if (!ctx) return -ENOMEM; ctx->ns = current->nsproxy->cgroup_ns; get_cgroup_ns(ctx->ns); fc->fs_private = &ctx->kfc; if (fc->fs_type == &cgroup2_fs_type) fc->ops = &cgroup_fs_context_ops; else fc->ops = &cgroup1_fs_context_ops; put_user_ns(fc->user_ns); fc->user_ns = get_user_ns(ctx->ns->user_ns); fc->global = true; #ifdef CONFIG_CGROUP_FAVOR_DYNMODS ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; #endif return 0; } static void cgroup_kill_sb(struct super_block *sb) { struct kernfs_root *kf_root = kernfs_root_from_sb(sb); struct cgroup_root *root = cgroup_root_from_kf(kf_root); /* * If @root doesn't have any children, start killing it. * This prevents new mounts by disabling percpu_ref_tryget_live(). * * And don't kill the default root. */ if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root && !percpu_ref_is_dying(&root->cgrp.self.refcnt)) { cgroup_bpf_offline(&root->cgrp); percpu_ref_kill(&root->cgrp.self.refcnt); } cgroup_put(&root->cgrp); kernfs_kill_sb(sb); } struct file_system_type cgroup_fs_type = { .name = "cgroup", .init_fs_context = cgroup_init_fs_context, .parameters = cgroup1_fs_parameters, .kill_sb = cgroup_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; static struct file_system_type cgroup2_fs_type = { .name = "cgroup2", .init_fs_context = cgroup_init_fs_context, .parameters = cgroup2_fs_parameters, .kill_sb = cgroup_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; #ifdef CONFIG_CPUSETS static const struct fs_context_operations cpuset_fs_context_ops = { .get_tree = cgroup1_get_tree, .free = cgroup_fs_context_free, }; /* * This is ugly, but preserves the userspace API for existing cpuset * users. If someone tries to mount the "cpuset" filesystem, we * silently switch it to mount "cgroup" instead */ static int cpuset_init_fs_context(struct fs_context *fc) { char *agent = kstrdup("/sbin/cpuset_release_agent", GFP_USER); struct cgroup_fs_context *ctx; int err; err = cgroup_init_fs_context(fc); if (err) { kfree(agent); return err; } fc->ops = &cpuset_fs_context_ops; ctx = cgroup_fc2context(fc); ctx->subsys_mask = 1 << cpuset_cgrp_id; ctx->flags |= CGRP_ROOT_NOPREFIX; ctx->release_agent = agent; get_filesystem(&cgroup_fs_type); put_filesystem(fc->fs_type); fc->fs_type = &cgroup_fs_type; return 0; } static struct file_system_type cpuset_fs_type = { .name = "cpuset", .init_fs_context = cpuset_init_fs_context, .fs_flags = FS_USERNS_MOUNT, }; #endif int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns) { struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen); } int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns) { int ret; cgroup_lock(); spin_lock_irq(&css_set_lock); ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns); spin_unlock_irq(&css_set_lock); cgroup_unlock(); return ret; } EXPORT_SYMBOL_GPL(cgroup_path_ns); /** * cgroup_attach_lock - Lock for ->attach() * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem * * cgroup migration sometimes needs to stabilize threadgroups against forks and * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach() * implementations (e.g. cpuset), also need to disable CPU hotplug. * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can * lead to deadlocks. * * Bringing up a CPU may involve creating and destroying tasks which requires * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while * write-locking threadgroup_rwsem, the locking order is reversed and we end up * waiting for an on-going CPU hotplug operation which in turn is waiting for * the threadgroup_rwsem to be released to create new tasks. For more details: * * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu * * Resolve the situation by always acquiring cpus_read_lock() before optionally * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that * CPU hotplug is disabled on entry. */ void cgroup_attach_lock(bool lock_threadgroup) { cpus_read_lock(); if (lock_threadgroup) percpu_down_write(&cgroup_threadgroup_rwsem); } /** * cgroup_attach_unlock - Undo cgroup_attach_lock() * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem */ void cgroup_attach_unlock(bool lock_threadgroup) { if (lock_threadgroup) percpu_up_write(&cgroup_threadgroup_rwsem); cpus_read_unlock(); } /** * cgroup_migrate_add_task - add a migration target task to a migration context * @task: target task * @mgctx: target migration context * * Add @task, which is a migration target, to @mgctx->tset. This function * becomes noop if @task doesn't need to be migrated. @task's css_set * should have been added as a migration source and @task->cg_list will be * moved from the css_set's tasks list to mg_tasks one. */ static void cgroup_migrate_add_task(struct task_struct *task, struct cgroup_mgctx *mgctx) { struct css_set *cset; lockdep_assert_held(&css_set_lock); /* @task either already exited or can't exit until the end */ if (task->flags & PF_EXITING) return; /* cgroup_threadgroup_rwsem protects racing against forks */ WARN_ON_ONCE(list_empty(&task->cg_list)); cset = task_css_set(task); if (!cset->mg_src_cgrp) return; mgctx->tset.nr_tasks++; list_move_tail(&task->cg_list, &cset->mg_tasks); if (list_empty(&cset->mg_node)) list_add_tail(&cset->mg_node, &mgctx->tset.src_csets); if (list_empty(&cset->mg_dst_cset->mg_node)) list_add_tail(&cset->mg_dst_cset->mg_node, &mgctx->tset.dst_csets); } /** * cgroup_taskset_first - reset taskset and return the first task * @tset: taskset of interest * @dst_cssp: output variable for the destination css * * @tset iteration is initialized and the first task is returned. */ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp) { tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node); tset->cur_task = NULL; return cgroup_taskset_next(tset, dst_cssp); } /** * cgroup_taskset_next - iterate to the next task in taskset * @tset: taskset of interest * @dst_cssp: output variable for the destination css * * Return the next task in @tset. Iteration must have been initialized * with cgroup_taskset_first(). */ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp) { struct css_set *cset = tset->cur_cset; struct task_struct *task = tset->cur_task; while (CGROUP_HAS_SUBSYS_CONFIG && &cset->mg_node != tset->csets) { if (!task) task = list_first_entry(&cset->mg_tasks, struct task_struct, cg_list); else task = list_next_entry(task, cg_list); if (&task->cg_list != &cset->mg_tasks) { tset->cur_cset = cset; tset->cur_task = task; /* * This function may be called both before and * after cgroup_migrate_execute(). The two cases * can be distinguished by looking at whether @cset * has its ->mg_dst_cset set. */ if (cset->mg_dst_cset) *dst_cssp = cset->mg_dst_cset->subsys[tset->ssid]; else *dst_cssp = cset->subsys[tset->ssid]; return task; } cset = list_next_entry(cset, mg_node); task = NULL; } return NULL; } /** * cgroup_migrate_execute - migrate a taskset * @mgctx: migration context * * Migrate tasks in @mgctx as setup by migration preparation functions. * This function fails iff one of the ->can_attach callbacks fails and * guarantees that either all or none of the tasks in @mgctx are migrated. * @mgctx is consumed regardless of success. */ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) { struct cgroup_taskset *tset = &mgctx->tset; struct cgroup_subsys *ss; struct task_struct *task, *tmp_task; struct css_set *cset, *tmp_cset; int ssid, failed_ssid, ret; /* check that we can legitimately attach to the cgroup */ if (tset->nr_tasks) { do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { if (ss->can_attach) { tset->ssid = ssid; ret = ss->can_attach(tset); if (ret) { failed_ssid = ssid; goto out_cancel_attach; } } } while_each_subsys_mask(); } /* * Now that we're guaranteed success, proceed to move all tasks to * the new cgroup. There are no failure cases after here, so this * is the commit point. */ spin_lock_irq(&css_set_lock); list_for_each_entry(cset, &tset->src_csets, mg_node) { list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) { struct css_set *from_cset = task_css_set(task); struct css_set *to_cset = cset->mg_dst_cset; get_css_set(to_cset); to_cset->nr_tasks++; css_set_move_task(task, from_cset, to_cset, true); from_cset->nr_tasks--; /* * If the source or destination cgroup is frozen, * the task might require to change its state. */ cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp, to_cset->dfl_cgrp); put_css_set_locked(from_cset); } } spin_unlock_irq(&css_set_lock); /* * Migration is committed, all target tasks are now on dst_csets. * Nothing is sensitive to fork() after this point. Notify * controllers that migration is complete. */ tset->csets = &tset->dst_csets; if (tset->nr_tasks) { do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { if (ss->attach) { tset->ssid = ssid; ss->attach(tset); } } while_each_subsys_mask(); } ret = 0; goto out_release_tset; out_cancel_attach: if (tset->nr_tasks) { do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { if (ssid == failed_ssid) break; if (ss->cancel_attach) { tset->ssid = ssid; ss->cancel_attach(tset); } } while_each_subsys_mask(); } out_release_tset: spin_lock_irq(&css_set_lock); list_splice_init(&tset->dst_csets, &tset->src_csets); list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) { list_splice_tail_init(&cset->mg_tasks, &cset->tasks); list_del_init(&cset->mg_node); } spin_unlock_irq(&css_set_lock); /* * Re-initialize the cgroup_taskset structure in case it is reused * again in another cgroup_migrate_add_task()/cgroup_migrate_execute() * iteration. */ tset->nr_tasks = 0; tset->csets = &tset->src_csets; return ret; } /** * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination * @dst_cgrp: destination cgroup to test * * On the default hierarchy, except for the mixable, (possible) thread root * and threaded cgroups, subtree_control must be zero for migration * destination cgroups with tasks so that child cgroups don't compete * against tasks. */ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp) { /* v1 doesn't have any restriction */ if (!cgroup_on_dfl(dst_cgrp)) return 0; /* verify @dst_cgrp can host resources */ if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp)) return -EOPNOTSUPP; /* * If @dst_cgrp is already or can become a thread root or is * threaded, it doesn't matter. */ if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp)) return 0; /* apply no-internal-process constraint */ if (dst_cgrp->subtree_control) return -EBUSY; return 0; } /** * cgroup_migrate_finish - cleanup after attach * @mgctx: migration context * * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See * those functions for details. */ void cgroup_migrate_finish(struct cgroup_mgctx *mgctx) { struct css_set *cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets, mg_src_preload_node) { cset->mg_src_cgrp = NULL; cset->mg_dst_cgrp = NULL; cset->mg_dst_cset = NULL; list_del_init(&cset->mg_src_preload_node); put_css_set_locked(cset); } list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets, mg_dst_preload_node) { cset->mg_src_cgrp = NULL; cset->mg_dst_cgrp = NULL; cset->mg_dst_cset = NULL; list_del_init(&cset->mg_dst_preload_node); put_css_set_locked(cset); } spin_unlock_irq(&css_set_lock); } /** * cgroup_migrate_add_src - add a migration source css_set * @src_cset: the source css_set to add * @dst_cgrp: the destination cgroup * @mgctx: migration context * * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin * @src_cset and add it to @mgctx->src_csets, which should later be cleaned * up by cgroup_migrate_finish(). * * This function may be called without holding cgroup_threadgroup_rwsem * even if the target is a process. Threads may be created and destroyed * but as long as cgroup_mutex is not dropped, no new css_set can be put * into play and the preloaded css_sets are guaranteed to cover all * migrations. */ void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, struct cgroup_mgctx *mgctx) { struct cgroup *src_cgrp; lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&css_set_lock); /* * If ->dead, @src_set is associated with one or more dead cgroups * and doesn't contain any migratable tasks. Ignore it early so * that the rest of migration path doesn't get confused by it. */ if (src_cset->dead) return; if (!list_empty(&src_cset->mg_src_preload_node)) return; src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); WARN_ON(src_cset->mg_src_cgrp); WARN_ON(src_cset->mg_dst_cgrp); WARN_ON(!list_empty(&src_cset->mg_tasks)); WARN_ON(!list_empty(&src_cset->mg_node)); src_cset->mg_src_cgrp = src_cgrp; src_cset->mg_dst_cgrp = dst_cgrp; get_css_set(src_cset); list_add_tail(&src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets); } /** * cgroup_migrate_prepare_dst - prepare destination css_sets for migration * @mgctx: migration context * * Tasks are about to be moved and all the source css_sets have been * preloaded to @mgctx->preloaded_src_csets. This function looks up and * pins all destination css_sets, links each to its source, and append them * to @mgctx->preloaded_dst_csets. * * This function must be called after cgroup_migrate_add_src() has been * called on each migration source css_set. After migration is performed * using cgroup_migrate(), cgroup_migrate_finish() must be called on * @mgctx. */ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) { struct css_set *src_cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); /* look up the dst cset for each src cset and link it to src */ list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets, mg_src_preload_node) { struct css_set *dst_cset; struct cgroup_subsys *ss; int ssid; dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp); if (!dst_cset) return -ENOMEM; WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); /* * If src cset equals dst, it's noop. Drop the src. * cgroup_migrate() will skip the cset too. Note that we * can't handle src == dst as some nodes are used by both. */ if (src_cset == dst_cset) { src_cset->mg_src_cgrp = NULL; src_cset->mg_dst_cgrp = NULL; list_del_init(&src_cset->mg_src_preload_node); put_css_set(src_cset); put_css_set(dst_cset); continue; } src_cset->mg_dst_cset = dst_cset; if (list_empty(&dst_cset->mg_dst_preload_node)) list_add_tail(&dst_cset->mg_dst_preload_node, &mgctx->preloaded_dst_csets); else put_css_set(dst_cset); for_each_subsys(ss, ssid) if (src_cset->subsys[ssid] != dst_cset->subsys[ssid]) mgctx->ss_mask |= 1 << ssid; } return 0; } /** * cgroup_migrate - migrate a process or task to a cgroup * @leader: the leader of the process or the task to migrate * @threadgroup: whether @leader points to the whole process or a single task * @mgctx: migration context * * Migrate a process or task denoted by @leader. If migrating a process, * the caller must be holding cgroup_threadgroup_rwsem. The caller is also * responsible for invoking cgroup_migrate_add_src() and * cgroup_migrate_prepare_dst() on the targets before invoking this * function and following up with cgroup_migrate_finish(). * * As long as a controller's ->can_attach() doesn't fail, this function is * guaranteed to succeed. This means that, excluding ->can_attach() * failure, when migrating multiple targets, the success or failure can be * decided for all targets by invoking group_migrate_prepare_dst() before * actually starting migrating. */ int cgroup_migrate(struct task_struct *leader, bool threadgroup, struct cgroup_mgctx *mgctx) { struct task_struct *task; /* * The following thread iteration should be inside an RCU critical * section to prevent tasks from being freed while taking the snapshot. * spin_lock_irq() implies RCU critical section here. */ spin_lock_irq(&css_set_lock); task = leader; do { cgroup_migrate_add_task(task, mgctx); if (!threadgroup) break; } while_each_thread(leader, task); spin_unlock_irq(&css_set_lock); return cgroup_migrate_execute(mgctx); } /** * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup * @dst_cgrp: the cgroup to attach to * @leader: the task or the leader of the threadgroup to be attached * @threadgroup: attach the whole threadgroup? * * Call holding cgroup_mutex and cgroup_threadgroup_rwsem. */ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup) { DEFINE_CGROUP_MGCTX(mgctx); struct task_struct *task; int ret = 0; /* look up all src csets */ spin_lock_irq(&css_set_lock); rcu_read_lock(); task = leader; do { cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx); if (!threadgroup) break; } while_each_thread(leader, task); rcu_read_unlock(); spin_unlock_irq(&css_set_lock); /* prepare dst csets and commit */ ret = cgroup_migrate_prepare_dst(&mgctx); if (!ret) ret = cgroup_migrate(leader, threadgroup, &mgctx); cgroup_migrate_finish(&mgctx); if (!ret) TRACE_CGROUP_PATH(attach_task, dst_cgrp, leader, threadgroup); return ret; } struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, bool *threadgroup_locked) { struct task_struct *tsk; pid_t pid; if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return ERR_PTR(-EINVAL); /* * If we migrate a single thread, we don't care about threadgroup * stability. If the thread is `current`, it won't exit(2) under our * hands or change PID through exec(2). We exclude * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write * callers by cgroup_mutex. * Therefore, we can skip the global lock. */ lockdep_assert_held(&cgroup_mutex); *threadgroup_locked = pid || threadgroup; cgroup_attach_lock(*threadgroup_locked); rcu_read_lock(); if (pid) { tsk = find_task_by_vpid(pid); if (!tsk) { tsk = ERR_PTR(-ESRCH); goto out_unlock_threadgroup; } } else { tsk = current; } if (threadgroup) tsk = tsk->group_leader; /* * kthreads may acquire PF_NO_SETAFFINITY during initialization. * If userland migrates such a kthread to a non-root cgroup, it can * become trapped in a cpuset, or RT kthread may be born in a * cgroup with no rt_runtime allocated. Just say no. */ if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) { tsk = ERR_PTR(-EINVAL); goto out_unlock_threadgroup; } get_task_struct(tsk); goto out_unlock_rcu; out_unlock_threadgroup: cgroup_attach_unlock(*threadgroup_locked); *threadgroup_locked = false; out_unlock_rcu: rcu_read_unlock(); return tsk; } void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked) { struct cgroup_subsys *ss; int ssid; /* release reference from cgroup_procs_write_start() */ put_task_struct(task); cgroup_attach_unlock(threadgroup_locked); for_each_subsys(ss, ssid) if (ss->post_attach) ss->post_attach(); } static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) { struct cgroup_subsys *ss; bool printed = false; int ssid; do_each_subsys_mask(ss, ssid, ss_mask) { if (printed) seq_putc(seq, ' '); seq_puts(seq, ss->name); printed = true; } while_each_subsys_mask(); if (printed) seq_putc(seq, '\n'); } /* show controllers which are enabled from the parent */ static int cgroup_controllers_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; cgroup_print_ss_mask(seq, cgroup_control(cgrp)); return 0; } /* show controllers which are enabled for a given cgroup's children */ static int cgroup_subtree_control_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; cgroup_print_ss_mask(seq, cgrp->subtree_control); return 0; } /** * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy * @cgrp: root of the subtree to update csses for * * @cgrp's control masks have changed and its subtree's css associations * need to be updated accordingly. This function looks up all css_sets * which are attached to the subtree, creates the matching updated css_sets * and migrates the tasks to the new ones. */ static int cgroup_update_dfl_csses(struct cgroup *cgrp) { DEFINE_CGROUP_MGCTX(mgctx); struct cgroup_subsys_state *d_css; struct cgroup *dsct; struct css_set *src_cset; bool has_tasks; int ret; lockdep_assert_held(&cgroup_mutex); /* look up all csses currently attached to @cgrp's subtree */ spin_lock_irq(&css_set_lock); cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { struct cgrp_cset_link *link; /* * As cgroup_update_dfl_csses() is only called by * cgroup_apply_control(). The csses associated with the * given cgrp will not be affected by changes made to * its subtree_control file. We can skip them. */ if (dsct == cgrp) continue; list_for_each_entry(link, &dsct->cset_links, cset_link) cgroup_migrate_add_src(link->cset, dsct, &mgctx); } spin_unlock_irq(&css_set_lock); /* * We need to write-lock threadgroup_rwsem while migrating tasks. * However, if there are no source csets for @cgrp, changing its * controllers isn't gonna produce any task migrations and the * write-locking can be skipped safely. */ has_tasks = !list_empty(&mgctx.preloaded_src_csets); cgroup_attach_lock(has_tasks); /* NULL dst indicates self on default hierarchy */ ret = cgroup_migrate_prepare_dst(&mgctx); if (ret) goto out_finish; spin_lock_irq(&css_set_lock); list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_src_preload_node) { struct task_struct *task, *ntask; /* all tasks in src_csets need to be migrated */ list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list) cgroup_migrate_add_task(task, &mgctx); } spin_unlock_irq(&css_set_lock); ret = cgroup_migrate_execute(&mgctx); out_finish: cgroup_migrate_finish(&mgctx); cgroup_attach_unlock(has_tasks); return ret; } /** * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses * @cgrp: root of the target subtree * * Because css offlining is asynchronous, userland may try to re-enable a * controller while the previous css is still around. This function grabs * cgroup_mutex and drains the previous css instances of @cgrp's subtree. */ void cgroup_lock_and_drain_offline(struct cgroup *cgrp) __acquires(&cgroup_mutex) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; struct cgroup_subsys *ss; int ssid; restart: cgroup_lock(); cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); DEFINE_WAIT(wait); if (!css || !percpu_ref_is_dying(&css->refcnt)) continue; cgroup_get_live(dsct); prepare_to_wait(&dsct->offline_waitq, &wait, TASK_UNINTERRUPTIBLE); cgroup_unlock(); schedule(); finish_wait(&dsct->offline_waitq, &wait); cgroup_put(dsct); goto restart; } } } /** * cgroup_save_control - save control masks and dom_cgrp of a subtree * @cgrp: root of the target subtree * * Save ->subtree_control, ->subtree_ss_mask and ->dom_cgrp to the * respective old_ prefixed fields for @cgrp's subtree including @cgrp * itself. */ static void cgroup_save_control(struct cgroup *cgrp) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { dsct->old_subtree_control = dsct->subtree_control; dsct->old_subtree_ss_mask = dsct->subtree_ss_mask; dsct->old_dom_cgrp = dsct->dom_cgrp; } } /** * cgroup_propagate_control - refresh control masks of a subtree * @cgrp: root of the target subtree * * For @cgrp and its subtree, ensure ->subtree_ss_mask matches * ->subtree_control and propagate controller availability through the * subtree so that descendants don't have unavailable controllers enabled. */ static void cgroup_propagate_control(struct cgroup *cgrp) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { dsct->subtree_control &= cgroup_control(dsct); dsct->subtree_ss_mask = cgroup_calc_subtree_ss_mask(dsct->subtree_control, cgroup_ss_mask(dsct)); } } /** * cgroup_restore_control - restore control masks and dom_cgrp of a subtree * @cgrp: root of the target subtree * * Restore ->subtree_control, ->subtree_ss_mask and ->dom_cgrp from the * respective old_ prefixed fields for @cgrp's subtree including @cgrp * itself. */ static void cgroup_restore_control(struct cgroup *cgrp) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { dsct->subtree_control = dsct->old_subtree_control; dsct->subtree_ss_mask = dsct->old_subtree_ss_mask; dsct->dom_cgrp = dsct->old_dom_cgrp; } } static bool css_visible(struct cgroup_subsys_state *css) { struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; if (cgroup_control(cgrp) & (1 << ss->id)) return true; if (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) return false; return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl; } /** * cgroup_apply_control_enable - enable or show csses according to control * @cgrp: root of the target subtree * * Walk @cgrp's subtree and create new csses or make the existing ones * visible. A css is created invisible if it's being implicitly enabled * through dependency. An invisible css is made visible when the userland * explicitly enables it. * * Returns 0 on success, -errno on failure. On failure, csses which have * been processed already aren't cleaned up. The caller is responsible for * cleaning up with cgroup_apply_control_disable(). */ static int cgroup_apply_control_enable(struct cgroup *cgrp) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; struct cgroup_subsys *ss; int ssid, ret; cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); if (!(cgroup_ss_mask(dsct) & (1 << ss->id))) continue; if (!css) { css = css_create(dsct, ss); if (IS_ERR(css)) return PTR_ERR(css); } WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt)); if (css_visible(css)) { ret = css_populate_dir(css); if (ret) return ret; } } } return 0; } /** * cgroup_apply_control_disable - kill or hide csses according to control * @cgrp: root of the target subtree * * Walk @cgrp's subtree and kill and hide csses so that they match * cgroup_ss_mask() and cgroup_visible_mask(). * * A css is hidden when the userland requests it to be disabled while other * subsystems are still depending on it. The css must not actively control * resources and be in the vanilla state if it's made visible again later. * Controllers which may be depended upon should provide ->css_reset() for * this purpose. */ static void cgroup_apply_control_disable(struct cgroup *cgrp) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; struct cgroup_subsys *ss; int ssid; cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); if (!css) continue; WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt)); if (css->parent && !(cgroup_ss_mask(dsct) & (1 << ss->id))) { kill_css(css); } else if (!css_visible(css)) { css_clear_dir(css); if (ss->css_reset) ss->css_reset(css); } } } } /** * cgroup_apply_control - apply control mask updates to the subtree * @cgrp: root of the target subtree * * subsystems can be enabled and disabled in a subtree using the following * steps. * * 1. Call cgroup_save_control() to stash the current state. * 2. Update ->subtree_control masks in the subtree as desired. * 3. Call cgroup_apply_control() to apply the changes. * 4. Optionally perform other related operations. * 5. Call cgroup_finalize_control() to finish up. * * This function implements step 3 and propagates the mask changes * throughout @cgrp's subtree, updates csses accordingly and perform * process migrations. */ static int cgroup_apply_control(struct cgroup *cgrp) { int ret; cgroup_propagate_control(cgrp); ret = cgroup_apply_control_enable(cgrp); if (ret) return ret; /* * At this point, cgroup_e_css_by_mask() results reflect the new csses * making the following cgroup_update_dfl_csses() properly update * css associations of all tasks in the subtree. */ return cgroup_update_dfl_csses(cgrp); } /** * cgroup_finalize_control - finalize control mask update * @cgrp: root of the target subtree * @ret: the result of the update * * Finalize control mask update. See cgroup_apply_control() for more info. */ static void cgroup_finalize_control(struct cgroup *cgrp, int ret) { if (ret) { cgroup_restore_control(cgrp); cgroup_propagate_control(cgrp); } cgroup_apply_control_disable(cgrp); } static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable) { u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask; /* if nothing is getting enabled, nothing to worry about */ if (!enable) return 0; /* can @cgrp host any resources? */ if (!cgroup_is_valid_domain(cgrp->dom_cgrp)) return -EOPNOTSUPP; /* mixables don't care */ if (cgroup_is_mixable(cgrp)) return 0; if (domain_enable) { /* can't enable domain controllers inside a thread subtree */ if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp)) return -EOPNOTSUPP; } else { /* * Threaded controllers can handle internal competitions * and are always allowed inside a (prospective) thread * subtree. */ if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp)) return 0; } /* * Controllers can't be enabled for a cgroup with tasks to avoid * child cgroups competing against tasks. */ if (cgroup_has_tasks(cgrp)) return -EBUSY; return 0; } /* change the enabled child controllers for a cgroup in the default hierarchy */ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { u16 enable = 0, disable = 0; struct cgroup *cgrp, *child; struct cgroup_subsys *ss; char *tok; int ssid, ret; /* * Parse input - space separated list of subsystem names prefixed * with either + or -. */ buf = strstrip(buf); while ((tok = strsep(&buf, " "))) { if (tok[0] == '\0') continue; do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) { if (!cgroup_ssid_enabled(ssid) || strcmp(tok + 1, ss->name)) continue; if (*tok == '+') { enable |= 1 << ssid; disable &= ~(1 << ssid); } else if (*tok == '-') { disable |= 1 << ssid; enable &= ~(1 << ssid); } else { return -EINVAL; } break; } while_each_subsys_mask(); if (ssid == CGROUP_SUBSYS_COUNT) return -EINVAL; } cgrp = cgroup_kn_lock_live(of->kn, true); if (!cgrp) return -ENODEV; for_each_subsys(ss, ssid) { if (enable & (1 << ssid)) { if (cgrp->subtree_control & (1 << ssid)) { enable &= ~(1 << ssid); continue; } if (!(cgroup_control(cgrp) & (1 << ssid))) { ret = -ENOENT; goto out_unlock; } } else if (disable & (1 << ssid)) { if (!(cgrp->subtree_control & (1 << ssid))) { disable &= ~(1 << ssid); continue; } /* a child has it enabled? */ cgroup_for_each_live_child(child, cgrp) { if (child->subtree_control & (1 << ssid)) { ret = -EBUSY; goto out_unlock; } } } } if (!enable && !disable) { ret = 0; goto out_unlock; } ret = cgroup_vet_subtree_control_enable(cgrp, enable); if (ret) goto out_unlock; /* save and update control masks and prepare csses */ cgroup_save_control(cgrp); cgrp->subtree_control |= enable; cgrp->subtree_control &= ~disable; ret = cgroup_apply_control(cgrp); cgroup_finalize_control(cgrp, ret); if (ret) goto out_unlock; kernfs_activate(cgrp->kn); out_unlock: cgroup_kn_unlock(of->kn); return ret ?: nbytes; } /** * cgroup_enable_threaded - make @cgrp threaded * @cgrp: the target cgroup * * Called when "threaded" is written to the cgroup.type interface file and * tries to make @cgrp threaded and join the parent's resource domain. * This function is never called on the root cgroup as cgroup.type doesn't * exist on it. */ static int cgroup_enable_threaded(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); struct cgroup *dom_cgrp = parent->dom_cgrp; struct cgroup *dsct; struct cgroup_subsys_state *d_css; int ret; lockdep_assert_held(&cgroup_mutex); /* noop if already threaded */ if (cgroup_is_threaded(cgrp)) return 0; /* * If @cgroup is populated or has domain controllers enabled, it * can't be switched. While the below cgroup_can_be_thread_root() * test can catch the same conditions, that's only when @parent is * not mixable, so let's check it explicitly. */ if (cgroup_is_populated(cgrp) || cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask) return -EOPNOTSUPP; /* we're joining the parent's domain, ensure its validity */ if (!cgroup_is_valid_domain(dom_cgrp) || !cgroup_can_be_thread_root(dom_cgrp)) return -EOPNOTSUPP; /* * The following shouldn't cause actual migrations and should * always succeed. */ cgroup_save_control(cgrp); cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) if (dsct == cgrp || cgroup_is_threaded(dsct)) dsct->dom_cgrp = dom_cgrp; ret = cgroup_apply_control(cgrp); if (!ret) parent->nr_threaded_children++; cgroup_finalize_control(cgrp, ret); return ret; } static int cgroup_type_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; if (cgroup_is_threaded(cgrp)) seq_puts(seq, "threaded\n"); else if (!cgroup_is_valid_domain(cgrp)) seq_puts(seq, "domain invalid\n"); else if (cgroup_is_thread_root(cgrp)) seq_puts(seq, "domain threaded\n"); else seq_puts(seq, "domain\n"); return 0; } static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; int ret; /* only switching to threaded mode is supported */ if (strcmp(strstrip(buf), "threaded")) return -EINVAL; /* drain dying csses before we re-apply (threaded) subtree control */ cgrp = cgroup_kn_lock_live(of->kn, true); if (!cgrp) return -ENOENT; /* threaded can only be enabled */ ret = cgroup_enable_threaded(cgrp); cgroup_kn_unlock(of->kn); return ret ?: nbytes; } static int cgroup_max_descendants_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; int descendants = READ_ONCE(cgrp->max_descendants); if (descendants == INT_MAX) seq_puts(seq, "max\n"); else seq_printf(seq, "%d\n", descendants); return 0; } static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; int descendants; ssize_t ret; buf = strstrip(buf); if (!strcmp(buf, "max")) { descendants = INT_MAX; } else { ret = kstrtoint(buf, 0, &descendants); if (ret) return ret; } if (descendants < 0) return -ERANGE; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENOENT; cgrp->max_descendants = descendants; cgroup_kn_unlock(of->kn); return nbytes; } static int cgroup_max_depth_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; int depth = READ_ONCE(cgrp->max_depth); if (depth == INT_MAX) seq_puts(seq, "max\n"); else seq_printf(seq, "%d\n", depth); return 0; } static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; ssize_t ret; int depth; buf = strstrip(buf); if (!strcmp(buf, "max")) { depth = INT_MAX; } else { ret = kstrtoint(buf, 0, &depth); if (ret) return ret; } if (depth < 0) return -ERANGE; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENOENT; cgrp->max_depth = depth; cgroup_kn_unlock(of->kn); return nbytes; } static int cgroup_events_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp)); seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags)); return 0; } static int cgroup_stat_show(struct seq_file *seq, void *v) { struct cgroup *cgroup = seq_css(seq)->cgroup; seq_printf(seq, "nr_descendants %d\n", cgroup->nr_descendants); seq_printf(seq, "nr_dying_descendants %d\n", cgroup->nr_dying_descendants); return 0; } #ifdef CONFIG_CGROUP_SCHED /** * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest * * Find and get @cgrp's css associated with @ss. If the css doesn't exist * or is offline, %NULL is returned. */ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; rcu_read_lock(); css = cgroup_css(cgrp, ss); if (css && !css_tryget_online(css)) css = NULL; rcu_read_unlock(); return css; } static int cgroup_extra_stat_show(struct seq_file *seq, int ssid) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct cgroup_subsys *ss = cgroup_subsys[ssid]; struct cgroup_subsys_state *css; int ret; if (!ss->css_extra_stat_show) return 0; css = cgroup_tryget_css(cgrp, ss); if (!css) return 0; ret = ss->css_extra_stat_show(seq, css); css_put(css); return ret; } static int cgroup_local_stat_show(struct seq_file *seq, struct cgroup *cgrp, int ssid) { struct cgroup_subsys *ss = cgroup_subsys[ssid]; struct cgroup_subsys_state *css; int ret; if (!ss->css_local_stat_show) return 0; css = cgroup_tryget_css(cgrp, ss); if (!css) return 0; ret = ss->css_local_stat_show(seq, css); css_put(css); return ret; } #endif static int cpu_stat_show(struct seq_file *seq, void *v) { int ret = 0; cgroup_base_stat_cputime_show(seq); #ifdef CONFIG_CGROUP_SCHED ret = cgroup_extra_stat_show(seq, cpu_cgrp_id); #endif return ret; } static int cpu_local_stat_show(struct seq_file *seq, void *v) { struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup; int ret = 0; #ifdef CONFIG_CGROUP_SCHED ret = cgroup_local_stat_show(seq, cgrp, cpu_cgrp_id); #endif return ret; } #ifdef CONFIG_PSI static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_IO); } static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_MEM); } static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_CPU); } static ssize_t pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, enum psi_res res) { struct cgroup_file_ctx *ctx = of->priv; struct psi_trigger *new; struct cgroup *cgrp; struct psi_group *psi; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; cgroup_get(cgrp); cgroup_kn_unlock(of->kn); /* Allow only one trigger per file descriptor */ if (ctx->psi.trigger) { cgroup_put(cgrp); return -EBUSY; } psi = cgroup_psi(cgrp); new = psi_trigger_create(psi, buf, res, of->file, of); if (IS_ERR(new)) { cgroup_put(cgrp); return PTR_ERR(new); } smp_store_release(&ctx->psi.trigger, new); cgroup_put(cgrp); return nbytes; } static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return pressure_write(of, buf, nbytes, PSI_IO); } static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return pressure_write(of, buf, nbytes, PSI_MEM); } static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return pressure_write(of, buf, nbytes, PSI_CPU); } #ifdef CONFIG_IRQ_TIME_ACCOUNTING static int cgroup_irq_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_IRQ); } static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return pressure_write(of, buf, nbytes, PSI_IRQ); } #endif static int cgroup_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct psi_group *psi = cgroup_psi(cgrp); seq_printf(seq, "%d\n", psi->enabled); return 0; } static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { ssize_t ret; int enable; struct cgroup *cgrp; struct psi_group *psi; ret = kstrtoint(strstrip(buf), 0, &enable); if (ret) return ret; if (enable < 0 || enable > 1) return -ERANGE; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENOENT; psi = cgroup_psi(cgrp); if (psi->enabled != enable) { int i; /* show or hide {cpu,memory,io,irq}.pressure files */ for (i = 0; i < NR_PSI_RESOURCES; i++) cgroup_file_show(&cgrp->psi_files[i], enable); psi->enabled = enable; if (enable) psi_cgroup_restart(psi); } cgroup_kn_unlock(of->kn); return nbytes; } static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, poll_table *pt) { struct cgroup_file_ctx *ctx = of->priv; return psi_trigger_poll(&ctx->psi.trigger, of->file, pt); } static int cgroup_pressure_open(struct kernfs_open_file *of) { if (of->file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE)) return -EPERM; return 0; } static void cgroup_pressure_release(struct kernfs_open_file *of) { struct cgroup_file_ctx *ctx = of->priv; psi_trigger_destroy(ctx->psi.trigger); } bool cgroup_psi_enabled(void) { if (static_branch_likely(&psi_disabled)) return false; return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0; } #else /* CONFIG_PSI */ bool cgroup_psi_enabled(void) { return false; } #endif /* CONFIG_PSI */ static int cgroup_freeze_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; seq_printf(seq, "%d\n", cgrp->freezer.freeze); return 0; } static ssize_t cgroup_freeze_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; ssize_t ret; int freeze; ret = kstrtoint(strstrip(buf), 0, &freeze); if (ret) return ret; if (freeze < 0 || freeze > 1) return -ERANGE; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENOENT; cgroup_freeze(cgrp, freeze); cgroup_kn_unlock(of->kn); return nbytes; } static void __cgroup_kill(struct cgroup *cgrp) { struct css_task_iter it; struct task_struct *task; lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); set_bit(CGRP_KILL, &cgrp->flags); spin_unlock_irq(&css_set_lock); css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it); while ((task = css_task_iter_next(&it))) { /* Ignore kernel threads here. */ if (task->flags & PF_KTHREAD) continue; /* Skip tasks that are already dying. */ if (__fatal_signal_pending(task)) continue; send_sig(SIGKILL, task, 0); } css_task_iter_end(&it); spin_lock_irq(&css_set_lock); clear_bit(CGRP_KILL, &cgrp->flags); spin_unlock_irq(&css_set_lock); } static void cgroup_kill(struct cgroup *cgrp) { struct cgroup_subsys_state *css; struct cgroup *dsct; lockdep_assert_held(&cgroup_mutex); cgroup_for_each_live_descendant_pre(dsct, css, cgrp) __cgroup_kill(dsct); } static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { ssize_t ret = 0; int kill; struct cgroup *cgrp; ret = kstrtoint(strstrip(buf), 0, &kill); if (ret) return ret; if (kill != 1) return -ERANGE; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENOENT; /* * Killing is a process directed operation, i.e. the whole thread-group * is taken down so act like we do for cgroup.procs and only make this * writable in non-threaded cgroups. */ if (cgroup_is_threaded(cgrp)) ret = -EOPNOTSUPP; else cgroup_kill(cgrp); cgroup_kn_unlock(of->kn); return ret ?: nbytes; } static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of_cft(of); struct cgroup_file_ctx *ctx; int ret; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; ctx->ns = current->nsproxy->cgroup_ns; get_cgroup_ns(ctx->ns); of->priv = ctx; if (!cft->open) return 0; ret = cft->open(of); if (ret) { put_cgroup_ns(ctx->ns); kfree(ctx); } return ret; } static void cgroup_file_release(struct kernfs_open_file *of) { struct cftype *cft = of_cft(of); struct cgroup_file_ctx *ctx = of->priv; if (cft->release) cft->release(of); put_cgroup_ns(ctx->ns); kfree(ctx); } static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup_file_ctx *ctx = of->priv; struct cgroup *cgrp = of->kn->parent->priv; struct cftype *cft = of_cft(of); struct cgroup_subsys_state *css; int ret; if (!nbytes) return 0; /* * If namespaces are delegation boundaries, disallow writes to * files in an non-init namespace root from inside the namespace * except for the files explicitly marked delegatable - * cgroup.procs and cgroup.subtree_control. */ if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) && !(cft->flags & CFTYPE_NS_DELEGATABLE) && ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp) return -EPERM; if (cft->write) return cft->write(of, buf, nbytes, off); /* * kernfs guarantees that a file isn't deleted with operations in * flight, which means that the matching css is and stays alive and * doesn't need to be pinned. The RCU locking is not necessary * either. It's just for the convenience of using cgroup_css(). */ rcu_read_lock(); css = cgroup_css(cgrp, cft->ss); rcu_read_unlock(); if (cft->write_u64) { unsigned long long v; ret = kstrtoull(buf, 0, &v); if (!ret) ret = cft->write_u64(css, cft, v); } else if (cft->write_s64) { long long v; ret = kstrtoll(buf, 0, &v); if (!ret) ret = cft->write_s64(css, cft, v); } else { ret = -EINVAL; } return ret ?: nbytes; } static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt) { struct cftype *cft = of_cft(of); if (cft->poll) return cft->poll(of, pt); return kernfs_generic_poll(of, pt); } static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) { return seq_cft(seq)->seq_start(seq, ppos); } static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos) { return seq_cft(seq)->seq_next(seq, v, ppos); } static void cgroup_seqfile_stop(struct seq_file *seq, void *v) { if (seq_cft(seq)->seq_stop) seq_cft(seq)->seq_stop(seq, v); } static int cgroup_seqfile_show(struct seq_file *m, void *arg) { struct cftype *cft = seq_cft(m); struct cgroup_subsys_state *css = seq_css(m); if (cft->seq_show) return cft->seq_show(m, arg); if (cft->read_u64) seq_printf(m, "%llu\n", cft->read_u64(css, cft)); else if (cft->read_s64) seq_printf(m, "%lld\n", cft->read_s64(css, cft)); else return -EINVAL; return 0; } static struct kernfs_ops cgroup_kf_single_ops = { .atomic_write_len = PAGE_SIZE, .open = cgroup_file_open, .release = cgroup_file_release, .write = cgroup_file_write, .poll = cgroup_file_poll, .seq_show = cgroup_seqfile_show, }; static struct kernfs_ops cgroup_kf_ops = { .atomic_write_len = PAGE_SIZE, .open = cgroup_file_open, .release = cgroup_file_release, .write = cgroup_file_write, .poll = cgroup_file_poll, .seq_start = cgroup_seqfile_start, .seq_next = cgroup_seqfile_next, .seq_stop = cgroup_seqfile_stop, .seq_show = cgroup_seqfile_show, }; /* set uid and gid of cgroup dirs and files to that of the creator */ static int cgroup_kn_set_ugid(struct kernfs_node *kn) { struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, .ia_uid = current_fsuid(), .ia_gid = current_fsgid(), }; if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) return 0; return kernfs_setattr(kn, &iattr); } static void cgroup_file_notify_timer(struct timer_list *timer) { cgroup_file_notify(container_of(timer, struct cgroup_file, notify_timer)); } static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype *cft) { char name[CGROUP_FILE_NAME_MAX]; struct kernfs_node *kn; struct lock_class_key *key = NULL; int ret; #ifdef CONFIG_DEBUG_LOCK_ALLOC key = &cft->lockdep_key; #endif kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), cgroup_file_mode(cft), GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0, cft->kf_ops, cft, NULL, key); if (IS_ERR(kn)) return PTR_ERR(kn); ret = cgroup_kn_set_ugid(kn); if (ret) { kernfs_remove(kn); return ret; } if (cft->file_offset) { struct cgroup_file *cfile = (void *)css + cft->file_offset; timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0); spin_lock_irq(&cgroup_file_kn_lock); cfile->kn = kn; spin_unlock_irq(&cgroup_file_kn_lock); } return 0; } /** * cgroup_addrm_files - add or remove files to a cgroup directory * @css: the target css * @cgrp: the target cgroup (usually css->cgroup) * @cfts: array of cftypes to be added * @is_add: whether to add or remove * * Depending on @is_add, add or remove files defined by @cfts on @cgrp. * For removals, this function never fails. */ static int cgroup_addrm_files(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype cfts[], bool is_add) { struct cftype *cft, *cft_end = NULL; int ret = 0; lockdep_assert_held(&cgroup_mutex); restart: for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? */ if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) continue; if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp)) continue; if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp)) continue; if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp)) continue; if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug) continue; if (is_add) { ret = cgroup_add_file(css, cgrp, cft); if (ret) { pr_warn("%s: failed to add %s, err=%d\n", __func__, cft->name, ret); cft_end = cft; is_add = false; goto restart; } } else { cgroup_rm_file(cgrp, cft); } } return ret; } static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) { struct cgroup_subsys *ss = cfts[0].ss; struct cgroup *root = &ss->root->cgrp; struct cgroup_subsys_state *css; int ret = 0; lockdep_assert_held(&cgroup_mutex); /* add/rm files for all cgroups created before */ css_for_each_descendant_pre(css, cgroup_css(root, ss)) { struct cgroup *cgrp = css->cgroup; if (!(css->flags & CSS_VISIBLE)) continue; ret = cgroup_addrm_files(css, cgrp, cfts, is_add); if (ret) break; } if (is_add && !ret) kernfs_activate(root->kn); return ret; } static void cgroup_exit_cftypes(struct cftype *cfts) { struct cftype *cft; for (cft = cfts; cft->name[0] != '\0'; cft++) { /* free copy for custom atomic_write_len, see init_cftypes() */ if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) kfree(cft->kf_ops); cft->kf_ops = NULL; cft->ss = NULL; /* revert flags set by cgroup core while adding @cfts */ cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL | __CFTYPE_ADDED); } } static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype *cft; int ret = 0; for (cft = cfts; cft->name[0] != '\0'; cft++) { struct kernfs_ops *kf_ops; WARN_ON(cft->ss || cft->kf_ops); if (cft->flags & __CFTYPE_ADDED) { ret = -EBUSY; break; } if (cft->seq_start) kf_ops = &cgroup_kf_ops; else kf_ops = &cgroup_kf_single_ops; /* * Ugh... if @cft wants a custom max_write_len, we need to * make a copy of kf_ops to set its atomic_write_len. */ if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) { kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL); if (!kf_ops) { ret = -ENOMEM; break; } kf_ops->atomic_write_len = cft->max_write_len; } cft->kf_ops = kf_ops; cft->ss = ss; cft->flags |= __CFTYPE_ADDED; } if (ret) cgroup_exit_cftypes(cfts); return ret; } static void cgroup_rm_cftypes_locked(struct cftype *cfts) { lockdep_assert_held(&cgroup_mutex); list_del(&cfts->node); cgroup_apply_cftypes(cfts, false); cgroup_exit_cftypes(cfts); } /** * cgroup_rm_cftypes - remove an array of cftypes from a subsystem * @cfts: zero-length name terminated array of cftypes * * Unregister @cfts. Files described by @cfts are removed from all * existing cgroups and all future cgroups won't have them either. This * function can be called anytime whether @cfts' subsys is attached or not. * * Returns 0 on successful unregistration, -ENOENT if @cfts is not * registered. */ int cgroup_rm_cftypes(struct cftype *cfts) { if (!cfts || cfts[0].name[0] == '\0') return 0; if (!(cfts[0].flags & __CFTYPE_ADDED)) return -ENOENT; cgroup_lock(); cgroup_rm_cftypes_locked(cfts); cgroup_unlock(); return 0; } /** * cgroup_add_cftypes - add an array of cftypes to a subsystem * @ss: target cgroup subsystem * @cfts: zero-length name terminated array of cftypes * * Register @cfts to @ss. Files described by @cfts are created for all * existing cgroups to which @ss is attached and all future cgroups will * have them too. This function can be called anytime whether @ss is * attached or not. * * Returns 0 on successful registration, -errno on failure. Note that this * function currently returns 0 as long as @cfts registration is successful * even if some file creation attempts on existing cgroups fail. */ static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { int ret; if (!cgroup_ssid_enabled(ss->id)) return 0; if (!cfts || cfts[0].name[0] == '\0') return 0; ret = cgroup_init_cftypes(ss, cfts); if (ret) return ret; cgroup_lock(); list_add_tail(&cfts->node, &ss->cfts); ret = cgroup_apply_cftypes(cfts, true); if (ret) cgroup_rm_cftypes_locked(cfts); cgroup_unlock(); return ret; } /** * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy * @ss: target cgroup subsystem * @cfts: zero-length name terminated array of cftypes * * Similar to cgroup_add_cftypes() but the added files are only used for * the default hierarchy. */ int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype *cft; for (cft = cfts; cft && cft->name[0] != '\0'; cft++) cft->flags |= __CFTYPE_ONLY_ON_DFL; return cgroup_add_cftypes(ss, cfts); } /** * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies * @ss: target cgroup subsystem * @cfts: zero-length name terminated array of cftypes * * Similar to cgroup_add_cftypes() but the added files are only used for * the legacy hierarchies. */ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype *cft; for (cft = cfts; cft && cft->name[0] != '\0'; cft++) cft->flags |= __CFTYPE_NOT_ON_DFL; return cgroup_add_cftypes(ss, cfts); } /** * cgroup_file_notify - generate a file modified event for a cgroup_file * @cfile: target cgroup_file * * @cfile must have been obtained by setting cftype->file_offset. */ void cgroup_file_notify(struct cgroup_file *cfile) { unsigned long flags; spin_lock_irqsave(&cgroup_file_kn_lock, flags); if (cfile->kn) { unsigned long last = cfile->notified_at; unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV; if (time_in_range(jiffies, last, next)) { timer_reduce(&cfile->notify_timer, next); } else { kernfs_notify(cfile->kn); cfile->notified_at = jiffies; } } spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); } /** * cgroup_file_show - show or hide a hidden cgroup file * @cfile: target cgroup_file obtained by setting cftype->file_offset * @show: whether to show or hide */ void cgroup_file_show(struct cgroup_file *cfile, bool show) { struct kernfs_node *kn; spin_lock_irq(&cgroup_file_kn_lock); kn = cfile->kn; kernfs_get(kn); spin_unlock_irq(&cgroup_file_kn_lock); if (kn) kernfs_show(kn, show); kernfs_put(kn); } /** * css_next_child - find the next child of a given css * @pos: the current position (%NULL to initiate traversal) * @parent: css whose children to walk * * This function returns the next child of @parent and should be called * under either cgroup_mutex or RCU read lock. The only requirement is * that @parent and @pos are accessible. The next sibling is guaranteed to * be returned regardless of their states. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. */ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *parent) { struct cgroup_subsys_state *next; cgroup_assert_mutex_or_rcu_locked(); /* * @pos could already have been unlinked from the sibling list. * Once a cgroup is removed, its ->sibling.next is no longer * updated when its next sibling changes. CSS_RELEASED is set when * @pos is taken off list, at which time its next pointer is valid, * and, as releases are serialized, the one pointed to by the next * pointer is guaranteed to not have started release yet. This * implies that if we observe !CSS_RELEASED on @pos in this RCU * critical section, the one pointed to by its next pointer is * guaranteed to not have finished its RCU grace period even if we * have dropped rcu_read_lock() in-between iterations. * * If @pos has CSS_RELEASED set, its next pointer can't be * dereferenced; however, as each css is given a monotonically * increasing unique serial number and always appended to the * sibling list, the next one can be found by walking the parent's * children until the first css with higher serial number than * @pos's. While this path can be slower, it happens iff iteration * races against release and the race window is very small. */ if (!pos) { next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling); } else if (likely(!(pos->flags & CSS_RELEASED))) { next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling); } else { list_for_each_entry_rcu(next, &parent->children, sibling, lockdep_is_held(&cgroup_mutex)) if (next->serial_nr > pos->serial_nr) break; } /* * @next, if not pointing to the head, can be dereferenced and is * the next sibling. */ if (&next->sibling != &parent->children) return next; return NULL; } /** * css_next_descendant_pre - find the next descendant for pre-order walk * @pos: the current position (%NULL to initiate traversal) * @root: css whose descendants to walk * * To be used by css_for_each_descendant_pre(). Find the next descendant * to visit for pre-order traversal of @root's descendants. @root is * included in the iteration and the first node to be visited. * * While this function requires cgroup_mutex or RCU read locking, it * doesn't require the whole traversal to be contained in a single critical * section. This function will return the correct next descendant as long * as both @pos and @root are accessible and @pos is a descendant of @root. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. */ struct cgroup_subsys_state * css_next_descendant_pre(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *root) { struct cgroup_subsys_state *next; cgroup_assert_mutex_or_rcu_locked(); /* if first iteration, visit @root */ if (!pos) return root; /* visit the first child if exists */ next = css_next_child(NULL, pos); if (next) return next; /* no child, visit my or the closest ancestor's next sibling */ while (pos != root) { next = css_next_child(pos, pos->parent); if (next) return next; pos = pos->parent; } return NULL; } EXPORT_SYMBOL_GPL(css_next_descendant_pre); /** * css_rightmost_descendant - return the rightmost descendant of a css * @pos: css of interest * * Return the rightmost descendant of @pos. If there's no descendant, @pos * is returned. This can be used during pre-order traversal to skip * subtree of @pos. * * While this function requires cgroup_mutex or RCU read locking, it * doesn't require the whole traversal to be contained in a single critical * section. This function will return the correct rightmost descendant as * long as @pos is accessible. */ struct cgroup_subsys_state * css_rightmost_descendant(struct cgroup_subsys_state *pos) { struct cgroup_subsys_state *last, *tmp; cgroup_assert_mutex_or_rcu_locked(); do { last = pos; /* ->prev isn't RCU safe, walk ->next till the end */ pos = NULL; css_for_each_child(tmp, last) pos = tmp; } while (pos); return last; } static struct cgroup_subsys_state * css_leftmost_descendant(struct cgroup_subsys_state *pos) { struct cgroup_subsys_state *last; do { last = pos; pos = css_next_child(NULL, pos); } while (pos); return last; } /** * css_next_descendant_post - find the next descendant for post-order walk * @pos: the current position (%NULL to initiate traversal) * @root: css whose descendants to walk * * To be used by css_for_each_descendant_post(). Find the next descendant * to visit for post-order traversal of @root's descendants. @root is * included in the iteration and the last node to be visited. * * While this function requires cgroup_mutex or RCU read locking, it * doesn't require the whole traversal to be contained in a single critical * section. This function will return the correct next descendant as long * as both @pos and @cgroup are accessible and @pos is a descendant of * @cgroup. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. */ struct cgroup_subsys_state * css_next_descendant_post(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *root) { struct cgroup_subsys_state *next; cgroup_assert_mutex_or_rcu_locked(); /* if first iteration, visit leftmost descendant which may be @root */ if (!pos) return css_leftmost_descendant(root); /* if we visited @root, we're done */ if (pos == root) return NULL; /* if there's an unvisited sibling, visit its leftmost descendant */ next = css_next_child(pos, pos->parent); if (next) return css_leftmost_descendant(next); /* no sibling left, visit parent */ return pos->parent; } /** * css_has_online_children - does a css have online children * @css: the target css * * Returns %true if @css has any online children; otherwise, %false. This * function can be called from any context but the caller is responsible * for synchronizing against on/offlining as necessary. */ bool css_has_online_children(struct cgroup_subsys_state *css) { struct cgroup_subsys_state *child; bool ret = false; rcu_read_lock(); css_for_each_child(child, css) { if (child->flags & CSS_ONLINE) { ret = true; break; } } rcu_read_unlock(); return ret; } static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it) { struct list_head *l; struct cgrp_cset_link *link; struct css_set *cset; lockdep_assert_held(&css_set_lock); /* find the next threaded cset */ if (it->tcset_pos) { l = it->tcset_pos->next; if (l != it->tcset_head) { it->tcset_pos = l; return container_of(l, struct css_set, threaded_csets_node); } it->tcset_pos = NULL; } /* find the next cset */ l = it->cset_pos; l = l->next; if (l == it->cset_head) { it->cset_pos = NULL; return NULL; } if (it->ss) { cset = container_of(l, struct css_set, e_cset_node[it->ss->id]); } else { link = list_entry(l, struct cgrp_cset_link, cset_link); cset = link->cset; } it->cset_pos = l; /* initialize threaded css_set walking */ if (it->flags & CSS_TASK_ITER_THREADED) { if (it->cur_dcset) put_css_set_locked(it->cur_dcset); it->cur_dcset = cset; get_css_set(cset); it->tcset_head = &cset->threaded_csets; it->tcset_pos = &cset->threaded_csets; } return cset; } /** * css_task_iter_advance_css_set - advance a task iterator to the next css_set * @it: the iterator to advance * * Advance @it to the next css_set to walk. */ static void css_task_iter_advance_css_set(struct css_task_iter *it) { struct css_set *cset; lockdep_assert_held(&css_set_lock); /* Advance to the next non-empty css_set and find first non-empty tasks list*/ while ((cset = css_task_iter_next_css_set(it))) { if (!list_empty(&cset->tasks)) { it->cur_tasks_head = &cset->tasks; break; } else if (!list_empty(&cset->mg_tasks)) { it->cur_tasks_head = &cset->mg_tasks; break; } else if (!list_empty(&cset->dying_tasks)) { it->cur_tasks_head = &cset->dying_tasks; break; } } if (!cset) { it->task_pos = NULL; return; } it->task_pos = it->cur_tasks_head->next; /* * We don't keep css_sets locked across iteration steps and thus * need to take steps to ensure that iteration can be resumed after * the lock is re-acquired. Iteration is performed at two levels - * css_sets and tasks in them. * * Once created, a css_set never leaves its cgroup lists, so a * pinned css_set is guaranteed to stay put and we can resume * iteration afterwards. * * Tasks may leave @cset across iteration steps. This is resolved * by registering each iterator with the css_set currently being * walked and making css_set_move_task() advance iterators whose * next task is leaving. */ if (it->cur_cset) { list_del(&it->iters_node); put_css_set_locked(it->cur_cset); } get_css_set(cset); it->cur_cset = cset; list_add(&it->iters_node, &cset->task_iters); } static void css_task_iter_skip(struct css_task_iter *it, struct task_struct *task) { lockdep_assert_held(&css_set_lock); if (it->task_pos == &task->cg_list) { it->task_pos = it->task_pos->next; it->flags |= CSS_TASK_ITER_SKIPPED; } } static void css_task_iter_advance(struct css_task_iter *it) { struct task_struct *task; lockdep_assert_held(&css_set_lock); repeat: if (it->task_pos) { /* * Advance iterator to find next entry. We go through cset * tasks, mg_tasks and dying_tasks, when consumed we move onto * the next cset. */ if (it->flags & CSS_TASK_ITER_SKIPPED) it->flags &= ~CSS_TASK_ITER_SKIPPED; else it->task_pos = it->task_pos->next; if (it->task_pos == &it->cur_cset->tasks) { it->cur_tasks_head = &it->cur_cset->mg_tasks; it->task_pos = it->cur_tasks_head->next; } if (it->task_pos == &it->cur_cset->mg_tasks) { it->cur_tasks_head = &it->cur_cset->dying_tasks; it->task_pos = it->cur_tasks_head->next; } if (it->task_pos == &it->cur_cset->dying_tasks) css_task_iter_advance_css_set(it); } else { /* called from start, proceed to the first cset */ css_task_iter_advance_css_set(it); } if (!it->task_pos) return; task = list_entry(it->task_pos, struct task_struct, cg_list); if (it->flags & CSS_TASK_ITER_PROCS) { /* if PROCS, skip over tasks which aren't group leaders */ if (!thread_group_leader(task)) goto repeat; /* and dying leaders w/o live member threads */ if (it->cur_tasks_head == &it->cur_cset->dying_tasks && !atomic_read(&task->signal->live)) goto repeat; } else { /* skip all dying ones */ if (it->cur_tasks_head == &it->cur_cset->dying_tasks) goto repeat; } } /** * css_task_iter_start - initiate task iteration * @css: the css to walk tasks of * @flags: CSS_TASK_ITER_* flags * @it: the task iterator to use * * Initiate iteration through the tasks of @css. The caller can call * css_task_iter_next() to walk through the tasks until the function * returns NULL. On completion of iteration, css_task_iter_end() must be * called. */ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, struct css_task_iter *it) { memset(it, 0, sizeof(*it)); spin_lock_irq(&css_set_lock); it->ss = css->ss; it->flags = flags; if (CGROUP_HAS_SUBSYS_CONFIG && it->ss) it->cset_pos = &css->cgroup->e_csets[css->ss->id]; else it->cset_pos = &css->cgroup->cset_links; it->cset_head = it->cset_pos; css_task_iter_advance(it); spin_unlock_irq(&css_set_lock); } /** * css_task_iter_next - return the next task for the iterator * @it: the task iterator being iterated * * The "next" function for task iteration. @it should have been * initialized via css_task_iter_start(). Returns NULL when the iteration * reaches the end. */ struct task_struct *css_task_iter_next(struct css_task_iter *it) { if (it->cur_task) { put_task_struct(it->cur_task); it->cur_task = NULL; } spin_lock_irq(&css_set_lock); /* @it may be half-advanced by skips, finish advancing */ if (it->flags & CSS_TASK_ITER_SKIPPED) css_task_iter_advance(it); if (it->task_pos) { it->cur_task = list_entry(it->task_pos, struct task_struct, cg_list); get_task_struct(it->cur_task); css_task_iter_advance(it); } spin_unlock_irq(&css_set_lock); return it->cur_task; } /** * css_task_iter_end - finish task iteration * @it: the task iterator to finish * * Finish task iteration started by css_task_iter_start(). */ void css_task_iter_end(struct css_task_iter *it) { if (it->cur_cset) { spin_lock_irq(&css_set_lock); list_del(&it->iters_node); put_css_set_locked(it->cur_cset); spin_unlock_irq(&css_set_lock); } if (it->cur_dcset) put_css_set(it->cur_dcset); if (it->cur_task) put_task_struct(it->cur_task); } static void cgroup_procs_release(struct kernfs_open_file *of) { struct cgroup_file_ctx *ctx = of->priv; if (ctx->procs.started) css_task_iter_end(&ctx->procs.iter); } static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) { struct kernfs_open_file *of = s->private; struct cgroup_file_ctx *ctx = of->priv; if (pos) (*pos)++; return css_task_iter_next(&ctx->procs.iter); } static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, unsigned int iter_flags) { struct kernfs_open_file *of = s->private; struct cgroup *cgrp = seq_css(s)->cgroup; struct cgroup_file_ctx *ctx = of->priv; struct css_task_iter *it = &ctx->procs.iter; /* * When a seq_file is seeked, it's always traversed sequentially * from position 0, so we can simply keep iterating on !0 *pos. */ if (!ctx->procs.started) { if (WARN_ON_ONCE((*pos))) return ERR_PTR(-EINVAL); css_task_iter_start(&cgrp->self, iter_flags, it); ctx->procs.started = true; } else if (!(*pos)) { css_task_iter_end(it); css_task_iter_start(&cgrp->self, iter_flags, it); } else return it->cur_task; return cgroup_procs_next(s, NULL, NULL); } static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) { struct cgroup *cgrp = seq_css(s)->cgroup; /* * All processes of a threaded subtree belong to the domain cgroup * of the subtree. Only threads can be distributed across the * subtree. Reject reads on cgroup.procs in the subtree proper. * They're always empty anyway. */ if (cgroup_is_threaded(cgrp)) return ERR_PTR(-EOPNOTSUPP); return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED); } static int cgroup_procs_show(struct seq_file *s, void *v) { seq_printf(s, "%d\n", task_pid_vnr(v)); return 0; } static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb) { int ret; struct inode *inode; lockdep_assert_held(&cgroup_mutex); inode = kernfs_get_inode(sb, cgrp->procs_file.kn); if (!inode) return -ENOMEM; ret = inode_permission(&nop_mnt_idmap, inode, MAY_WRITE); iput(inode); return ret; } static int cgroup_procs_write_permission(struct cgroup *src_cgrp, struct cgroup *dst_cgrp, struct super_block *sb, struct cgroup_namespace *ns) { struct cgroup *com_cgrp = src_cgrp; int ret; lockdep_assert_held(&cgroup_mutex); /* find the common ancestor */ while (!cgroup_is_descendant(dst_cgrp, com_cgrp)) com_cgrp = cgroup_parent(com_cgrp); /* %current should be authorized to migrate to the common ancestor */ ret = cgroup_may_write(com_cgrp, sb); if (ret) return ret; /* * If namespaces are delegation boundaries, %current must be able * to see both source and destination cgroups from its namespace. */ if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) && (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) || !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp))) return -ENOENT; return 0; } static int cgroup_attach_permissions(struct cgroup *src_cgrp, struct cgroup *dst_cgrp, struct super_block *sb, bool threadgroup, struct cgroup_namespace *ns) { int ret = 0; ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns); if (ret) return ret; ret = cgroup_migrate_vet_dst(dst_cgrp); if (ret) return ret; if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)) ret = -EOPNOTSUPP; return ret; } static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, bool threadgroup) { struct cgroup_file_ctx *ctx = of->priv; struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; const struct cred *saved_cred; ssize_t ret; bool threadgroup_locked; dst_cgrp = cgroup_kn_lock_live(of->kn, false); if (!dst_cgrp) return -ENODEV; task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; /* find the source cgroup */ spin_lock_irq(&css_set_lock); src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); spin_unlock_irq(&css_set_lock); /* * Process and thread migrations follow same delegation rule. Check * permissions using the credentials from file open to protect against * inherited fd attacks. */ saved_cred = override_creds(of->file->f_cred); ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, of->file->f_path.dentry->d_sb, threadgroup, ctx->ns); revert_creds(saved_cred); if (ret) goto out_finish; ret = cgroup_attach_task(dst_cgrp, task, threadgroup); out_finish: cgroup_procs_write_finish(task, threadgroup_locked); out_unlock: cgroup_kn_unlock(of->kn); return ret; } static ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return __cgroup_procs_write(of, buf, true) ?: nbytes; } static void *cgroup_threads_start(struct seq_file *s, loff_t *pos) { return __cgroup_procs_start(s, pos, 0); } static ssize_t cgroup_threads_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return __cgroup_procs_write(of, buf, false) ?: nbytes; } /* cgroup core interface files for the default hierarchy */ static struct cftype cgroup_base_files[] = { { .name = "cgroup.type", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_type_show, .write = cgroup_type_write, }, { .name = "cgroup.procs", .flags = CFTYPE_NS_DELEGATABLE, .file_offset = offsetof(struct cgroup, procs_file), .release = cgroup_procs_release, .seq_start = cgroup_procs_start, .seq_next = cgroup_procs_next, .seq_show = cgroup_procs_show, .write = cgroup_procs_write, }, { .name = "cgroup.threads", .flags = CFTYPE_NS_DELEGATABLE, .release = cgroup_procs_release, .seq_start = cgroup_threads_start, .seq_next = cgroup_procs_next, .seq_show = cgroup_procs_show, .write = cgroup_threads_write, }, { .name = "cgroup.controllers", .seq_show = cgroup_controllers_show, }, { .name = "cgroup.subtree_control", .flags = CFTYPE_NS_DELEGATABLE, .seq_show = cgroup_subtree_control_show, .write = cgroup_subtree_control_write, }, { .name = "cgroup.events", .flags = CFTYPE_NOT_ON_ROOT, .file_offset = offsetof(struct cgroup, events_file), .seq_show = cgroup_events_show, }, { .name = "cgroup.max.descendants", .seq_show = cgroup_max_descendants_show, .write = cgroup_max_descendants_write, }, { .name = "cgroup.max.depth", .seq_show = cgroup_max_depth_show, .write = cgroup_max_depth_write, }, { .name = "cgroup.stat", .seq_show = cgroup_stat_show, }, { .name = "cgroup.freeze", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_freeze_show, .write = cgroup_freeze_write, }, { .name = "cgroup.kill", .flags = CFTYPE_NOT_ON_ROOT, .write = cgroup_kill_write, }, { .name = "cpu.stat", .seq_show = cpu_stat_show, }, { .name = "cpu.stat.local", .seq_show = cpu_local_stat_show, }, { } /* terminate */ }; static struct cftype cgroup_psi_files[] = { #ifdef CONFIG_PSI { .name = "io.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]), .open = cgroup_pressure_open, .seq_show = cgroup_io_pressure_show, .write = cgroup_io_pressure_write, .poll = cgroup_pressure_poll, .release = cgroup_pressure_release, }, { .name = "memory.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]), .open = cgroup_pressure_open, .seq_show = cgroup_memory_pressure_show, .write = cgroup_memory_pressure_write, .poll = cgroup_pressure_poll, .release = cgroup_pressure_release, }, { .name = "cpu.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]), .open = cgroup_pressure_open, .seq_show = cgroup_cpu_pressure_show, .write = cgroup_cpu_pressure_write, .poll = cgroup_pressure_poll, .release = cgroup_pressure_release, }, #ifdef CONFIG_IRQ_TIME_ACCOUNTING { .name = "irq.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]), .open = cgroup_pressure_open, .seq_show = cgroup_irq_pressure_show, .write = cgroup_irq_pressure_write, .poll = cgroup_pressure_poll, .release = cgroup_pressure_release, }, #endif { .name = "cgroup.pressure", .seq_show = cgroup_pressure_show, .write = cgroup_pressure_write, }, #endif /* CONFIG_PSI */ { } /* terminate */ }; /* * css destruction is four-stage process. * * 1. Destruction starts. Killing of the percpu_ref is initiated. * Implemented in kill_css(). * * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs * and thus css_tryget_online() is guaranteed to fail, the css can be * offlined by invoking offline_css(). After offlining, the base ref is * put. Implemented in css_killed_work_fn(). * * 3. When the percpu_ref reaches zero, the only possible remaining * accessors are inside RCU read sections. css_release() schedules the * RCU callback. * * 4. After the grace period, the css can be freed. Implemented in * css_free_rwork_fn(). * * It is actually hairier because both step 2 and 4 require process context * and thus involve punting to css->destroy_work adding two additional * steps to the already complex sequence. */ static void css_free_rwork_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(to_rcu_work(work), struct cgroup_subsys_state, destroy_rwork); struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; percpu_ref_exit(&css->refcnt); if (ss) { /* css free path */ struct cgroup_subsys_state *parent = css->parent; int id = css->id; ss->css_free(css); cgroup_idr_remove(&ss->css_idr, id); cgroup_put(cgrp); if (parent) css_put(parent); } else { /* cgroup free path */ atomic_dec(&cgrp->root->nr_cgrps); cgroup1_pidlist_destroy_all(cgrp); cancel_work_sync(&cgrp->release_agent_work); bpf_cgrp_storage_free(cgrp); if (cgroup_parent(cgrp)) { /* * We get a ref to the parent, and put the ref when * this cgroup is being freed, so it's guaranteed * that the parent won't be destroyed before its * children. */ cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); psi_cgroup_free(cgrp); cgroup_rstat_exit(cgrp); kfree(cgrp); } else { /* * This is root cgroup's refcnt reaching zero, * which indicates that the root should be * released. */ cgroup_destroy_root(cgrp->root); } } } static void css_release_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; cgroup_lock(); css->flags |= CSS_RELEASED; list_del_rcu(&css->sibling); if (ss) { /* css release path */ if (!list_empty(&css->rstat_css_node)) { cgroup_rstat_flush(cgrp); list_del_rcu(&css->rstat_css_node); } cgroup_idr_replace(&ss->css_idr, NULL, css->id); if (ss->css_released) ss->css_released(css); } else { struct cgroup *tcgrp; /* cgroup release path */ TRACE_CGROUP_PATH(release, cgrp); cgroup_rstat_flush(cgrp); spin_lock_irq(&css_set_lock); for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) tcgrp->nr_dying_descendants--; spin_unlock_irq(&css_set_lock); /* * There are two control paths which try to determine * cgroup from dentry without going through kernfs - * cgroupstats_build() and css_tryget_online_from_dir(). * Those are supported by RCU protecting clearing of * cgrp->kn->priv backpointer. */ if (cgrp->kn) RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); } cgroup_unlock(); INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); } static void css_release(struct percpu_ref *ref) { struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); INIT_WORK(&css->destroy_work, css_release_work_fn); queue_work(cgroup_destroy_wq, &css->destroy_work); } static void init_and_link_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, struct cgroup *cgrp) { lockdep_assert_held(&cgroup_mutex); cgroup_get_live(cgrp); memset(css, 0, sizeof(*css)); css->cgroup = cgrp; css->ss = ss; css->id = -1; INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); INIT_LIST_HEAD(&css->rstat_css_node); css->serial_nr = css_serial_nr_next++; atomic_set(&css->online_cnt, 0); if (cgroup_parent(cgrp)) { css->parent = cgroup_css(cgroup_parent(cgrp), ss); css_get(css->parent); } if (ss->css_rstat_flush) list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list); BUG_ON(cgroup_css(cgrp, ss)); } /* invoke ->css_online() on a new CSS and mark it online if successful */ static int online_css(struct cgroup_subsys_state *css) { struct cgroup_subsys *ss = css->ss; int ret = 0; lockdep_assert_held(&cgroup_mutex); if (ss->css_online) ret = ss->css_online(css); if (!ret) { css->flags |= CSS_ONLINE; rcu_assign_pointer(css->cgroup->subsys[ss->id], css); atomic_inc(&css->online_cnt); if (css->parent) atomic_inc(&css->parent->online_cnt); } return ret; } /* if the CSS is online, invoke ->css_offline() on it and mark it offline */ static void offline_css(struct cgroup_subsys_state *css) { struct cgroup_subsys *ss = css->ss; lockdep_assert_held(&cgroup_mutex); if (!(css->flags & CSS_ONLINE)) return; if (ss->css_offline) ss->css_offline(css); css->flags &= ~CSS_ONLINE; RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); wake_up_all(&css->cgroup->offline_waitq); } /** * css_create - create a cgroup_subsys_state * @cgrp: the cgroup new css will be associated with * @ss: the subsys of new css * * Create a new css associated with @cgrp - @ss pair. On success, the new * css is online and installed in @cgrp. This function doesn't create the * interface files. Returns 0 on success, -errno on failure. */ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, struct cgroup_subsys *ss) { struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss); struct cgroup_subsys_state *css; int err; lockdep_assert_held(&cgroup_mutex); css = ss->css_alloc(parent_css); if (!css) css = ERR_PTR(-ENOMEM); if (IS_ERR(css)) return css; init_and_link_css(css, ss, cgrp); err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL); if (err) goto err_free_css; err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL); if (err < 0) goto err_free_css; css->id = err; /* @css is ready to be brought online now, make it visible */ list_add_tail_rcu(&css->sibling, &parent_css->children); cgroup_idr_replace(&ss->css_idr, css, css->id); err = online_css(css); if (err) goto err_list_del; return css; err_list_del: list_del_rcu(&css->sibling); err_free_css: list_del_rcu(&css->rstat_css_node); INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); return ERR_PTR(err); } /* * The returned cgroup is fully initialized including its control mask, but * it doesn't have the control mask applied. */ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, umode_t mode) { struct cgroup_root *root = parent->root; struct cgroup *cgrp, *tcgrp; struct kernfs_node *kn; int level = parent->level + 1; int ret; /* allocate the cgroup and its ID, 0 is reserved for the root */ cgrp = kzalloc(struct_size(cgrp, ancestors, (level + 1)), GFP_KERNEL); if (!cgrp) return ERR_PTR(-ENOMEM); ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL); if (ret) goto out_free_cgrp; ret = cgroup_rstat_init(cgrp); if (ret) goto out_cancel_ref; /* create the directory */ kn = kernfs_create_dir(parent->kn, name, mode, cgrp); if (IS_ERR(kn)) { ret = PTR_ERR(kn); goto out_stat_exit; } cgrp->kn = kn; init_cgroup_housekeeping(cgrp); cgrp->self.parent = &parent->self; cgrp->root = root; cgrp->level = level; ret = psi_cgroup_alloc(cgrp); if (ret) goto out_kernfs_remove; ret = cgroup_bpf_inherit(cgrp); if (ret) goto out_psi_free; /* * New cgroup inherits effective freeze counter, and * if the parent has to be frozen, the child has too. */ cgrp->freezer.e_freeze = parent->freezer.e_freeze; if (cgrp->freezer.e_freeze) { /* * Set the CGRP_FREEZE flag, so when a process will be * attached to the child cgroup, it will become frozen. * At this point the new cgroup is unpopulated, so we can * consider it frozen immediately. */ set_bit(CGRP_FREEZE, &cgrp->flags); set_bit(CGRP_FROZEN, &cgrp->flags); } spin_lock_irq(&css_set_lock); for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { cgrp->ancestors[tcgrp->level] = tcgrp; if (tcgrp != cgrp) { tcgrp->nr_descendants++; /* * If the new cgroup is frozen, all ancestor cgroups * get a new frozen descendant, but their state can't * change because of this. */ if (cgrp->freezer.e_freeze) tcgrp->freezer.nr_frozen_descendants++; } } spin_unlock_irq(&css_set_lock); if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); cgrp->self.serial_nr = css_serial_nr_next++; /* allocation complete, commit to creation */ list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children); atomic_inc(&root->nr_cgrps); cgroup_get_live(parent); /* * On the default hierarchy, a child doesn't automatically inherit * subtree_control from the parent. Each is configured manually. */ if (!cgroup_on_dfl(cgrp)) cgrp->subtree_control = cgroup_control(cgrp); cgroup_propagate_control(cgrp); return cgrp; out_psi_free: psi_cgroup_free(cgrp); out_kernfs_remove: kernfs_remove(cgrp->kn); out_stat_exit: cgroup_rstat_exit(cgrp); out_cancel_ref: percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: kfree(cgrp); return ERR_PTR(ret); } static bool cgroup_check_hierarchy_limits(struct cgroup *parent) { struct cgroup *cgroup; int ret = false; int level = 1; lockdep_assert_held(&cgroup_mutex); for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) { if (cgroup->nr_descendants >= cgroup->max_descendants) goto fail; if (level > cgroup->max_depth) goto fail; level++; } ret = true; fail: return ret; } int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { struct cgroup *parent, *cgrp; int ret; /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */ if (strchr(name, '\n')) return -EINVAL; parent = cgroup_kn_lock_live(parent_kn, false); if (!parent) return -ENODEV; if (!cgroup_check_hierarchy_limits(parent)) { ret = -EAGAIN; goto out_unlock; } cgrp = cgroup_create(parent, name, mode); if (IS_ERR(cgrp)) { ret = PTR_ERR(cgrp); goto out_unlock; } /* * This extra ref will be put in cgroup_free_fn() and guarantees * that @cgrp->kn is always accessible. */ kernfs_get(cgrp->kn); ret = cgroup_kn_set_ugid(cgrp->kn); if (ret) goto out_destroy; ret = css_populate_dir(&cgrp->self); if (ret) goto out_destroy; ret = cgroup_apply_control_enable(cgrp); if (ret) goto out_destroy; TRACE_CGROUP_PATH(mkdir, cgrp); /* let's create and online css's */ kernfs_activate(cgrp->kn); ret = 0; goto out_unlock; out_destroy: cgroup_destroy_locked(cgrp); out_unlock: cgroup_kn_unlock(parent_kn); return ret; } /* * This is called when the refcnt of a css is confirmed to be killed. * css_tryget_online() is now guaranteed to fail. Tell the subsystem to * initiate destruction and put the css ref from kill_css(). */ static void css_killed_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, destroy_work); cgroup_lock(); do { offline_css(css); css_put(css); /* @css can't go away while we're holding cgroup_mutex */ css = css->parent; } while (css && atomic_dec_and_test(&css->online_cnt)); cgroup_unlock(); } /* css kill confirmation processing requires process context, bounce */ static void css_killed_ref_fn(struct percpu_ref *ref) { struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); if (atomic_dec_and_test(&css->online_cnt)) { INIT_WORK(&css->destroy_work, css_killed_work_fn); queue_work(cgroup_destroy_wq, &css->destroy_work); } } /** * kill_css - destroy a css * @css: css to destroy * * This function initiates destruction of @css by removing cgroup interface * files and putting its base reference. ->css_offline() will be invoked * asynchronously once css_tryget_online() is guaranteed to fail and when * the reference count reaches zero, @css will be released. */ static void kill_css(struct cgroup_subsys_state *css) { lockdep_assert_held(&cgroup_mutex); if (css->flags & CSS_DYING) return; css->flags |= CSS_DYING; /* * This must happen before css is disassociated with its cgroup. * See seq_css() for details. */ css_clear_dir(css); /* * Killing would put the base ref, but we need to keep it alive * until after ->css_offline(). */ css_get(css); /* * cgroup core guarantees that, by the time ->css_offline() is * invoked, no new css reference will be given out via * css_tryget_online(). We can't simply call percpu_ref_kill() and * proceed to offlining css's because percpu_ref_kill() doesn't * guarantee that the ref is seen as killed on all CPUs on return. * * Use percpu_ref_kill_and_confirm() to get notifications as each * css is confirmed to be seen as killed on all CPUs. */ percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); } /** * cgroup_destroy_locked - the first stage of cgroup destruction * @cgrp: cgroup to be destroyed * * css's make use of percpu refcnts whose killing latency shouldn't be * exposed to userland and are RCU protected. Also, cgroup core needs to * guarantee that css_tryget_online() won't succeed by the time * ->css_offline() is invoked. To satisfy all the requirements, * destruction is implemented in the following two steps. * * s1. Verify @cgrp can be destroyed and mark it dying. Remove all * userland visible parts and start killing the percpu refcnts of * css's. Set up so that the next stage will be kicked off once all * the percpu refcnts are confirmed to be killed. * * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the * rest of destruction. Once all cgroup references are gone, the * cgroup is RCU-freed. * * This function implements s1. After this step, @cgrp is gone as far as * the userland is concerned and a new cgroup with the same name may be * created. As cgroup doesn't care about the names internally, this * doesn't cause any problem. */ static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct cgroup *tcgrp, *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *css; struct cgrp_cset_link *link; int ssid; lockdep_assert_held(&cgroup_mutex); /* * Only migration can raise populated from zero and we're already * holding cgroup_mutex. */ if (cgroup_is_populated(cgrp)) return -EBUSY; /* * Make sure there's no live children. We can't test emptiness of * ->self.children as dead children linger on it while being * drained; otherwise, "rmdir parent/child parent" may fail. */ if (css_has_online_children(&cgrp->self)) return -EBUSY; /* * Mark @cgrp and the associated csets dead. The former prevents * further task migration and child creation by disabling * cgroup_kn_lock_live(). The latter makes the csets ignored by * the migration path. */ cgrp->self.flags &= ~CSS_ONLINE; spin_lock_irq(&css_set_lock); list_for_each_entry(link, &cgrp->cset_links, cset_link) link->cset->dead = true; spin_unlock_irq(&css_set_lock); /* initiate massacre of all css's */ for_each_css(css, ssid, cgrp) kill_css(css); /* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */ css_clear_dir(&cgrp->self); kernfs_remove(cgrp->kn); if (cgroup_is_threaded(cgrp)) parent->nr_threaded_children--; spin_lock_irq(&css_set_lock); for (tcgrp = parent; tcgrp; tcgrp = cgroup_parent(tcgrp)) { tcgrp->nr_descendants--; tcgrp->nr_dying_descendants++; /* * If the dying cgroup is frozen, decrease frozen descendants * counters of ancestor cgroups. */ if (test_bit(CGRP_FROZEN, &cgrp->flags)) tcgrp->freezer.nr_frozen_descendants--; } spin_unlock_irq(&css_set_lock); cgroup1_check_for_release(parent); cgroup_bpf_offline(cgrp); /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); return 0; }; int cgroup_rmdir(struct kernfs_node *kn) { struct cgroup *cgrp; int ret = 0; cgrp = cgroup_kn_lock_live(kn, false); if (!cgrp) return 0; ret = cgroup_destroy_locked(cgrp); if (!ret) TRACE_CGROUP_PATH(rmdir, cgrp); cgroup_kn_unlock(kn); return ret; } static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { .show_options = cgroup_show_options, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .show_path = cgroup_show_path, }; static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) { struct cgroup_subsys_state *css; pr_debug("Initializing cgroup subsys %s\n", ss->name); cgroup_lock(); idr_init(&ss->css_idr); INIT_LIST_HEAD(&ss->cfts); /* Create the root cgroup state for this subsystem */ ss->root = &cgrp_dfl_root; css = ss->css_alloc(NULL); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_and_link_css(css, ss, &cgrp_dfl_root.cgrp); /* * Root csses are never destroyed and we can't initialize * percpu_ref during early init. Disable refcnting. */ css->flags |= CSS_NO_REF; if (early) { /* allocation can't be done safely during early init */ css->id = 1; } else { css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); BUG_ON(css->id < 0); } /* Update the init_css_set to contain a subsys * pointer to this state - since the subsystem is * newly registered, all tasks and hence the * init_css_set is in the subsystem's root cgroup. */ init_css_set.subsys[ss->id] = css; have_fork_callback |= (bool)ss->fork << ss->id; have_exit_callback |= (bool)ss->exit << ss->id; have_release_callback |= (bool)ss->release << ss->id; have_canfork_callback |= (bool)ss->can_fork << ss->id; /* At system boot, before all subsystems have been * registered, no tasks have been forked, so we don't * need to invoke fork callbacks here. */ BUG_ON(!list_empty(&init_task.tasks)); BUG_ON(online_css(css)); cgroup_unlock(); } /** * cgroup_init_early - cgroup initialization at system boot * * Initialize cgroups at system boot, and initialize any * subsystems that request early init. */ int __init cgroup_init_early(void) { static struct cgroup_fs_context __initdata ctx; struct cgroup_subsys *ss; int i; ctx.root = &cgrp_dfl_root; init_cgroup_root(&ctx); cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF; RCU_INIT_POINTER(init_task.cgroups, &init_css_set); for_each_subsys(ss, i) { WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id, "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n", i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free, ss->id, ss->name); WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN, "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]); ss->id = i; ss->name = cgroup_subsys_name[i]; if (!ss->legacy_name) ss->legacy_name = cgroup_subsys_name[i]; if (ss->early_init) cgroup_init_subsys(ss, true); } return 0; } /** * cgroup_init - cgroup initialization * * Register cgroup filesystem and /proc file, and initialize * any subsystems that didn't request early init. */ int __init cgroup_init(void) { struct cgroup_subsys *ss; int ssid; BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); cgroup_rstat_boot(); get_user_ns(init_cgroup_ns.user_ns); cgroup_lock(); /* * Add init_css_set to the hash table so that dfl_root can link to * it during init. */ hash_add(css_set_table, &init_css_set.hlist, css_set_hash(init_css_set.subsys)); BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); cgroup_unlock(); for_each_subsys(ss, ssid) { if (ss->early_init) { struct cgroup_subsys_state *css = init_css_set.subsys[ss->id]; css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); BUG_ON(css->id < 0); } else { cgroup_init_subsys(ss, false); } list_add_tail(&init_css_set.e_cset_node[ssid], &cgrp_dfl_root.cgrp.e_csets[ssid]); /* * Setting dfl_root subsys_mask needs to consider the * disabled flag and cftype registration needs kmalloc, * both of which aren't available during early_init. */ if (!cgroup_ssid_enabled(ssid)) continue; if (cgroup1_ssid_disabled(ssid)) pr_info("Disabling %s control group subsystem in v1 mounts\n", ss->name); cgrp_dfl_root.subsys_mask |= 1 << ss->id; /* implicit controllers must be threaded too */ WARN_ON(ss->implicit_on_dfl && !ss->threaded); if (ss->implicit_on_dfl) cgrp_dfl_implicit_ss_mask |= 1 << ss->id; else if (!ss->dfl_cftypes) cgrp_dfl_inhibit_ss_mask |= 1 << ss->id; if (ss->threaded) cgrp_dfl_threaded_ss_mask |= 1 << ss->id; if (ss->dfl_cftypes == ss->legacy_cftypes) { WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes)); } else { WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes)); WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes)); } if (ss->bind) ss->bind(init_css_set.subsys[ssid]); cgroup_lock(); css_populate_dir(init_css_set.subsys[ssid]); cgroup_unlock(); } /* init_css_set.subsys[] has been updated, re-hash */ hash_del(&init_css_set.hlist); hash_add(css_set_table, &init_css_set.hlist, css_set_hash(init_css_set.subsys)); WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup")); WARN_ON(register_filesystem(&cgroup_fs_type)); WARN_ON(register_filesystem(&cgroup2_fs_type)); WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show)); #ifdef CONFIG_CPUSETS WARN_ON(register_filesystem(&cpuset_fs_type)); #endif return 0; } static int __init cgroup_wq_init(void) { /* * There isn't much point in executing destruction path in * parallel. Good chunk is serialized with cgroup_mutex anyway. * Use 1 for @max_active. * * We would prefer to do this in cgroup_init() above, but that * is called before init_workqueues(): so leave this until after. */ cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); BUG_ON(!cgroup_destroy_wq); return 0; } core_initcall(cgroup_wq_init); void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) { struct kernfs_node *kn; kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id); if (!kn) return; kernfs_path(kn, buf, buflen); kernfs_put(kn); } /* * cgroup_get_from_id : get the cgroup associated with cgroup id * @id: cgroup id * On success return the cgrp or ERR_PTR on failure * Only cgroups within current task's cgroup NS are valid. */ struct cgroup *cgroup_get_from_id(u64 id) { struct kernfs_node *kn; struct cgroup *cgrp, *root_cgrp; kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id); if (!kn) return ERR_PTR(-ENOENT); if (kernfs_type(kn) != KERNFS_DIR) { kernfs_put(kn); return ERR_PTR(-ENOENT); } rcu_read_lock(); cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); if (cgrp && !cgroup_tryget(cgrp)) cgrp = NULL; rcu_read_unlock(); kernfs_put(kn); if (!cgrp) return ERR_PTR(-ENOENT); root_cgrp = current_cgns_cgroup_dfl(); if (!cgroup_is_descendant(cgrp, root_cgrp)) { cgroup_put(cgrp); return ERR_PTR(-ENOENT); } return cgrp; } EXPORT_SYMBOL_GPL(cgroup_get_from_id); /* * proc_cgroup_show() * - Print task's cgroup paths into seq_file, one line for each hierarchy * - Used for /proc/<pid>/cgroup. */ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk) { char *buf; int retval; struct cgroup_root *root; retval = -ENOMEM; buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) goto out; cgroup_lock(); spin_lock_irq(&css_set_lock); for_each_root(root) { struct cgroup_subsys *ss; struct cgroup *cgrp; int ssid, count = 0; if (root == &cgrp_dfl_root && !READ_ONCE(cgrp_dfl_visible)) continue; seq_printf(m, "%d:", root->hierarchy_id); if (root != &cgrp_dfl_root) for_each_subsys(ss, ssid) if (root->subsys_mask & (1 << ssid)) seq_printf(m, "%s%s", count++ ? "," : "", ss->legacy_name); if (strlen(root->name)) seq_printf(m, "%sname=%s", count ? "," : "", root->name); seq_putc(m, ':'); cgrp = task_cgroup_from_root(tsk, root); /* * On traditional hierarchies, all zombie tasks show up as * belonging to the root cgroup. On the default hierarchy, * while a zombie doesn't show up in "cgroup.procs" and * thus can't be migrated, its /proc/PID/cgroup keeps * reporting the cgroup it belonged to before exiting. If * the cgroup is removed before the zombie is reaped, * " (deleted)" is appended to the cgroup path. */ if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, current->nsproxy->cgroup_ns); if (retval >= PATH_MAX) retval = -ENAMETOOLONG; if (retval < 0) goto out_unlock; seq_puts(m, buf); } else { seq_puts(m, "/"); } if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp)) seq_puts(m, " (deleted)\n"); else seq_putc(m, '\n'); } retval = 0; out_unlock: spin_unlock_irq(&css_set_lock); cgroup_unlock(); kfree(buf); out: return retval; } /** * cgroup_fork - initialize cgroup related fields during copy_process() * @child: pointer to task_struct of forking parent process. * * A task is associated with the init_css_set until cgroup_post_fork() * attaches it to the target css_set. */ void cgroup_fork(struct task_struct *child) { RCU_INIT_POINTER(child->cgroups, &init_css_set); INIT_LIST_HEAD(&child->cg_list); } /** * cgroup_v1v2_get_from_file - get a cgroup pointer from a file pointer * @f: file corresponding to cgroup_dir * * Find the cgroup from a file pointer associated with a cgroup directory. * Returns a pointer to the cgroup on success. ERR_PTR is returned if the * cgroup cannot be found. */ static struct cgroup *cgroup_v1v2_get_from_file(struct file *f) { struct cgroup_subsys_state *css; css = css_tryget_online_from_dir(f->f_path.dentry, NULL); if (IS_ERR(css)) return ERR_CAST(css); return css->cgroup; } /** * cgroup_get_from_file - same as cgroup_v1v2_get_from_file, but only supports * cgroup2. * @f: file corresponding to cgroup2_dir */ static struct cgroup *cgroup_get_from_file(struct file *f) { struct cgroup *cgrp = cgroup_v1v2_get_from_file(f); if (IS_ERR(cgrp)) return ERR_CAST(cgrp); if (!cgroup_on_dfl(cgrp)) { cgroup_put(cgrp); return ERR_PTR(-EBADF); } return cgrp; } /** * cgroup_css_set_fork - find or create a css_set for a child process * @kargs: the arguments passed to create the child process * * This functions finds or creates a new css_set which the child * process will be attached to in cgroup_post_fork(). By default, * the child process will be given the same css_set as its parent. * * If CLONE_INTO_CGROUP is specified this function will try to find an * existing css_set which includes the requested cgroup and if not create * a new css_set that the child will be attached to later. If this function * succeeds it will hold cgroup_threadgroup_rwsem on return. If * CLONE_INTO_CGROUP is requested this function will grab cgroup mutex * before grabbing cgroup_threadgroup_rwsem and will hold a reference * to the target cgroup. */ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) __acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem) { int ret; struct cgroup *dst_cgrp = NULL; struct css_set *cset; struct super_block *sb; struct file *f; if (kargs->flags & CLONE_INTO_CGROUP) cgroup_lock(); cgroup_threadgroup_change_begin(current); spin_lock_irq(&css_set_lock); cset = task_css_set(current); get_css_set(cset); spin_unlock_irq(&css_set_lock); if (!(kargs->flags & CLONE_INTO_CGROUP)) { kargs->cset = cset; return 0; } f = fget_raw(kargs->cgroup); if (!f) { ret = -EBADF; goto err; } sb = f->f_path.dentry->d_sb; dst_cgrp = cgroup_get_from_file(f); if (IS_ERR(dst_cgrp)) { ret = PTR_ERR(dst_cgrp); dst_cgrp = NULL; goto err; } if (cgroup_is_dead(dst_cgrp)) { ret = -ENODEV; goto err; } /* * Verify that we the target cgroup is writable for us. This is * usually done by the vfs layer but since we're not going through * the vfs layer here we need to do it "manually". */ ret = cgroup_may_write(dst_cgrp, sb); if (ret) goto err; /* * Spawning a task directly into a cgroup works by passing a file * descriptor to the target cgroup directory. This can even be an O_PATH * file descriptor. But it can never be a cgroup.procs file descriptor. * This was done on purpose so spawning into a cgroup could be * conceptualized as an atomic * * fd = openat(dfd_cgroup, "cgroup.procs", ...); * write(fd, <child-pid>, ...); * * sequence, i.e. it's a shorthand for the caller opening and writing * cgroup.procs of the cgroup indicated by @dfd_cgroup. This allows us * to always use the caller's credentials. */ ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb, !(kargs->flags & CLONE_THREAD), current->nsproxy->cgroup_ns); if (ret) goto err; kargs->cset = find_css_set(cset, dst_cgrp); if (!kargs->cset) { ret = -ENOMEM; goto err; } put_css_set(cset); fput(f); kargs->cgrp = dst_cgrp; return ret; err: cgroup_threadgroup_change_end(current); cgroup_unlock(); if (f) fput(f); if (dst_cgrp) cgroup_put(dst_cgrp); put_css_set(cset); if (kargs->cset) put_css_set(kargs->cset); return ret; } /** * cgroup_css_set_put_fork - drop references we took during fork * @kargs: the arguments passed to create the child process * * Drop references to the prepared css_set and target cgroup if * CLONE_INTO_CGROUP was requested. */ static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs) __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) { struct cgroup *cgrp = kargs->cgrp; struct css_set *cset = kargs->cset; cgroup_threadgroup_change_end(current); if (cset) { put_css_set(cset); kargs->cset = NULL; } if (kargs->flags & CLONE_INTO_CGROUP) { cgroup_unlock(); if (cgrp) { cgroup_put(cgrp); kargs->cgrp = NULL; } } } /** * cgroup_can_fork - called on a new task before the process is exposed * @child: the child process * @kargs: the arguments passed to create the child process * * This prepares a new css_set for the child process which the child will * be attached to in cgroup_post_fork(). * This calls the subsystem can_fork() callbacks. If the cgroup_can_fork() * callback returns an error, the fork aborts with that error code. This * allows for a cgroup subsystem to conditionally allow or deny new forks. */ int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs) { struct cgroup_subsys *ss; int i, j, ret; ret = cgroup_css_set_fork(kargs); if (ret) return ret; do_each_subsys_mask(ss, i, have_canfork_callback) { ret = ss->can_fork(child, kargs->cset); if (ret) goto out_revert; } while_each_subsys_mask(); return 0; out_revert: for_each_subsys(ss, j) { if (j >= i) break; if (ss->cancel_fork) ss->cancel_fork(child, kargs->cset); } cgroup_css_set_put_fork(kargs); return ret; } /** * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork() * @child: the child process * @kargs: the arguments passed to create the child process * * This calls the cancel_fork() callbacks if a fork failed *after* * cgroup_can_fork() succeeded and cleans up references we took to * prepare a new css_set for the child process in cgroup_can_fork(). */ void cgroup_cancel_fork(struct task_struct *child, struct kernel_clone_args *kargs) { struct cgroup_subsys *ss; int i; for_each_subsys(ss, i) if (ss->cancel_fork) ss->cancel_fork(child, kargs->cset); cgroup_css_set_put_fork(kargs); } /** * cgroup_post_fork - finalize cgroup setup for the child process * @child: the child process * @kargs: the arguments passed to create the child process * * Attach the child process to its css_set calling the subsystem fork() * callbacks. */ void cgroup_post_fork(struct task_struct *child, struct kernel_clone_args *kargs) __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) { unsigned long cgrp_flags = 0; bool kill = false; struct cgroup_subsys *ss; struct css_set *cset; int i; cset = kargs->cset; kargs->cset = NULL; spin_lock_irq(&css_set_lock); /* init tasks are special, only link regular threads */ if (likely(child->pid)) { if (kargs->cgrp) cgrp_flags = kargs->cgrp->flags; else cgrp_flags = cset->dfl_cgrp->flags; WARN_ON_ONCE(!list_empty(&child->cg_list)); cset->nr_tasks++; css_set_move_task(child, NULL, cset, false); } else { put_css_set(cset); cset = NULL; } if (!(child->flags & PF_KTHREAD)) { if (unlikely(test_bit(CGRP_FREEZE, &cgrp_flags))) { /* * If the cgroup has to be frozen, the new task has * too. Let's set the JOBCTL_TRAP_FREEZE jobctl bit to * get the task into the frozen state. */ spin_lock(&child->sighand->siglock); WARN_ON_ONCE(child->frozen); child->jobctl |= JOBCTL_TRAP_FREEZE; spin_unlock(&child->sighand->siglock); /* * Calling cgroup_update_frozen() isn't required here, * because it will be called anyway a bit later from * do_freezer_trap(). So we avoid cgroup's transient * switch from the frozen state and back. */ } /* * If the cgroup is to be killed notice it now and take the * child down right after we finished preparing it for * userspace. */ kill = test_bit(CGRP_KILL, &cgrp_flags); } spin_unlock_irq(&css_set_lock); /* * Call ss->fork(). This must happen after @child is linked on * css_set; otherwise, @child might change state between ->fork() * and addition to css_set. */ do_each_subsys_mask(ss, i, have_fork_callback) { ss->fork(child); } while_each_subsys_mask(); /* Make the new cset the root_cset of the new cgroup namespace. */ if (kargs->flags & CLONE_NEWCGROUP) { struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset; get_css_set(cset); child->nsproxy->cgroup_ns->root_cset = cset; put_css_set(rcset); } /* Cgroup has to be killed so take down child immediately. */ if (unlikely(kill)) do_send_sig_info(SIGKILL, SEND_SIG_NOINFO, child, PIDTYPE_TGID); cgroup_css_set_put_fork(kargs); } /** * cgroup_exit - detach cgroup from exiting task * @tsk: pointer to task_struct of exiting process * * Description: Detach cgroup from @tsk. * */ void cgroup_exit(struct task_struct *tsk) { struct cgroup_subsys *ss; struct css_set *cset; int i; spin_lock_irq(&css_set_lock); WARN_ON_ONCE(list_empty(&tsk->cg_list)); cset = task_css_set(tsk); css_set_move_task(tsk, cset, NULL, false); list_add_tail(&tsk->cg_list, &cset->dying_tasks); cset->nr_tasks--; if (dl_task(tsk)) dec_dl_tasks_cs(tsk); WARN_ON_ONCE(cgroup_task_frozen(tsk)); if (unlikely(!(tsk->flags & PF_KTHREAD) && test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags))) cgroup_update_frozen(task_dfl_cgroup(tsk)); spin_unlock_irq(&css_set_lock); /* see cgroup_post_fork() for details */ do_each_subsys_mask(ss, i, have_exit_callback) { ss->exit(tsk); } while_each_subsys_mask(); } void cgroup_release(struct task_struct *task) { struct cgroup_subsys *ss; int ssid; do_each_subsys_mask(ss, ssid, have_release_callback) { ss->release(task); } while_each_subsys_mask(); spin_lock_irq(&css_set_lock); css_set_skip_task_iters(task_css_set(task), task); list_del_init(&task->cg_list); spin_unlock_irq(&css_set_lock); } void cgroup_free(struct task_struct *task) { struct css_set *cset = task_css_set(task); put_css_set(cset); } static int __init cgroup_disable(char *str) { struct cgroup_subsys *ss; char *token; int i; while ((token = strsep(&str, ",")) != NULL) { if (!*token) continue; for_each_subsys(ss, i) { if (strcmp(token, ss->name) && strcmp(token, ss->legacy_name)) continue; static_branch_disable(cgroup_subsys_enabled_key[i]); pr_info("Disabling %s control group subsystem\n", ss->name); } for (i = 0; i < OPT_FEATURE_COUNT; i++) { if (strcmp(token, cgroup_opt_feature_names[i])) continue; cgroup_feature_disable_mask |= 1 << i; pr_info("Disabling %s control group feature\n", cgroup_opt_feature_names[i]); break; } } return 1; } __setup("cgroup_disable=", cgroup_disable); void __init __weak enable_debug_cgroup(void) { } static int __init enable_cgroup_debug(char *str) { cgroup_debug = true; enable_debug_cgroup(); return 1; } __setup("cgroup_debug", enable_cgroup_debug); /** * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest * @ss: subsystem of interest * * If @dentry is a directory for a cgroup which has @ss enabled on it, try * to get the corresponding css and return it. If such css doesn't exist * or can't be pinned, an ERR_PTR value is returned. */ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys *ss) { struct kernfs_node *kn = kernfs_node_from_dentry(dentry); struct file_system_type *s_type = dentry->d_sb->s_type; struct cgroup_subsys_state *css = NULL; struct cgroup *cgrp; /* is @dentry a cgroup dir? */ if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) || !kn || kernfs_type(kn) != KERNFS_DIR) return ERR_PTR(-EBADF); rcu_read_lock(); /* * This path doesn't originate from kernfs and @kn could already * have been or be removed at any point. @kn->priv is RCU * protected for this access. See css_release_work_fn() for details. */ cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); if (cgrp) css = cgroup_css(cgrp, ss); if (!css || !css_tryget_online(css)) css = ERR_PTR(-ENOENT); rcu_read_unlock(); return css; } /** * css_from_id - lookup css by id * @id: the cgroup id * @ss: cgroup subsys to be looked into * * Returns the css if there's valid one with @id, otherwise returns NULL. * Should be called under rcu_read_lock(). */ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) { WARN_ON_ONCE(!rcu_read_lock_held()); return idr_find(&ss->css_idr, id); } /** * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path * @path: path on the default hierarchy * * Find the cgroup at @path on the default hierarchy, increment its * reference count and return it. Returns pointer to the found cgroup on * success, ERR_PTR(-ENOENT) if @path doesn't exist or if the cgroup has already * been released and ERR_PTR(-ENOTDIR) if @path points to a non-directory. */ struct cgroup *cgroup_get_from_path(const char *path) { struct kernfs_node *kn; struct cgroup *cgrp = ERR_PTR(-ENOENT); struct cgroup *root_cgrp; root_cgrp = current_cgns_cgroup_dfl(); kn = kernfs_walk_and_get(root_cgrp->kn, path); if (!kn) goto out; if (kernfs_type(kn) != KERNFS_DIR) { cgrp = ERR_PTR(-ENOTDIR); goto out_kernfs; } rcu_read_lock(); cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); if (!cgrp || !cgroup_tryget(cgrp)) cgrp = ERR_PTR(-ENOENT); rcu_read_unlock(); out_kernfs: kernfs_put(kn); out: return cgrp; } EXPORT_SYMBOL_GPL(cgroup_get_from_path); /** * cgroup_v1v2_get_from_fd - get a cgroup pointer from a fd * @fd: fd obtained by open(cgroup_dir) * * Find the cgroup from a fd which should be obtained * by opening a cgroup directory. Returns a pointer to the * cgroup on success. ERR_PTR is returned if the cgroup * cannot be found. */ struct cgroup *cgroup_v1v2_get_from_fd(int fd) { struct cgroup *cgrp; struct fd f = fdget_raw(fd); if (!f.file) return ERR_PTR(-EBADF); cgrp = cgroup_v1v2_get_from_file(f.file); fdput(f); return cgrp; } /** * cgroup_get_from_fd - same as cgroup_v1v2_get_from_fd, but only supports * cgroup2. * @fd: fd obtained by open(cgroup2_dir) */ struct cgroup *cgroup_get_from_fd(int fd) { struct cgroup *cgrp = cgroup_v1v2_get_from_fd(fd); if (IS_ERR(cgrp)) return ERR_CAST(cgrp); if (!cgroup_on_dfl(cgrp)) { cgroup_put(cgrp); return ERR_PTR(-EBADF); } return cgrp; } EXPORT_SYMBOL_GPL(cgroup_get_from_fd); static u64 power_of_ten(int power) { u64 v = 1; while (power--) v *= 10; return v; } /** * cgroup_parse_float - parse a floating number * @input: input string * @dec_shift: number of decimal digits to shift * @v: output * * Parse a decimal floating point number in @input and store the result in * @v with decimal point right shifted @dec_shift times. For example, if * @input is "12.3456" and @dec_shift is 3, *@v will be set to 12345. * Returns 0 on success, -errno otherwise. * * There's nothing cgroup specific about this function except that it's * currently the only user. */ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v) { s64 whole, frac = 0; int fstart = 0, fend = 0, flen; if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend)) return -EINVAL; if (frac < 0) return -EINVAL; flen = fend > fstart ? fend - fstart : 0; if (flen < dec_shift) frac *= power_of_ten(dec_shift - flen); else frac = DIV_ROUND_CLOSEST_ULL(frac, power_of_ten(flen - dec_shift)); *v = whole * power_of_ten(dec_shift) + frac; return 0; } /* * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data * definition in cgroup-defs.h. */ #ifdef CONFIG_SOCK_CGROUP_DATA void cgroup_sk_alloc(struct sock_cgroup_data *skcd) { struct cgroup *cgroup; rcu_read_lock(); /* Don't associate the sock with unrelated interrupted task's cgroup. */ if (in_interrupt()) { cgroup = &cgrp_dfl_root.cgrp; cgroup_get(cgroup); goto out; } while (true) { struct css_set *cset; cset = task_css_set(current); if (likely(cgroup_tryget(cset->dfl_cgrp))) { cgroup = cset->dfl_cgrp; break; } cpu_relax(); } out: skcd->cgroup = cgroup; cgroup_bpf_get(cgroup); rcu_read_unlock(); } void cgroup_sk_clone(struct sock_cgroup_data *skcd) { struct cgroup *cgrp = sock_cgroup_ptr(skcd); /* * We might be cloning a socket which is left in an empty * cgroup and the cgroup might have already been rmdir'd. * Don't use cgroup_get_live(). */ cgroup_get(cgrp); cgroup_bpf_get(cgrp); } void cgroup_sk_free(struct sock_cgroup_data *skcd) { struct cgroup *cgrp = sock_cgroup_ptr(skcd); cgroup_bpf_put(cgrp); cgroup_put(cgrp); } #endif /* CONFIG_SOCK_CGROUP_DATA */ #ifdef CONFIG_SYSFS static ssize_t show_delegatable_files(struct cftype *files, char *buf, ssize_t size, const char *prefix) { struct cftype *cft; ssize_t ret = 0; for (cft = files; cft && cft->name[0] != '\0'; cft++) { if (!(cft->flags & CFTYPE_NS_DELEGATABLE)) continue; if (prefix) ret += snprintf(buf + ret, size - ret, "%s.", prefix); ret += snprintf(buf + ret, size - ret, "%s\n", cft->name); if (WARN_ON(ret >= size)) break; } return ret; } static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct cgroup_subsys *ss; int ssid; ssize_t ret = 0; ret = show_delegatable_files(cgroup_base_files, buf + ret, PAGE_SIZE - ret, NULL); if (cgroup_psi_enabled()) ret += show_delegatable_files(cgroup_psi_files, buf + ret, PAGE_SIZE - ret, NULL); for_each_subsys(ss, ssid) ret += show_delegatable_files(ss->dfl_cftypes, buf + ret, PAGE_SIZE - ret, cgroup_subsys_name[ssid]); return ret; } static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate); static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return snprintf(buf, PAGE_SIZE, "nsdelegate\n" "favordynmods\n" "memory_localevents\n" "memory_recursiveprot\n"); } static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); static struct attribute *cgroup_sysfs_attrs[] = { &cgroup_delegate_attr.attr, &cgroup_features_attr.attr, NULL, }; static const struct attribute_group cgroup_sysfs_attr_group = { .attrs = cgroup_sysfs_attrs, .name = "cgroup", }; static int __init cgroup_sysfs_init(void) { return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group); } subsys_initcall(cgroup_sysfs_init); #endif /* CONFIG_SYSFS */
1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 /* vcan.c - Virtual CAN interface * * Copyright (c) 2002-2017 Volkswagen Group Electronic Research * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Volkswagen nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * Alternatively, provided that this notice is retained in full, this * software may be distributed under the terms of the GNU General * Public License ("GPL") version 2, in which case the provisions of the * GPL apply INSTEAD OF those given above. * * The provided data structures and external interfaces from this code * are not restricted to be used by modules with a GPL compatible license. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/ethtool.h> #include <linux/module.h> #include <linux/init.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/can.h> #include <linux/can/can-ml.h> #include <linux/can/dev.h> #include <linux/can/skb.h> #include <linux/slab.h> #include <net/rtnetlink.h> #define DRV_NAME "vcan" MODULE_DESCRIPTION("virtual CAN interface"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Urs Thuermann <urs.thuermann@volkswagen.de>"); MODULE_ALIAS_RTNL_LINK(DRV_NAME); /* CAN test feature: * Enable the echo on driver level for testing the CAN core echo modes. * See Documentation/networking/can.rst for details. */ static bool echo; /* echo testing. Default: 0 (Off) */ module_param(echo, bool, 0444); MODULE_PARM_DESC(echo, "Echo sent frames (for testing). Default: 0 (Off)"); static void vcan_rx(struct sk_buff *skb, struct net_device *dev) { struct net_device_stats *stats = &dev->stats; stats->rx_packets++; stats->rx_bytes += can_skb_get_data_len(skb); skb->pkt_type = PACKET_BROADCAST; skb->dev = dev; skb->ip_summed = CHECKSUM_UNNECESSARY; netif_rx(skb); } static netdev_tx_t vcan_tx(struct sk_buff *skb, struct net_device *dev) { struct net_device_stats *stats = &dev->stats; unsigned int len; int loop; if (can_dropped_invalid_skb(dev, skb)) return NETDEV_TX_OK; len = can_skb_get_data_len(skb); stats->tx_packets++; stats->tx_bytes += len; /* set flag whether this packet has to be looped back */ loop = skb->pkt_type == PACKET_LOOPBACK; skb_tx_timestamp(skb); if (!echo) { /* no echo handling available inside this driver */ if (loop) { /* only count the packets here, because the * CAN core already did the echo for us */ stats->rx_packets++; stats->rx_bytes += len; } consume_skb(skb); return NETDEV_TX_OK; } /* perform standard echo handling for CAN network interfaces */ if (loop) { skb = can_create_echo_skb(skb); if (!skb) return NETDEV_TX_OK; /* receive with packet counting */ vcan_rx(skb, dev); } else { /* no looped packets => no counting */ consume_skb(skb); } return NETDEV_TX_OK; } static int vcan_change_mtu(struct net_device *dev, int new_mtu) { /* Do not allow changing the MTU while running */ if (dev->flags & IFF_UP) return -EBUSY; if (new_mtu != CAN_MTU && new_mtu != CANFD_MTU && !can_is_canxl_dev_mtu(new_mtu)) return -EINVAL; dev->mtu = new_mtu; return 0; } static const struct net_device_ops vcan_netdev_ops = { .ndo_start_xmit = vcan_tx, .ndo_change_mtu = vcan_change_mtu, }; static const struct ethtool_ops vcan_ethtool_ops = { .get_ts_info = ethtool_op_get_ts_info, }; static void vcan_setup(struct net_device *dev) { dev->type = ARPHRD_CAN; dev->mtu = CANFD_MTU; dev->hard_header_len = 0; dev->addr_len = 0; dev->tx_queue_len = 0; dev->flags = IFF_NOARP; can_set_ml_priv(dev, netdev_priv(dev)); /* set flags according to driver capabilities */ if (echo) dev->flags |= IFF_ECHO; dev->netdev_ops = &vcan_netdev_ops; dev->ethtool_ops = &vcan_ethtool_ops; dev->needs_free_netdev = true; } static struct rtnl_link_ops vcan_link_ops __read_mostly = { .kind = DRV_NAME, .priv_size = sizeof(struct can_ml_priv), .setup = vcan_setup, }; static __init int vcan_init_module(void) { pr_info("Virtual CAN interface driver\n"); if (echo) pr_info("enabled echo on driver level.\n"); return rtnl_link_register(&vcan_link_ops); } static __exit void vcan_cleanup_module(void) { rtnl_link_unregister(&vcan_link_ops); } module_init(vcan_init_module); module_exit(vcan_cleanup_module);
132 132 132 132 132 142 142 141 142 143 142 132 132 132 132 132 132 132 132 132 143 132 132 132 132 132 143 143 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 // SPDX-License-Identifier: GPL-2.0 /* * FPU signal frame handling routines. */ #include <linux/compat.h> #include <linux/cpu.h> #include <linux/pagemap.h> #include <asm/fpu/signal.h> #include <asm/fpu/regset.h> #include <asm/fpu/xstate.h> #include <asm/sigframe.h> #include <asm/trapnr.h> #include <asm/trace/fpu.h> #include "context.h" #include "internal.h" #include "legacy.h" #include "xstate.h" /* * Check for the presence of extended state information in the * user fpstate pointer in the sigcontext. */ static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, struct _fpx_sw_bytes *fx_sw) { int min_xstate_size = sizeof(struct fxregs_state) + sizeof(struct xstate_header); void __user *fpstate = fxbuf; unsigned int magic2; if (__copy_from_user(fx_sw, &fxbuf->sw_reserved[0], sizeof(*fx_sw))) return false; /* Check for the first magic field and other error scenarios. */ if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || fx_sw->xstate_size < min_xstate_size || fx_sw->xstate_size > current->thread.fpu.fpstate->user_size || fx_sw->xstate_size > fx_sw->extended_size) goto setfx; /* * Check for the presence of second magic word at the end of memory * layout. This detects the case where the user just copied the legacy * fpstate layout with out copying the extended state information * in the memory layout. */ if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size))) return false; if (likely(magic2 == FP_XSTATE_MAGIC2)) return true; setfx: trace_x86_fpu_xstate_check_failed(&current->thread.fpu); /* Set the parameters for fx only state */ fx_sw->magic1 = 0; fx_sw->xstate_size = sizeof(struct fxregs_state); fx_sw->xfeatures = XFEATURE_MASK_FPSSE; return true; } /* * Signal frame handlers. */ static inline bool save_fsave_header(struct task_struct *tsk, void __user *buf) { if (use_fxsr()) { struct xregs_state *xsave = &tsk->thread.fpu.fpstate->regs.xsave; struct user_i387_ia32_struct env; struct _fpstate_32 __user *fp = buf; fpregs_lock(); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) fxsave(&tsk->thread.fpu.fpstate->regs.fxsave); fpregs_unlock(); convert_from_fxsr(&env, tsk); if (__copy_to_user(buf, &env, sizeof(env)) || __put_user(xsave->i387.swd, &fp->status) || __put_user(X86_FXSR_MAGIC, &fp->magic)) return false; } else { struct fregs_state __user *fp = buf; u32 swd; if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status)) return false; } return true; } /* * Prepare the SW reserved portion of the fxsave memory layout, indicating * the presence of the extended state information in the memory layout * pointed to by the fpstate pointer in the sigcontext. * This is saved when ever the FP and extended state context is * saved on the user stack during the signal handler delivery to the user. */ static inline void save_sw_bytes(struct _fpx_sw_bytes *sw_bytes, bool ia32_frame, struct fpstate *fpstate) { sw_bytes->magic1 = FP_XSTATE_MAGIC1; sw_bytes->extended_size = fpstate->user_size + FP_XSTATE_MAGIC2_SIZE; sw_bytes->xfeatures = fpstate->user_xfeatures; sw_bytes->xstate_size = fpstate->user_size; if (ia32_frame) sw_bytes->extended_size += sizeof(struct fregs_state); } static inline bool save_xstate_epilog(void __user *buf, int ia32_frame, struct fpstate *fpstate) { struct xregs_state __user *x = buf; struct _fpx_sw_bytes sw_bytes = {}; u32 xfeatures; int err; /* Setup the bytes not touched by the [f]xsave and reserved for SW. */ save_sw_bytes(&sw_bytes, ia32_frame, fpstate); err = __copy_to_user(&x->i387.sw_reserved, &sw_bytes, sizeof(sw_bytes)); if (!use_xsave()) return !err; err |= __put_user(FP_XSTATE_MAGIC2, (__u32 __user *)(buf + fpstate->user_size)); /* * Read the xfeatures which we copied (directly from the cpu or * from the state in task struct) to the user buffers. */ err |= __get_user(xfeatures, (__u32 __user *)&x->header.xfeatures); /* * For legacy compatible, we always set FP/SSE bits in the bit * vector while saving the state to the user context. This will * enable us capturing any changes(during sigreturn) to * the FP/SSE bits by the legacy applications which don't touch * xfeatures in the xsave header. * * xsave aware apps can change the xfeatures in the xsave * header as well as change any contents in the memory layout. * xrestore as part of sigreturn will capture all the changes. */ xfeatures |= XFEATURE_MASK_FPSSE; err |= __put_user(xfeatures, (__u32 __user *)&x->header.xfeatures); return !err; } static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) { if (use_xsave()) return xsave_to_user_sigframe(buf); if (use_fxsr()) return fxsave_to_user_sigframe((struct fxregs_state __user *) buf); else return fnsave_to_user_sigframe((struct fregs_state __user *) buf); } /* * Save the fpu, extended register state to the user signal frame. * * 'buf_fx' is the 64-byte aligned pointer at which the [f|fx|x]save * state is copied. * 'buf' points to the 'buf_fx' or to the fsave header followed by 'buf_fx'. * * buf == buf_fx for 64-bit frames and 32-bit fsave frame. * buf != buf_fx for 32-bit frames with fxstate. * * Save it directly to the user frame with disabled page fault handler. If * that faults, try to clear the frame which handles the page fault. * * If this is a 32-bit frame with fxstate, put a fsave header before * the aligned state at 'buf_fx'. * * For [f]xsave state, update the SW reserved fields in the [f]xsave frame * indicating the absence/presence of the extended state to the user. */ bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) { struct task_struct *tsk = current; struct fpstate *fpstate = tsk->thread.fpu.fpstate; bool ia32_fxstate = (buf != buf_fx); int ret; ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || IS_ENABLED(CONFIG_IA32_EMULATION)); if (!static_cpu_has(X86_FEATURE_FPU)) { struct user_i387_ia32_struct fp; fpregs_soft_get(current, NULL, (struct membuf){.p = &fp, .left = sizeof(fp)}); return !copy_to_user(buf, &fp, sizeof(fp)); } if (!access_ok(buf, size)) return false; if (use_xsave()) { struct xregs_state __user *xbuf = buf_fx; /* * Clear the xsave header first, so that reserved fields are * initialized to zero. */ if (__clear_user(&xbuf->header, sizeof(xbuf->header))) return false; } retry: /* * Load the FPU registers if they are not valid for the current task. * With a valid FPU state we can attempt to save the state directly to * userland's stack frame which will likely succeed. If it does not, * resolve the fault in the user memory and try again. */ fpregs_lock(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) fpregs_restore_userregs(); pagefault_disable(); ret = copy_fpregs_to_sigframe(buf_fx); pagefault_enable(); fpregs_unlock(); if (ret) { if (!__clear_user(buf_fx, fpstate->user_size)) goto retry; return false; } /* Save the fsave header for the 32-bit frames. */ if ((ia32_fxstate || !use_fxsr()) && !save_fsave_header(tsk, buf)) return false; if (use_fxsr() && !save_xstate_epilog(buf_fx, ia32_fxstate, fpstate)) return false; return true; } static int __restore_fpregs_from_user(void __user *buf, u64 ufeatures, u64 xrestore, bool fx_only) { if (use_xsave()) { u64 init_bv = ufeatures & ~xrestore; int ret; if (likely(!fx_only)) ret = xrstor_from_user_sigframe(buf, xrestore); else ret = fxrstor_from_user_sigframe(buf); if (!ret && unlikely(init_bv)) os_xrstor(&init_fpstate, init_bv); return ret; } else if (use_fxsr()) { return fxrstor_from_user_sigframe(buf); } else { return frstor_from_user_sigframe(buf); } } /* * Attempt to restore the FPU registers directly from user memory. * Pagefaults are handled and any errors returned are fatal. */ static bool restore_fpregs_from_user(void __user *buf, u64 xrestore, bool fx_only, unsigned int size) { struct fpu *fpu = &current->thread.fpu; int ret; retry: fpregs_lock(); /* Ensure that XFD is up to date */ xfd_update_state(fpu->fpstate); pagefault_disable(); ret = __restore_fpregs_from_user(buf, fpu->fpstate->user_xfeatures, xrestore, fx_only); pagefault_enable(); if (unlikely(ret)) { /* * The above did an FPU restore operation, restricted to * the user portion of the registers, and failed, but the * microcode might have modified the FPU registers * nevertheless. * * If the FPU registers do not belong to current, then * invalidate the FPU register state otherwise the task * might preempt current and return to user space with * corrupted FPU registers. */ if (test_thread_flag(TIF_NEED_FPU_LOAD)) __cpu_invalidate_fpregs_state(); fpregs_unlock(); /* Try to handle #PF, but anything else is fatal. */ if (ret != X86_TRAP_PF) return false; if (!fault_in_readable(buf, size)) goto retry; return false; } /* * Restore supervisor states: previous context switch etc has done * XSAVES and saved the supervisor states in the kernel buffer from * which they can be restored now. * * It would be optimal to handle this with a single XRSTORS, but * this does not work because the rest of the FPU registers have * been restored from a user buffer directly. */ if (test_thread_flag(TIF_NEED_FPU_LOAD) && xfeatures_mask_supervisor()) os_xrstor_supervisor(fpu->fpstate); fpregs_mark_activate(); fpregs_unlock(); return true; } static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, bool ia32_fxstate) { struct task_struct *tsk = current; struct fpu *fpu = &tsk->thread.fpu; struct user_i387_ia32_struct env; bool success, fx_only = false; union fpregs_state *fpregs; unsigned int state_size; u64 user_xfeatures = 0; if (use_xsave()) { struct _fpx_sw_bytes fx_sw_user; if (!check_xstate_in_sigframe(buf_fx, &fx_sw_user)) return false; fx_only = !fx_sw_user.magic1; state_size = fx_sw_user.xstate_size; user_xfeatures = fx_sw_user.xfeatures; } else { user_xfeatures = XFEATURE_MASK_FPSSE; state_size = fpu->fpstate->user_size; } if (likely(!ia32_fxstate)) { /* Restore the FPU registers directly from user memory. */ return restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only, state_size); } /* * Copy the legacy state because the FP portion of the FX frame has * to be ignored for histerical raisins. The legacy state is folded * in once the larger state has been copied. */ if (__copy_from_user(&env, buf, sizeof(env))) return false; /* * By setting TIF_NEED_FPU_LOAD it is ensured that our xstate is * not modified on context switch and that the xstate is considered * to be loaded again on return to userland (overriding last_cpu avoids * the optimisation). */ fpregs_lock(); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { /* * If supervisor states are available then save the * hardware state in current's fpstate so that the * supervisor state is preserved. Save the full state for * simplicity. There is no point in optimizing this by only * saving the supervisor states and then shuffle them to * the right place in memory. It's ia32 mode. Shrug. */ if (xfeatures_mask_supervisor()) os_xsave(fpu->fpstate); set_thread_flag(TIF_NEED_FPU_LOAD); } __fpu_invalidate_fpregs_state(fpu); __cpu_invalidate_fpregs_state(); fpregs_unlock(); fpregs = &fpu->fpstate->regs; if (use_xsave() && !fx_only) { if (copy_sigframe_from_user_to_xstate(tsk, buf_fx)) return false; } else { if (__copy_from_user(&fpregs->fxsave, buf_fx, sizeof(fpregs->fxsave))) return false; if (IS_ENABLED(CONFIG_X86_64)) { /* Reject invalid MXCSR values. */ if (fpregs->fxsave.mxcsr & ~mxcsr_feature_mask) return false; } else { /* Mask invalid bits out for historical reasons (broken hardware). */ fpregs->fxsave.mxcsr &= mxcsr_feature_mask; } /* Enforce XFEATURE_MASK_FPSSE when XSAVE is enabled */ if (use_xsave()) fpregs->xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; } /* Fold the legacy FP storage */ convert_to_fxsr(&fpregs->fxsave, &env); fpregs_lock(); if (use_xsave()) { /* * Remove all UABI feature bits not set in user_xfeatures * from the memory xstate header which makes the full * restore below bring them into init state. This works for * fx_only mode as well because that has only FP and SSE * set in user_xfeatures. * * Preserve supervisor states! */ u64 mask = user_xfeatures | xfeatures_mask_supervisor(); fpregs->xsave.header.xfeatures &= mask; success = !os_xrstor_safe(fpu->fpstate, fpu_kernel_cfg.max_features); } else { success = !fxrstor_safe(&fpregs->fxsave); } if (likely(success)) fpregs_mark_activate(); fpregs_unlock(); return success; } static inline unsigned int xstate_sigframe_size(struct fpstate *fpstate) { unsigned int size = fpstate->user_size; return use_xsave() ? size + FP_XSTATE_MAGIC2_SIZE : size; } /* * Restore FPU state from a sigframe: */ bool fpu__restore_sig(void __user *buf, int ia32_frame) { struct fpu *fpu = &current->thread.fpu; void __user *buf_fx = buf; bool ia32_fxstate = false; bool success = false; unsigned int size; if (unlikely(!buf)) { fpu__clear_user_states(fpu); return true; } size = xstate_sigframe_size(fpu->fpstate); ia32_frame &= (IS_ENABLED(CONFIG_X86_32) || IS_ENABLED(CONFIG_IA32_EMULATION)); /* * Only FXSR enabled systems need the FX state quirk. * FRSTOR does not need it and can use the fast path. */ if (ia32_frame && use_fxsr()) { buf_fx = buf + sizeof(struct fregs_state); size += sizeof(struct fregs_state); ia32_fxstate = true; } if (!access_ok(buf, size)) goto out; if (!IS_ENABLED(CONFIG_X86_64) && !cpu_feature_enabled(X86_FEATURE_FPU)) { success = !fpregs_soft_set(current, NULL, 0, sizeof(struct user_i387_ia32_struct), NULL, buf); } else { success = __fpu_restore_sig(buf, buf_fx, ia32_fxstate); } out: if (unlikely(!success)) fpu__clear_user_states(fpu); return success; } unsigned long fpu__alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long *buf_fx, unsigned long *size) { unsigned long frame_size = xstate_sigframe_size(current->thread.fpu.fpstate); *buf_fx = sp = round_down(sp - frame_size, 64); if (ia32_frame && use_fxsr()) { frame_size += sizeof(struct fregs_state); sp -= sizeof(struct fregs_state); } *size = frame_size; return sp; } unsigned long __init fpu__get_fpstate_size(void) { unsigned long ret = fpu_user_cfg.max_size; if (use_xsave()) ret += FP_XSTATE_MAGIC2_SIZE; /* * This space is needed on (most) 32-bit kernels, or when a 32-bit * app is running on a 64-bit kernel. To keep things simple, just * assume the worst case and always include space for 'freg_state', * even for 64-bit apps on 64-bit kernels. This wastes a bit of * space, but keeps the code simple. */ if ((IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) && use_fxsr()) ret += sizeof(struct fregs_state); return ret; }
4 4 4 4 4 4 4 4 1 4 4 4 4 4 4 4 4 4 4 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 // SPDX-License-Identifier: GPL-2.0-only /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * Monitoring SMC transport protocol sockets * * Copyright IBM Corp. 2016 * * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/types.h> #include <linux/init.h> #include <linux/sock_diag.h> #include <linux/inet_diag.h> #include <linux/smc_diag.h> #include <net/netlink.h> #include <net/smc.h> #include "smc.h" #include "smc_core.h" struct smc_diag_dump_ctx { int pos[2]; }; static struct smc_diag_dump_ctx *smc_dump_context(struct netlink_callback *cb) { return (struct smc_diag_dump_ctx *)cb->ctx; } static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk) { struct smc_sock *smc = smc_sk(sk); memset(r, 0, sizeof(*r)); r->diag_family = sk->sk_family; sock_diag_save_cookie(sk, r->id.idiag_cookie); if (!smc->clcsock) return; r->id.idiag_sport = htons(smc->clcsock->sk->sk_num); r->id.idiag_dport = smc->clcsock->sk->sk_dport; r->id.idiag_if = smc->clcsock->sk->sk_bound_dev_if; if (sk->sk_protocol == SMCPROTO_SMC) { r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr; r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr; #if IS_ENABLED(CONFIG_IPV6) } else if (sk->sk_protocol == SMCPROTO_SMC6) { memcpy(&r->id.idiag_src, &smc->clcsock->sk->sk_v6_rcv_saddr, sizeof(smc->clcsock->sk->sk_v6_rcv_saddr)); memcpy(&r->id.idiag_dst, &smc->clcsock->sk->sk_v6_daddr, sizeof(smc->clcsock->sk->sk_v6_daddr)); #endif } } static int smc_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, struct smc_diag_msg *r, struct user_namespace *user_ns) { if (nla_put_u8(skb, SMC_DIAG_SHUTDOWN, sk->sk_shutdown)) return 1; r->diag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); r->diag_inode = sock_i_ino(sk); return 0; } static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct smc_diag_req *req, struct nlattr *bc) { struct smc_sock *smc = smc_sk(sk); struct smc_diag_fallback fallback; struct user_namespace *user_ns; struct smc_diag_msg *r; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, sizeof(*r), NLM_F_MULTI); if (!nlh) return -EMSGSIZE; r = nlmsg_data(nlh); smc_diag_msg_common_fill(r, sk); r->diag_state = sk->sk_state; if (smc->use_fallback) r->diag_mode = SMC_DIAG_MODE_FALLBACK_TCP; else if (smc_conn_lgr_valid(&smc->conn) && smc->conn.lgr->is_smcd) r->diag_mode = SMC_DIAG_MODE_SMCD; else r->diag_mode = SMC_DIAG_MODE_SMCR; user_ns = sk_user_ns(NETLINK_CB(cb->skb).sk); if (smc_diag_msg_attrs_fill(sk, skb, r, user_ns)) goto errout; fallback.reason = smc->fallback_rsn; fallback.peer_diagnosis = smc->peer_diagnosis; if (nla_put(skb, SMC_DIAG_FALLBACK, sizeof(fallback), &fallback) < 0) goto errout; if ((req->diag_ext & (1 << (SMC_DIAG_CONNINFO - 1))) && smc->conn.alert_token_local) { struct smc_connection *conn = &smc->conn; struct smc_diag_conninfo cinfo = { .token = conn->alert_token_local, .sndbuf_size = conn->sndbuf_desc ? conn->sndbuf_desc->len : 0, .rmbe_size = conn->rmb_desc ? conn->rmb_desc->len : 0, .peer_rmbe_size = conn->peer_rmbe_size, .rx_prod.wrap = conn->local_rx_ctrl.prod.wrap, .rx_prod.count = conn->local_rx_ctrl.prod.count, .rx_cons.wrap = conn->local_rx_ctrl.cons.wrap, .rx_cons.count = conn->local_rx_ctrl.cons.count, .tx_prod.wrap = conn->local_tx_ctrl.prod.wrap, .tx_prod.count = conn->local_tx_ctrl.prod.count, .tx_cons.wrap = conn->local_tx_ctrl.cons.wrap, .tx_cons.count = conn->local_tx_ctrl.cons.count, .tx_prod_flags = *(u8 *)&conn->local_tx_ctrl.prod_flags, .tx_conn_state_flags = *(u8 *)&conn->local_tx_ctrl.conn_state_flags, .rx_prod_flags = *(u8 *)&conn->local_rx_ctrl.prod_flags, .rx_conn_state_flags = *(u8 *)&conn->local_rx_ctrl.conn_state_flags, .tx_prep.wrap = conn->tx_curs_prep.wrap, .tx_prep.count = conn->tx_curs_prep.count, .tx_sent.wrap = conn->tx_curs_sent.wrap, .tx_sent.count = conn->tx_curs_sent.count, .tx_fin.wrap = conn->tx_curs_fin.wrap, .tx_fin.count = conn->tx_curs_fin.count, }; if (nla_put(skb, SMC_DIAG_CONNINFO, sizeof(cinfo), &cinfo) < 0) goto errout; } if (smc_conn_lgr_valid(&smc->conn) && !smc->conn.lgr->is_smcd && (req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && !list_empty(&smc->conn.lgr->list)) { struct smc_link *link = smc->conn.lnk; struct smc_diag_lgrinfo linfo = { .role = smc->conn.lgr->role, .lnk[0].ibport = link->ibport, .lnk[0].link_id = link->link_id, }; memcpy(linfo.lnk[0].ibname, smc->conn.lgr->lnk[0].smcibdev->ibdev->name, sizeof(link->smcibdev->ibdev->name)); smc_gid_be16_convert(linfo.lnk[0].gid, link->gid); smc_gid_be16_convert(linfo.lnk[0].peer_gid, link->peer_gid); if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0) goto errout; } if (smc_conn_lgr_valid(&smc->conn) && smc->conn.lgr->is_smcd && (req->diag_ext & (1 << (SMC_DIAG_DMBINFO - 1))) && !list_empty(&smc->conn.lgr->list)) { struct smc_connection *conn = &smc->conn; struct smcd_diag_dmbinfo dinfo; struct smcd_dev *smcd = conn->lgr->smcd; memset(&dinfo, 0, sizeof(dinfo)); dinfo.linkid = *((u32 *)conn->lgr->id); dinfo.peer_gid = conn->lgr->peer_gid; dinfo.my_gid = smcd->ops->get_local_gid(smcd); dinfo.token = conn->rmb_desc->token; dinfo.peer_token = conn->peer_token; if (nla_put(skb, SMC_DIAG_DMBINFO, sizeof(dinfo), &dinfo) < 0) goto errout; } nlmsg_end(skb, nlh); return 0; errout: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int smc_diag_dump_proto(struct proto *prot, struct sk_buff *skb, struct netlink_callback *cb, int p_type) { struct smc_diag_dump_ctx *cb_ctx = smc_dump_context(cb); struct net *net = sock_net(skb->sk); int snum = cb_ctx->pos[p_type]; struct nlattr *bc = NULL; struct hlist_head *head; int rc = 0, num = 0; struct sock *sk; read_lock(&prot->h.smc_hash->lock); head = &prot->h.smc_hash->ht; if (hlist_empty(head)) goto out; sk_for_each(sk, head) { if (!net_eq(sock_net(sk), net)) continue; if (num < snum) goto next; rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc); if (rc < 0) goto out; next: num++; } out: read_unlock(&prot->h.smc_hash->lock); cb_ctx->pos[p_type] = num; return rc; } static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { int rc = 0; rc = smc_diag_dump_proto(&smc_proto, skb, cb, SMCPROTO_SMC); if (!rc) smc_diag_dump_proto(&smc_proto6, skb, cb, SMCPROTO_SMC6); return skb->len; } static int smc_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) { struct net *net = sock_net(skb->sk); if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY && h->nlmsg_flags & NLM_F_DUMP) { { struct netlink_dump_control c = { .dump = smc_diag_dump, .min_dump_alloc = SKB_WITH_OVERHEAD(32768), }; return netlink_dump_start(net->diag_nlsk, skb, h, &c); } } return 0; } static const struct sock_diag_handler smc_diag_handler = { .family = AF_SMC, .dump = smc_diag_handler_dump, }; static int __init smc_diag_init(void) { return sock_diag_register(&smc_diag_handler); } static void __exit smc_diag_exit(void) { sock_diag_unregister(&smc_diag_handler); } module_init(smc_diag_init); module_exit(smc_diag_exit); MODULE_LICENSE("GPL"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 43 /* AF_SMC */); MODULE_ALIAS_GENL_FAMILY(SMCR_GENL_FAMILY_NAME);
1 1 7 6 4 4 1 5 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 // SPDX-License-Identifier: GPL-2.0-or-later /* * * Bluetooth virtual HCI driver * * Copyright (C) 2000-2001 Qualcomm Incorporated * Copyright (C) 2002-2003 Maxim Krasnyansky <maxk@qualcomm.com> * Copyright (C) 2004-2006 Marcel Holtmann <marcel@holtmann.org> */ #include <linux/module.h> #include <asm/unaligned.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/errno.h> #include <linux/sched.h> #include <linux/poll.h> #include <linux/skbuff.h> #include <linux/miscdevice.h> #include <linux/debugfs.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #define VERSION "1.5" static bool amp; struct vhci_data { struct hci_dev *hdev; wait_queue_head_t read_wait; struct sk_buff_head readq; struct mutex open_mutex; struct delayed_work open_timeout; struct work_struct suspend_work; bool suspended; bool wakeup; __u16 msft_opcode; bool aosp_capable; }; static int vhci_open_dev(struct hci_dev *hdev) { return 0; } static int vhci_close_dev(struct hci_dev *hdev) { struct vhci_data *data = hci_get_drvdata(hdev); skb_queue_purge(&data->readq); return 0; } static int vhci_flush(struct hci_dev *hdev) { struct vhci_data *data = hci_get_drvdata(hdev); skb_queue_purge(&data->readq); return 0; } static int vhci_send_frame(struct hci_dev *hdev, struct sk_buff *skb) { struct vhci_data *data = hci_get_drvdata(hdev); memcpy(skb_push(skb, 1), &hci_skb_pkt_type(skb), 1); skb_queue_tail(&data->readq, skb); wake_up_interruptible(&data->read_wait); return 0; } static int vhci_get_data_path_id(struct hci_dev *hdev, u8 *data_path_id) { *data_path_id = 0; return 0; } static int vhci_get_codec_config_data(struct hci_dev *hdev, __u8 type, struct bt_codec *codec, __u8 *vnd_len, __u8 **vnd_data) { if (type != ESCO_LINK) return -EINVAL; *vnd_len = 0; *vnd_data = NULL; return 0; } static bool vhci_wakeup(struct hci_dev *hdev) { struct vhci_data *data = hci_get_drvdata(hdev); return data->wakeup; } static ssize_t force_suspend_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct vhci_data *data = file->private_data; char buf[3]; buf[0] = data->suspended ? 'Y' : 'N'; buf[1] = '\n'; buf[2] = '\0'; return simple_read_from_buffer(user_buf, count, ppos, buf, 2); } static void vhci_suspend_work(struct work_struct *work) { struct vhci_data *data = container_of(work, struct vhci_data, suspend_work); if (data->suspended) hci_suspend_dev(data->hdev); else hci_resume_dev(data->hdev); } static ssize_t force_suspend_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct vhci_data *data = file->private_data; bool enable; int err; err = kstrtobool_from_user(user_buf, count, &enable); if (err) return err; if (data->suspended == enable) return -EALREADY; data->suspended = enable; schedule_work(&data->suspend_work); return count; } static const struct file_operations force_suspend_fops = { .open = simple_open, .read = force_suspend_read, .write = force_suspend_write, .llseek = default_llseek, }; static ssize_t force_wakeup_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct vhci_data *data = file->private_data; char buf[3]; buf[0] = data->wakeup ? 'Y' : 'N'; buf[1] = '\n'; buf[2] = '\0'; return simple_read_from_buffer(user_buf, count, ppos, buf, 2); } static ssize_t force_wakeup_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct vhci_data *data = file->private_data; bool enable; int err; err = kstrtobool_from_user(user_buf, count, &enable); if (err) return err; if (data->wakeup == enable) return -EALREADY; data->wakeup = enable; return count; } static const struct file_operations force_wakeup_fops = { .open = simple_open, .read = force_wakeup_read, .write = force_wakeup_write, .llseek = default_llseek, }; static int msft_opcode_set(void *data, u64 val) { struct vhci_data *vhci = data; if (val > 0xffff || hci_opcode_ogf(val) != 0x3f) return -EINVAL; if (vhci->msft_opcode) return -EALREADY; vhci->msft_opcode = val; return 0; } static int msft_opcode_get(void *data, u64 *val) { struct vhci_data *vhci = data; *val = vhci->msft_opcode; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(msft_opcode_fops, msft_opcode_get, msft_opcode_set, "%llu\n"); static ssize_t aosp_capable_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct vhci_data *vhci = file->private_data; char buf[3]; buf[0] = vhci->aosp_capable ? 'Y' : 'N'; buf[1] = '\n'; buf[2] = '\0'; return simple_read_from_buffer(user_buf, count, ppos, buf, 2); } static ssize_t aosp_capable_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct vhci_data *vhci = file->private_data; bool enable; int err; err = kstrtobool_from_user(user_buf, count, &enable); if (err) return err; if (!enable) return -EINVAL; if (vhci->aosp_capable) return -EALREADY; vhci->aosp_capable = enable; return count; } static const struct file_operations aosp_capable_fops = { .open = simple_open, .read = aosp_capable_read, .write = aosp_capable_write, .llseek = default_llseek, }; static int vhci_setup(struct hci_dev *hdev) { struct vhci_data *vhci = hci_get_drvdata(hdev); if (vhci->msft_opcode) hci_set_msft_opcode(hdev, vhci->msft_opcode); if (vhci->aosp_capable) hci_set_aosp_capable(hdev); return 0; } static void vhci_coredump(struct hci_dev *hdev) { /* No need to do anything */ } static void vhci_coredump_hdr(struct hci_dev *hdev, struct sk_buff *skb) { char buf[80]; snprintf(buf, sizeof(buf), "Controller Name: vhci_ctrl\n"); skb_put_data(skb, buf, strlen(buf)); snprintf(buf, sizeof(buf), "Firmware Version: vhci_fw\n"); skb_put_data(skb, buf, strlen(buf)); snprintf(buf, sizeof(buf), "Driver: vhci_drv\n"); skb_put_data(skb, buf, strlen(buf)); snprintf(buf, sizeof(buf), "Vendor: vhci\n"); skb_put_data(skb, buf, strlen(buf)); } #define MAX_COREDUMP_LINE_LEN 40 struct devcoredump_test_data { enum devcoredump_state state; unsigned int timeout; char data[MAX_COREDUMP_LINE_LEN]; }; static inline void force_devcd_timeout(struct hci_dev *hdev, unsigned int timeout) { #ifdef CONFIG_DEV_COREDUMP hdev->dump.timeout = msecs_to_jiffies(timeout * 1000); #endif } static ssize_t force_devcd_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct vhci_data *data = file->private_data; struct hci_dev *hdev = data->hdev; struct sk_buff *skb = NULL; struct devcoredump_test_data dump_data; size_t data_size; int ret; if (count < offsetof(struct devcoredump_test_data, data) || count > sizeof(dump_data)) return -EINVAL; if (copy_from_user(&dump_data, user_buf, count)) return -EFAULT; data_size = count - offsetof(struct devcoredump_test_data, data); skb = alloc_skb(data_size, GFP_ATOMIC); if (!skb) return -ENOMEM; skb_put_data(skb, &dump_data.data, data_size); hci_devcd_register(hdev, vhci_coredump, vhci_coredump_hdr, NULL); /* Force the devcoredump timeout */ if (dump_data.timeout) force_devcd_timeout(hdev, dump_data.timeout); ret = hci_devcd_init(hdev, skb->len); if (ret) { BT_ERR("Failed to generate devcoredump"); kfree_skb(skb); return ret; } hci_devcd_append(hdev, skb); switch (dump_data.state) { case HCI_DEVCOREDUMP_DONE: hci_devcd_complete(hdev); break; case HCI_DEVCOREDUMP_ABORT: hci_devcd_abort(hdev); break; case HCI_DEVCOREDUMP_TIMEOUT: /* Do nothing */ break; default: return -EINVAL; } return count; } static const struct file_operations force_devcoredump_fops = { .open = simple_open, .write = force_devcd_write, }; static int __vhci_create_device(struct vhci_data *data, __u8 opcode) { struct hci_dev *hdev; struct sk_buff *skb; __u8 dev_type; if (data->hdev) return -EBADFD; /* bits 0-1 are dev_type (Primary or AMP) */ dev_type = opcode & 0x03; if (dev_type != HCI_PRIMARY && dev_type != HCI_AMP) return -EINVAL; /* bits 2-5 are reserved (must be zero) */ if (opcode & 0x3c) return -EINVAL; skb = bt_skb_alloc(4, GFP_KERNEL); if (!skb) return -ENOMEM; hdev = hci_alloc_dev(); if (!hdev) { kfree_skb(skb); return -ENOMEM; } data->hdev = hdev; hdev->bus = HCI_VIRTUAL; hdev->dev_type = dev_type; hci_set_drvdata(hdev, data); hdev->open = vhci_open_dev; hdev->close = vhci_close_dev; hdev->flush = vhci_flush; hdev->send = vhci_send_frame; hdev->get_data_path_id = vhci_get_data_path_id; hdev->get_codec_config_data = vhci_get_codec_config_data; hdev->wakeup = vhci_wakeup; hdev->setup = vhci_setup; set_bit(HCI_QUIRK_NON_PERSISTENT_SETUP, &hdev->quirks); /* bit 6 is for external configuration */ if (opcode & 0x40) set_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks); /* bit 7 is for raw device */ if (opcode & 0x80) set_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks); set_bit(HCI_QUIRK_VALID_LE_STATES, &hdev->quirks); if (hci_register_dev(hdev) < 0) { BT_ERR("Can't register HCI device"); hci_free_dev(hdev); data->hdev = NULL; kfree_skb(skb); return -EBUSY; } debugfs_create_file("force_suspend", 0644, hdev->debugfs, data, &force_suspend_fops); debugfs_create_file("force_wakeup", 0644, hdev->debugfs, data, &force_wakeup_fops); if (IS_ENABLED(CONFIG_BT_MSFTEXT)) debugfs_create_file("msft_opcode", 0644, hdev->debugfs, data, &msft_opcode_fops); if (IS_ENABLED(CONFIG_BT_AOSPEXT)) debugfs_create_file("aosp_capable", 0644, hdev->debugfs, data, &aosp_capable_fops); debugfs_create_file("force_devcoredump", 0644, hdev->debugfs, data, &force_devcoredump_fops); hci_skb_pkt_type(skb) = HCI_VENDOR_PKT; skb_put_u8(skb, 0xff); skb_put_u8(skb, opcode); put_unaligned_le16(hdev->id, skb_put(skb, 2)); skb_queue_tail(&data->readq, skb); wake_up_interruptible(&data->read_wait); return 0; } static int vhci_create_device(struct vhci_data *data, __u8 opcode) { int err; mutex_lock(&data->open_mutex); err = __vhci_create_device(data, opcode); mutex_unlock(&data->open_mutex); return err; } static inline ssize_t vhci_get_user(struct vhci_data *data, struct iov_iter *from) { size_t len = iov_iter_count(from); struct sk_buff *skb; __u8 pkt_type, opcode; int ret; if (len < 2 || len > HCI_MAX_FRAME_SIZE) return -EINVAL; skb = bt_skb_alloc(len, GFP_KERNEL); if (!skb) return -ENOMEM; if (!copy_from_iter_full(skb_put(skb, len), len, from)) { kfree_skb(skb); return -EFAULT; } pkt_type = *((__u8 *) skb->data); skb_pull(skb, 1); switch (pkt_type) { case HCI_EVENT_PKT: case HCI_ACLDATA_PKT: case HCI_SCODATA_PKT: case HCI_ISODATA_PKT: if (!data->hdev) { kfree_skb(skb); return -ENODEV; } hci_skb_pkt_type(skb) = pkt_type; ret = hci_recv_frame(data->hdev, skb); break; case HCI_VENDOR_PKT: cancel_delayed_work_sync(&data->open_timeout); opcode = *((__u8 *) skb->data); skb_pull(skb, 1); if (skb->len > 0) { kfree_skb(skb); return -EINVAL; } kfree_skb(skb); ret = vhci_create_device(data, opcode); break; default: kfree_skb(skb); return -EINVAL; } return (ret < 0) ? ret : len; } static inline ssize_t vhci_put_user(struct vhci_data *data, struct sk_buff *skb, char __user *buf, int count) { char __user *ptr = buf; int len; len = min_t(unsigned int, skb->len, count); if (copy_to_user(ptr, skb->data, len)) return -EFAULT; if (!data->hdev) return len; data->hdev->stat.byte_tx += len; switch (hci_skb_pkt_type(skb)) { case HCI_COMMAND_PKT: data->hdev->stat.cmd_tx++; break; case HCI_ACLDATA_PKT: data->hdev->stat.acl_tx++; break; case HCI_SCODATA_PKT: data->hdev->stat.sco_tx++; break; } return len; } static ssize_t vhci_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { struct vhci_data *data = file->private_data; struct sk_buff *skb; ssize_t ret = 0; while (count) { skb = skb_dequeue(&data->readq); if (skb) { ret = vhci_put_user(data, skb, buf, count); if (ret < 0) skb_queue_head(&data->readq, skb); else kfree_skb(skb); break; } if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; break; } ret = wait_event_interruptible(data->read_wait, !skb_queue_empty(&data->readq)); if (ret < 0) break; } return ret; } static ssize_t vhci_write(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct vhci_data *data = file->private_data; return vhci_get_user(data, from); } static __poll_t vhci_poll(struct file *file, poll_table *wait) { struct vhci_data *data = file->private_data; poll_wait(file, &data->read_wait, wait); if (!skb_queue_empty(&data->readq)) return EPOLLIN | EPOLLRDNORM; return EPOLLOUT | EPOLLWRNORM; } static void vhci_open_timeout(struct work_struct *work) { struct vhci_data *data = container_of(work, struct vhci_data, open_timeout.work); vhci_create_device(data, amp ? HCI_AMP : HCI_PRIMARY); } static int vhci_open(struct inode *inode, struct file *file) { struct vhci_data *data; data = kzalloc(sizeof(struct vhci_data), GFP_KERNEL); if (!data) return -ENOMEM; skb_queue_head_init(&data->readq); init_waitqueue_head(&data->read_wait); mutex_init(&data->open_mutex); INIT_DELAYED_WORK(&data->open_timeout, vhci_open_timeout); INIT_WORK(&data->suspend_work, vhci_suspend_work); file->private_data = data; nonseekable_open(inode, file); schedule_delayed_work(&data->open_timeout, msecs_to_jiffies(1000)); return 0; } static int vhci_release(struct inode *inode, struct file *file) { struct vhci_data *data = file->private_data; struct hci_dev *hdev; cancel_delayed_work_sync(&data->open_timeout); flush_work(&data->suspend_work); hdev = data->hdev; if (hdev) { hci_unregister_dev(hdev); hci_free_dev(hdev); } skb_queue_purge(&data->readq); file->private_data = NULL; kfree(data); return 0; } static const struct file_operations vhci_fops = { .owner = THIS_MODULE, .read = vhci_read, .write_iter = vhci_write, .poll = vhci_poll, .open = vhci_open, .release = vhci_release, .llseek = no_llseek, }; static struct miscdevice vhci_miscdev = { .name = "vhci", .fops = &vhci_fops, .minor = VHCI_MINOR, }; module_misc_device(vhci_miscdev); module_param(amp, bool, 0644); MODULE_PARM_DESC(amp, "Create AMP controller device"); MODULE_AUTHOR("Marcel Holtmann <marcel@holtmann.org>"); MODULE_DESCRIPTION("Bluetooth virtual HCI driver ver " VERSION); MODULE_VERSION(VERSION); MODULE_LICENSE("GPL"); MODULE_ALIAS("devname:vhci"); MODULE_ALIAS_MISCDEV(VHCI_MINOR);
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * SR-IPv6 implementation * * Author: * David Lebrun <david.lebrun@uclouvain.be> */ #ifndef _NET_SEG6_H #define _NET_SEG6_H #include <linux/net.h> #include <linux/ipv6.h> #include <linux/seg6.h> #include <linux/rhashtable-types.h> static inline void update_csum_diff4(struct sk_buff *skb, __be32 from, __be32 to) { __be32 diff[] = { ~from, to }; skb->csum = ~csum_partial((char *)diff, sizeof(diff), ~skb->csum); } static inline void update_csum_diff16(struct sk_buff *skb, __be32 *from, __be32 *to) { __be32 diff[] = { ~from[0], ~from[1], ~from[2], ~from[3], to[0], to[1], to[2], to[3], }; skb->csum = ~csum_partial((char *)diff, sizeof(diff), ~skb->csum); } struct seg6_pernet_data { struct mutex lock; struct in6_addr __rcu *tun_src; #ifdef CONFIG_IPV6_SEG6_HMAC struct rhashtable hmac_infos; #endif }; static inline struct seg6_pernet_data *seg6_pernet(struct net *net) { #if IS_ENABLED(CONFIG_IPV6) return net->ipv6.seg6_data; #else return NULL; #endif } extern int seg6_init(void); extern void seg6_exit(void); extern int seg6_iptunnel_init(void); extern void seg6_iptunnel_exit(void); extern int seg6_local_init(void); extern void seg6_local_exit(void); extern bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len, bool reduced); extern struct ipv6_sr_hdr *seg6_get_srh(struct sk_buff *skb, int flags); extern void seg6_icmp_srh(struct sk_buff *skb, struct inet6_skb_parm *opt); extern int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto); extern int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh); extern int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, u32 tbl_id); /* If the packet which invoked an ICMP error contains an SRH return * the true destination address from within the SRH, otherwise use the * destination address in the IP header. */ static inline const struct in6_addr *seg6_get_daddr(struct sk_buff *skb, struct inet6_skb_parm *opt) { struct ipv6_sr_hdr *srh; if (opt->flags & IP6SKB_SEG6) { srh = (struct ipv6_sr_hdr *)(skb->data + opt->srhoff); return &srh->segments[0]; } return NULL; } #endif
2 497 479 115 103 6 103 103 115 497 498 498 498 498 497 3 2 1 3 103 6 103 103 500 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright Red Hat Inc. 2017 * * This file is part of the SCTP kernel implementation * * These functions manipulate sctp stream queue/scheduling. * * Please send any bug reports or fixes you make to the * email addresched(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> */ #include <linux/list.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/stream_sched.h> /* First Come First Serve (a.k.a. FIFO) * RFC DRAFT ndata Section 3.1 */ static int sctp_sched_fcfs_set(struct sctp_stream *stream, __u16 sid, __u16 value, gfp_t gfp) { return 0; } static int sctp_sched_fcfs_get(struct sctp_stream *stream, __u16 sid, __u16 *value) { *value = 0; return 0; } static int sctp_sched_fcfs_init(struct sctp_stream *stream) { return 0; } static int sctp_sched_fcfs_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp) { return 0; } static void sctp_sched_fcfs_free_sid(struct sctp_stream *stream, __u16 sid) { } static void sctp_sched_fcfs_enqueue(struct sctp_outq *q, struct sctp_datamsg *msg) { } static struct sctp_chunk *sctp_sched_fcfs_dequeue(struct sctp_outq *q) { struct sctp_stream *stream = &q->asoc->stream; struct sctp_chunk *ch = NULL; struct list_head *entry; if (list_empty(&q->out_chunk_list)) goto out; if (stream->out_curr) { ch = list_entry(stream->out_curr->ext->outq.next, struct sctp_chunk, stream_list); } else { entry = q->out_chunk_list.next; ch = list_entry(entry, struct sctp_chunk, list); } sctp_sched_dequeue_common(q, ch); out: return ch; } static void sctp_sched_fcfs_dequeue_done(struct sctp_outq *q, struct sctp_chunk *chunk) { } static void sctp_sched_fcfs_sched_all(struct sctp_stream *stream) { } static void sctp_sched_fcfs_unsched_all(struct sctp_stream *stream) { } static struct sctp_sched_ops sctp_sched_fcfs = { .set = sctp_sched_fcfs_set, .get = sctp_sched_fcfs_get, .init = sctp_sched_fcfs_init, .init_sid = sctp_sched_fcfs_init_sid, .free_sid = sctp_sched_fcfs_free_sid, .enqueue = sctp_sched_fcfs_enqueue, .dequeue = sctp_sched_fcfs_dequeue, .dequeue_done = sctp_sched_fcfs_dequeue_done, .sched_all = sctp_sched_fcfs_sched_all, .unsched_all = sctp_sched_fcfs_unsched_all, }; static void sctp_sched_ops_fcfs_init(void) { sctp_sched_ops_register(SCTP_SS_FCFS, &sctp_sched_fcfs); } /* API to other parts of the stack */ static struct sctp_sched_ops *sctp_sched_ops[SCTP_SS_MAX + 1]; void sctp_sched_ops_register(enum sctp_sched_type sched, struct sctp_sched_ops *sched_ops) { sctp_sched_ops[sched] = sched_ops; } void sctp_sched_ops_init(void) { sctp_sched_ops_fcfs_init(); sctp_sched_ops_prio_init(); sctp_sched_ops_rr_init(); sctp_sched_ops_fc_init(); sctp_sched_ops_wfq_init(); } static void sctp_sched_free_sched(struct sctp_stream *stream) { struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); struct sctp_stream_out_ext *soute; int i; sched->unsched_all(stream); for (i = 0; i < stream->outcnt; i++) { soute = SCTP_SO(stream, i)->ext; if (!soute) continue; sched->free_sid(stream, i); /* Give the next scheduler a clean slate. */ memset_after(soute, 0, outq); } } int sctp_sched_set_sched(struct sctp_association *asoc, enum sctp_sched_type sched) { struct sctp_sched_ops *old = asoc->outqueue.sched; struct sctp_datamsg *msg = NULL; struct sctp_sched_ops *n; struct sctp_chunk *ch; int i, ret = 0; if (sched > SCTP_SS_MAX) return -EINVAL; n = sctp_sched_ops[sched]; if (old == n) return ret; if (old) sctp_sched_free_sched(&asoc->stream); asoc->outqueue.sched = n; n->init(&asoc->stream); for (i = 0; i < asoc->stream.outcnt; i++) { if (!SCTP_SO(&asoc->stream, i)->ext) continue; ret = n->init_sid(&asoc->stream, i, GFP_ATOMIC); if (ret) goto err; } /* We have to requeue all chunks already queued. */ list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) { if (ch->msg == msg) continue; msg = ch->msg; n->enqueue(&asoc->outqueue, msg); } return ret; err: sctp_sched_free_sched(&asoc->stream); asoc->outqueue.sched = &sctp_sched_fcfs; /* Always safe */ return ret; } int sctp_sched_get_sched(struct sctp_association *asoc) { int i; for (i = 0; i <= SCTP_SS_MAX; i++) if (asoc->outqueue.sched == sctp_sched_ops[i]) return i; return 0; } int sctp_sched_set_value(struct sctp_association *asoc, __u16 sid, __u16 value, gfp_t gfp) { if (sid >= asoc->stream.outcnt) return -EINVAL; if (!SCTP_SO(&asoc->stream, sid)->ext) { int ret; ret = sctp_stream_init_ext(&asoc->stream, sid); if (ret) return ret; } return asoc->outqueue.sched->set(&asoc->stream, sid, value, gfp); } int sctp_sched_get_value(struct sctp_association *asoc, __u16 sid, __u16 *value) { if (sid >= asoc->stream.outcnt) return -EINVAL; if (!SCTP_SO(&asoc->stream, sid)->ext) return 0; return asoc->outqueue.sched->get(&asoc->stream, sid, value); } void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch) { if (!list_is_last(&ch->frag_list, &ch->msg->chunks) && !q->asoc->peer.intl_capable) { struct sctp_stream_out *sout; __u16 sid; /* datamsg is not finish, so save it as current one, * in case application switch scheduler or a higher * priority stream comes in. */ sid = sctp_chunk_stream_no(ch); sout = SCTP_SO(&q->asoc->stream, sid); q->asoc->stream.out_curr = sout; return; } q->asoc->stream.out_curr = NULL; q->sched->dequeue_done(q, ch); } /* Auxiliary functions for the schedulers */ void sctp_sched_dequeue_common(struct sctp_outq *q, struct sctp_chunk *ch) { list_del_init(&ch->list); list_del_init(&ch->stream_list); q->out_qlen -= ch->skb->len; } int sctp_sched_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp) { struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); struct sctp_stream_out_ext *ext = SCTP_SO(stream, sid)->ext; INIT_LIST_HEAD(&ext->outq); return sched->init_sid(stream, sid, gfp); } struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream) { struct sctp_association *asoc; asoc = container_of(stream, struct sctp_association, stream); return asoc->outqueue.sched; }
108 148 302 148 54 154 143 1 242 4072 48 284 174 5 716 197 86 86 86 86 86 86 86 576 485 382 37 174 103 1008 34 31 29 39 3 19 42 2 980 1 4461 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 /* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com */ #ifndef _LINUX_BPF_H #define _LINUX_BPF_H 1 #include <uapi/linux/bpf.h> #include <uapi/linux/filter.h> #include <linux/workqueue.h> #include <linux/file.h> #include <linux/percpu.h> #include <linux/err.h> #include <linux/rbtree_latch.h> #include <linux/numa.h> #include <linux/mm_types.h> #include <linux/wait.h> #include <linux/refcount.h> #include <linux/mutex.h> #include <linux/module.h> #include <linux/kallsyms.h> #include <linux/capability.h> #include <linux/sched/mm.h> #include <linux/slab.h> #include <linux/percpu-refcount.h> #include <linux/stddef.h> #include <linux/bpfptr.h> #include <linux/btf.h> #include <linux/rcupdate_trace.h> #include <linux/static_call.h> #include <linux/memcontrol.h> struct bpf_verifier_env; struct bpf_verifier_log; struct perf_event; struct bpf_prog; struct bpf_prog_aux; struct bpf_map; struct sock; struct seq_file; struct btf; struct btf_type; struct exception_table_entry; struct seq_operations; struct bpf_iter_aux_info; struct bpf_local_storage; struct bpf_local_storage_map; struct kobject; struct mem_cgroup; struct module; struct bpf_func_state; struct ftrace_ops; struct cgroup; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; extern struct kobject *btf_kobj; extern struct bpf_mem_alloc bpf_global_ma; extern bool bpf_global_ma_set; typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64); typedef int (*bpf_iter_init_seq_priv_t)(void *private_data, struct bpf_iter_aux_info *aux); typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data); typedef unsigned int (*bpf_func_t)(const void *, const struct bpf_insn *); struct bpf_iter_seq_info { const struct seq_operations *seq_ops; bpf_iter_init_seq_priv_t init_seq_private; bpf_iter_fini_seq_priv_t fini_seq_private; u32 seq_priv_size; }; /* map is generic key/value storage optionally accessible by eBPF programs */ struct bpf_map_ops { /* funcs callable from userspace (via syscall) */ int (*map_alloc_check)(union bpf_attr *attr); struct bpf_map *(*map_alloc)(union bpf_attr *attr); void (*map_release)(struct bpf_map *map, struct file *map_file); void (*map_free)(struct bpf_map *map); int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key); void (*map_release_uref)(struct bpf_map *map); void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key); int (*map_lookup_batch)(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); int (*map_lookup_and_delete_elem)(struct bpf_map *map, void *key, void *value, u64 flags); int (*map_lookup_and_delete_batch)(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); int (*map_update_batch)(struct bpf_map *map, struct file *map_file, const union bpf_attr *attr, union bpf_attr __user *uattr); int (*map_delete_batch)(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); /* funcs callable from userspace and from eBPF programs */ void *(*map_lookup_elem)(struct bpf_map *map, void *key); long (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags); long (*map_delete_elem)(struct bpf_map *map, void *key); long (*map_push_elem)(struct bpf_map *map, void *value, u64 flags); long (*map_pop_elem)(struct bpf_map *map, void *value); long (*map_peek_elem)(struct bpf_map *map, void *value); void *(*map_lookup_percpu_elem)(struct bpf_map *map, void *key, u32 cpu); /* funcs called by prog_array and perf_event_array map */ void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, int fd); void (*map_fd_put_ptr)(void *ptr); int (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); u32 (*map_fd_sys_lookup_elem)(void *ptr); void (*map_seq_show_elem)(struct bpf_map *map, void *key, struct seq_file *m); int (*map_check_btf)(const struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type); /* Prog poke tracking helpers. */ int (*map_poke_track)(struct bpf_map *map, struct bpf_prog_aux *aux); void (*map_poke_untrack)(struct bpf_map *map, struct bpf_prog_aux *aux); void (*map_poke_run)(struct bpf_map *map, u32 key, struct bpf_prog *old, struct bpf_prog *new); /* Direct value access helpers. */ int (*map_direct_value_addr)(const struct bpf_map *map, u64 *imm, u32 off); int (*map_direct_value_meta)(const struct bpf_map *map, u64 imm, u32 *off); int (*map_mmap)(struct bpf_map *map, struct vm_area_struct *vma); __poll_t (*map_poll)(struct bpf_map *map, struct file *filp, struct poll_table_struct *pts); /* Functions called by bpf_local_storage maps */ int (*map_local_storage_charge)(struct bpf_local_storage_map *smap, void *owner, u32 size); void (*map_local_storage_uncharge)(struct bpf_local_storage_map *smap, void *owner, u32 size); struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner); /* Misc helpers.*/ long (*map_redirect)(struct bpf_map *map, u64 key, u64 flags); /* map_meta_equal must be implemented for maps that can be * used as an inner map. It is a runtime check to ensure * an inner map can be inserted to an outer map. * * Some properties of the inner map has been used during the * verification time. When inserting an inner map at the runtime, * map_meta_equal has to ensure the inserting map has the same * properties that the verifier has used earlier. */ bool (*map_meta_equal)(const struct bpf_map *meta0, const struct bpf_map *meta1); int (*map_set_for_each_callback_args)(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee); long (*map_for_each_callback)(struct bpf_map *map, bpf_callback_t callback_fn, void *callback_ctx, u64 flags); u64 (*map_mem_usage)(const struct bpf_map *map); /* BTF id of struct allocated by map_alloc */ int *map_btf_id; /* bpf_iter info used to open a seq_file */ const struct bpf_iter_seq_info *iter_seq_info; }; enum { /* Support at most 10 fields in a BTF type */ BTF_FIELDS_MAX = 10, }; enum btf_field_type { BPF_SPIN_LOCK = (1 << 0), BPF_TIMER = (1 << 1), BPF_KPTR_UNREF = (1 << 2), BPF_KPTR_REF = (1 << 3), BPF_KPTR = BPF_KPTR_UNREF | BPF_KPTR_REF, BPF_LIST_HEAD = (1 << 4), BPF_LIST_NODE = (1 << 5), BPF_RB_ROOT = (1 << 6), BPF_RB_NODE = (1 << 7), BPF_GRAPH_NODE_OR_ROOT = BPF_LIST_NODE | BPF_LIST_HEAD | BPF_RB_NODE | BPF_RB_ROOT, BPF_REFCOUNT = (1 << 8), }; typedef void (*btf_dtor_kfunc_t)(void *); struct btf_field_kptr { struct btf *btf; struct module *module; /* dtor used if btf_is_kernel(btf), otherwise the type is * program-allocated, dtor is NULL, and __bpf_obj_drop_impl is used */ btf_dtor_kfunc_t dtor; u32 btf_id; }; struct btf_field_graph_root { struct btf *btf; u32 value_btf_id; u32 node_offset; struct btf_record *value_rec; }; struct btf_field { u32 offset; u32 size; enum btf_field_type type; union { struct btf_field_kptr kptr; struct btf_field_graph_root graph_root; }; }; struct btf_record { u32 cnt; u32 field_mask; int spin_lock_off; int timer_off; int refcount_off; struct btf_field fields[]; }; /* Non-opaque version of bpf_rb_node in uapi/linux/bpf.h */ struct bpf_rb_node_kern { struct rb_node rb_node; void *owner; } __attribute__((aligned(8))); /* Non-opaque version of bpf_list_node in uapi/linux/bpf.h */ struct bpf_list_node_kern { struct list_head list_head; void *owner; } __attribute__((aligned(8))); struct bpf_map { /* The first two cachelines with read-mostly members of which some * are also accessed in fast-path (e.g. ops, max_entries). */ const struct bpf_map_ops *ops ____cacheline_aligned; struct bpf_map *inner_map_meta; #ifdef CONFIG_SECURITY void *security; #endif enum bpf_map_type map_type; u32 key_size; u32 value_size; u32 max_entries; u64 map_extra; /* any per-map-type extra fields */ u32 map_flags; u32 id; struct btf_record *record; int numa_node; u32 btf_key_type_id; u32 btf_value_type_id; u32 btf_vmlinux_value_type_id; struct btf *btf; #ifdef CONFIG_MEMCG_KMEM struct obj_cgroup *objcg; #endif char name[BPF_OBJ_NAME_LEN]; /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. */ atomic64_t refcnt ____cacheline_aligned; atomic64_t usercnt; struct work_struct work; struct mutex freeze_mutex; atomic64_t writecnt; /* 'Ownership' of program-containing map is claimed by the first program * that is going to use this map or by the first program which FD is * stored in the map to make sure that all callers and callees have the * same prog type, JITed flag and xdp_has_frags flag. */ struct { spinlock_t lock; enum bpf_prog_type type; bool jited; bool xdp_has_frags; } owner; bool bypass_spec_v1; bool frozen; /* write-once; write-protected by freeze_mutex */ s64 __percpu *elem_count; }; static inline const char *btf_field_type_name(enum btf_field_type type) { switch (type) { case BPF_SPIN_LOCK: return "bpf_spin_lock"; case BPF_TIMER: return "bpf_timer"; case BPF_KPTR_UNREF: case BPF_KPTR_REF: return "kptr"; case BPF_LIST_HEAD: return "bpf_list_head"; case BPF_LIST_NODE: return "bpf_list_node"; case BPF_RB_ROOT: return "bpf_rb_root"; case BPF_RB_NODE: return "bpf_rb_node"; case BPF_REFCOUNT: return "bpf_refcount"; default: WARN_ON_ONCE(1); return "unknown"; } } static inline u32 btf_field_type_size(enum btf_field_type type) { switch (type) { case BPF_SPIN_LOCK: return sizeof(struct bpf_spin_lock); case BPF_TIMER: return sizeof(struct bpf_timer); case BPF_KPTR_UNREF: case BPF_KPTR_REF: return sizeof(u64); case BPF_LIST_HEAD: return sizeof(struct bpf_list_head); case BPF_LIST_NODE: return sizeof(struct bpf_list_node); case BPF_RB_ROOT: return sizeof(struct bpf_rb_root); case BPF_RB_NODE: return sizeof(struct bpf_rb_node); case BPF_REFCOUNT: return sizeof(struct bpf_refcount); default: WARN_ON_ONCE(1); return 0; } } static inline u32 btf_field_type_align(enum btf_field_type type) { switch (type) { case BPF_SPIN_LOCK: return __alignof__(struct bpf_spin_lock); case BPF_TIMER: return __alignof__(struct bpf_timer); case BPF_KPTR_UNREF: case BPF_KPTR_REF: return __alignof__(u64); case BPF_LIST_HEAD: return __alignof__(struct bpf_list_head); case BPF_LIST_NODE: return __alignof__(struct bpf_list_node); case BPF_RB_ROOT: return __alignof__(struct bpf_rb_root); case BPF_RB_NODE: return __alignof__(struct bpf_rb_node); case BPF_REFCOUNT: return __alignof__(struct bpf_refcount); default: WARN_ON_ONCE(1); return 0; } } static inline void bpf_obj_init_field(const struct btf_field *field, void *addr) { memset(addr, 0, field->size); switch (field->type) { case BPF_REFCOUNT: refcount_set((refcount_t *)addr, 1); break; case BPF_RB_NODE: RB_CLEAR_NODE((struct rb_node *)addr); break; case BPF_LIST_HEAD: case BPF_LIST_NODE: INIT_LIST_HEAD((struct list_head *)addr); break; case BPF_RB_ROOT: /* RB_ROOT_CACHED 0-inits, no need to do anything after memset */ case BPF_SPIN_LOCK: case BPF_TIMER: case BPF_KPTR_UNREF: case BPF_KPTR_REF: break; default: WARN_ON_ONCE(1); return; } } static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_field_type type) { if (IS_ERR_OR_NULL(rec)) return false; return rec->field_mask & type; } static inline void bpf_obj_init(const struct btf_record *rec, void *obj) { int i; if (IS_ERR_OR_NULL(rec)) return; for (i = 0; i < rec->cnt; i++) bpf_obj_init_field(&rec->fields[i], obj + rec->fields[i].offset); } /* 'dst' must be a temporary buffer and should not point to memory that is being * used in parallel by a bpf program or bpf syscall, otherwise the access from * the bpf program or bpf syscall may be corrupted by the reinitialization, * leading to weird problems. Even 'dst' is newly-allocated from bpf memory * allocator, it is still possible for 'dst' to be used in parallel by a bpf * program or bpf syscall. */ static inline void check_and_init_map_value(struct bpf_map *map, void *dst) { bpf_obj_init(map->record, dst); } /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and * forced to use 'long' read/writes to try to atomically copy long counters. * Best-effort only. No barriers here, since it _will_ race with concurrent * updates from BPF programs. Called from bpf syscall and mostly used with * size 8 or 16 bytes, so ask compiler to inline it. */ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) { const long *lsrc = src; long *ldst = dst; size /= sizeof(long); while (size--) data_race(*ldst++ = *lsrc++); } /* copy everything but bpf_spin_lock, bpf_timer, and kptrs. There could be one of each. */ static inline void bpf_obj_memcpy(struct btf_record *rec, void *dst, void *src, u32 size, bool long_memcpy) { u32 curr_off = 0; int i; if (IS_ERR_OR_NULL(rec)) { if (long_memcpy) bpf_long_memcpy(dst, src, round_up(size, 8)); else memcpy(dst, src, size); return; } for (i = 0; i < rec->cnt; i++) { u32 next_off = rec->fields[i].offset; u32 sz = next_off - curr_off; memcpy(dst + curr_off, src + curr_off, sz); curr_off += rec->fields[i].size + sz; } memcpy(dst + curr_off, src + curr_off, size - curr_off); } static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) { bpf_obj_memcpy(map->record, dst, src, map->value_size, false); } static inline void copy_map_value_long(struct bpf_map *map, void *dst, void *src) { bpf_obj_memcpy(map->record, dst, src, map->value_size, true); } static inline void bpf_obj_memzero(struct btf_record *rec, void *dst, u32 size) { u32 curr_off = 0; int i; if (IS_ERR_OR_NULL(rec)) { memset(dst, 0, size); return; } for (i = 0; i < rec->cnt; i++) { u32 next_off = rec->fields[i].offset; u32 sz = next_off - curr_off; memset(dst + curr_off, 0, sz); curr_off += rec->fields[i].size + sz; } memset(dst + curr_off, 0, size - curr_off); } static inline void zero_map_value(struct bpf_map *map, void *dst) { bpf_obj_memzero(map->record, dst, map->value_size); } void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); void bpf_timer_cancel_and_free(void *timer); void bpf_list_head_free(const struct btf_field *field, void *list_head, struct bpf_spin_lock *spin_lock); void bpf_rb_root_free(const struct btf_field *field, void *rb_root, struct bpf_spin_lock *spin_lock); int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size); struct bpf_offload_dev; struct bpf_offloaded_map; struct bpf_map_dev_ops { int (*map_get_next_key)(struct bpf_offloaded_map *map, void *key, void *next_key); int (*map_lookup_elem)(struct bpf_offloaded_map *map, void *key, void *value); int (*map_update_elem)(struct bpf_offloaded_map *map, void *key, void *value, u64 flags); int (*map_delete_elem)(struct bpf_offloaded_map *map, void *key); }; struct bpf_offloaded_map { struct bpf_map map; struct net_device *netdev; const struct bpf_map_dev_ops *dev_ops; void *dev_priv; struct list_head offloads; }; static inline struct bpf_offloaded_map *map_to_offmap(struct bpf_map *map) { return container_of(map, struct bpf_offloaded_map, map); } static inline bool bpf_map_offload_neutral(const struct bpf_map *map) { return map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY; } static inline bool bpf_map_support_seq_show(const struct bpf_map *map) { return (map->btf_value_type_id || map->btf_vmlinux_value_type_id) && map->ops->map_seq_show_elem; } int map_check_no_btf(const struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type); bool bpf_map_meta_equal(const struct bpf_map *meta0, const struct bpf_map *meta1); extern const struct bpf_map_ops bpf_map_offload_ops; /* bpf_type_flag contains a set of flags that are applicable to the values of * arg_type, ret_type and reg_type. For example, a pointer value may be null, * or a memory is read-only. We classify types into two categories: base types * and extended types. Extended types are base types combined with a type flag. * * Currently there are no more than 32 base types in arg_type, ret_type and * reg_types. */ #define BPF_BASE_TYPE_BITS 8 enum bpf_type_flag { /* PTR may be NULL. */ PTR_MAYBE_NULL = BIT(0 + BPF_BASE_TYPE_BITS), /* MEM is read-only. When applied on bpf_arg, it indicates the arg is * compatible with both mutable and immutable memory. */ MEM_RDONLY = BIT(1 + BPF_BASE_TYPE_BITS), /* MEM points to BPF ring buffer reservation. */ MEM_RINGBUF = BIT(2 + BPF_BASE_TYPE_BITS), /* MEM is in user address space. */ MEM_USER = BIT(3 + BPF_BASE_TYPE_BITS), /* MEM is a percpu memory. MEM_PERCPU tags PTR_TO_BTF_ID. When tagged * with MEM_PERCPU, PTR_TO_BTF_ID _cannot_ be directly accessed. In * order to drop this tag, it must be passed into bpf_per_cpu_ptr() * or bpf_this_cpu_ptr(), which will return the pointer corresponding * to the specified cpu. */ MEM_PERCPU = BIT(4 + BPF_BASE_TYPE_BITS), /* Indicates that the argument will be released. */ OBJ_RELEASE = BIT(5 + BPF_BASE_TYPE_BITS), /* PTR is not trusted. This is only used with PTR_TO_BTF_ID, to mark * unreferenced and referenced kptr loaded from map value using a load * instruction, so that they can only be dereferenced but not escape the * BPF program into the kernel (i.e. cannot be passed as arguments to * kfunc or bpf helpers). */ PTR_UNTRUSTED = BIT(6 + BPF_BASE_TYPE_BITS), MEM_UNINIT = BIT(7 + BPF_BASE_TYPE_BITS), /* DYNPTR points to memory local to the bpf program. */ DYNPTR_TYPE_LOCAL = BIT(8 + BPF_BASE_TYPE_BITS), /* DYNPTR points to a kernel-produced ringbuf record. */ DYNPTR_TYPE_RINGBUF = BIT(9 + BPF_BASE_TYPE_BITS), /* Size is known at compile time. */ MEM_FIXED_SIZE = BIT(10 + BPF_BASE_TYPE_BITS), /* MEM is of an allocated object of type in program BTF. This is used to * tag PTR_TO_BTF_ID allocated using bpf_obj_new. */ MEM_ALLOC = BIT(11 + BPF_BASE_TYPE_BITS), /* PTR was passed from the kernel in a trusted context, and may be * passed to KF_TRUSTED_ARGS kfuncs or BPF helper functions. * Confusingly, this is _not_ the opposite of PTR_UNTRUSTED above. * PTR_UNTRUSTED refers to a kptr that was read directly from a map * without invoking bpf_kptr_xchg(). What we really need to know is * whether a pointer is safe to pass to a kfunc or BPF helper function. * While PTR_UNTRUSTED pointers are unsafe to pass to kfuncs and BPF * helpers, they do not cover all possible instances of unsafe * pointers. For example, a pointer that was obtained from walking a * struct will _not_ get the PTR_UNTRUSTED type modifier, despite the * fact that it may be NULL, invalid, etc. This is due to backwards * compatibility requirements, as this was the behavior that was first * introduced when kptrs were added. The behavior is now considered * deprecated, and PTR_UNTRUSTED will eventually be removed. * * PTR_TRUSTED, on the other hand, is a pointer that the kernel * guarantees to be valid and safe to pass to kfuncs and BPF helpers. * For example, pointers passed to tracepoint arguments are considered * PTR_TRUSTED, as are pointers that are passed to struct_ops * callbacks. As alluded to above, pointers that are obtained from * walking PTR_TRUSTED pointers are _not_ trusted. For example, if a * struct task_struct *task is PTR_TRUSTED, then accessing * task->last_wakee will lose the PTR_TRUSTED modifier when it's stored * in a BPF register. Similarly, pointers passed to certain programs * types such as kretprobes are not guaranteed to be valid, as they may * for example contain an object that was recently freed. */ PTR_TRUSTED = BIT(12 + BPF_BASE_TYPE_BITS), /* MEM is tagged with rcu and memory access needs rcu_read_lock protection. */ MEM_RCU = BIT(13 + BPF_BASE_TYPE_BITS), /* Used to tag PTR_TO_BTF_ID | MEM_ALLOC references which are non-owning. * Currently only valid for linked-list and rbtree nodes. If the nodes * have a bpf_refcount_field, they must be tagged MEM_RCU as well. */ NON_OWN_REF = BIT(14 + BPF_BASE_TYPE_BITS), /* DYNPTR points to sk_buff */ DYNPTR_TYPE_SKB = BIT(15 + BPF_BASE_TYPE_BITS), /* DYNPTR points to xdp_buff */ DYNPTR_TYPE_XDP = BIT(16 + BPF_BASE_TYPE_BITS), __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; #define DYNPTR_TYPE_FLAG_MASK (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB \ | DYNPTR_TYPE_XDP) /* Max number of base types. */ #define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS) /* Max number of all types. */ #define BPF_TYPE_LIMIT (__BPF_TYPE_LAST_FLAG | (__BPF_TYPE_LAST_FLAG - 1)) /* function argument constraints */ enum bpf_arg_type { ARG_DONTCARE = 0, /* unused argument in helper function */ /* the following constraints used to prototype * bpf_map_lookup/update/delete_elem() functions */ ARG_CONST_MAP_PTR, /* const argument used as pointer to bpf_map */ ARG_PTR_TO_MAP_KEY, /* pointer to stack used as map key */ ARG_PTR_TO_MAP_VALUE, /* pointer to stack used as map value */ /* Used to prototype bpf_memcmp() and other functions that access data * on eBPF program stack */ ARG_PTR_TO_MEM, /* pointer to valid memory (stack, packet, map value) */ ARG_CONST_SIZE, /* number of bytes accessed from memory */ ARG_CONST_SIZE_OR_ZERO, /* number of bytes accessed from memory or 0 */ ARG_PTR_TO_CTX, /* pointer to context */ ARG_ANYTHING, /* any (initialized) argument is ok */ ARG_PTR_TO_SPIN_LOCK, /* pointer to bpf_spin_lock */ ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */ ARG_PTR_TO_INT, /* pointer to int */ ARG_PTR_TO_LONG, /* pointer to long */ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */ ARG_PTR_TO_RINGBUF_MEM, /* pointer to dynamically reserved ringbuf memory */ ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */ ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */ ARG_PTR_TO_FUNC, /* pointer to a bpf program function */ ARG_PTR_TO_STACK, /* pointer to stack */ ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */ ARG_PTR_TO_TIMER, /* pointer to bpf_timer */ ARG_PTR_TO_KPTR, /* pointer to referenced kptr */ ARG_PTR_TO_DYNPTR, /* pointer to bpf_dynptr. See bpf_type_flag for dynptr type */ __BPF_ARG_TYPE_MAX, /* Extended arg_types. */ ARG_PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MAP_VALUE, ARG_PTR_TO_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MEM, ARG_PTR_TO_CTX_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_CTX, ARG_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_SOCKET, ARG_PTR_TO_STACK_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_STACK, ARG_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, /* pointer to memory does not need to be initialized, helper function must fill * all bytes or clear them in error case. */ ARG_PTR_TO_UNINIT_MEM = MEM_UNINIT | ARG_PTR_TO_MEM, /* Pointer to valid memory of size known at compile time. */ ARG_PTR_TO_FIXED_SIZE_MEM = MEM_FIXED_SIZE | ARG_PTR_TO_MEM, /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. */ __BPF_ARG_TYPE_LIMIT = BPF_TYPE_LIMIT, }; static_assert(__BPF_ARG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT); /* type of values returned from helper functions */ enum bpf_return_type { RET_INTEGER, /* function returns integer */ RET_VOID, /* function doesn't return anything */ RET_PTR_TO_MAP_VALUE, /* returns a pointer to map elem value */ RET_PTR_TO_SOCKET, /* returns a pointer to a socket */ RET_PTR_TO_TCP_SOCK, /* returns a pointer to a tcp_sock */ RET_PTR_TO_SOCK_COMMON, /* returns a pointer to a sock_common */ RET_PTR_TO_MEM, /* returns a pointer to memory */ RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */ RET_PTR_TO_BTF_ID, /* returns a pointer to a btf_id */ __BPF_RET_TYPE_MAX, /* Extended ret_types. */ RET_PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_MAP_VALUE, RET_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCKET, RET_PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK, RET_PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON, RET_PTR_TO_RINGBUF_MEM_OR_NULL = PTR_MAYBE_NULL | MEM_RINGBUF | RET_PTR_TO_MEM, RET_PTR_TO_DYNPTR_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_MEM, RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID, RET_PTR_TO_BTF_ID_TRUSTED = PTR_TRUSTED | RET_PTR_TO_BTF_ID, /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. */ __BPF_RET_TYPE_LIMIT = BPF_TYPE_LIMIT, }; static_assert(__BPF_RET_TYPE_MAX <= BPF_BASE_TYPE_LIMIT); /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs * to in-kernel helper functions and for adjusting imm32 field in BPF_CALL * instructions after verifying */ struct bpf_func_proto { u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); bool gpl_only; bool pkt_access; bool might_sleep; enum bpf_return_type ret_type; union { struct { enum bpf_arg_type arg1_type; enum bpf_arg_type arg2_type; enum bpf_arg_type arg3_type; enum bpf_arg_type arg4_type; enum bpf_arg_type arg5_type; }; enum bpf_arg_type arg_type[5]; }; union { struct { u32 *arg1_btf_id; u32 *arg2_btf_id; u32 *arg3_btf_id; u32 *arg4_btf_id; u32 *arg5_btf_id; }; u32 *arg_btf_id[5]; struct { size_t arg1_size; size_t arg2_size; size_t arg3_size; size_t arg4_size; size_t arg5_size; }; size_t arg_size[5]; }; int *ret_btf_id; /* return value btf_id */ bool (*allowed)(const struct bpf_prog *prog); }; /* bpf_context is intentionally undefined structure. Pointer to bpf_context is * the first argument to eBPF programs. * For socket filters: 'struct bpf_context *' == 'struct sk_buff *' */ struct bpf_context; enum bpf_access_type { BPF_READ = 1, BPF_WRITE = 2 }; /* types of values stored in eBPF registers */ /* Pointer types represent: * pointer * pointer + imm * pointer + (u16) var * pointer + (u16) var + imm * if (range > 0) then [ptr, ptr + range - off) is safe to access * if (id > 0) means that some 'var' was added * if (off > 0) means that 'imm' was added */ enum bpf_reg_type { NOT_INIT = 0, /* nothing was written into register */ SCALAR_VALUE, /* reg doesn't contain a valid pointer */ PTR_TO_CTX, /* reg points to bpf_context */ CONST_PTR_TO_MAP, /* reg points to struct bpf_map */ PTR_TO_MAP_VALUE, /* reg points to map element value */ PTR_TO_MAP_KEY, /* reg points to a map element key */ PTR_TO_STACK, /* reg == frame_pointer + offset */ PTR_TO_PACKET_META, /* skb->data - meta_len */ PTR_TO_PACKET, /* reg points to skb->data */ PTR_TO_PACKET_END, /* skb->data + headlen */ PTR_TO_FLOW_KEYS, /* reg points to bpf_flow_keys */ PTR_TO_SOCKET, /* reg points to struct bpf_sock */ PTR_TO_SOCK_COMMON, /* reg points to sock_common */ PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */ PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */ PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */ /* PTR_TO_BTF_ID points to a kernel struct that does not need * to be null checked by the BPF program. This does not imply the * pointer is _not_ null and in practice this can easily be a null * pointer when reading pointer chains. The assumption is program * context will handle null pointer dereference typically via fault * handling. The verifier must keep this in mind and can make no * assumptions about null or non-null when doing branch analysis. * Further, when passed into helpers the helpers can not, without * additional context, assume the value is non-null. */ PTR_TO_BTF_ID, /* PTR_TO_BTF_ID_OR_NULL points to a kernel struct that has not * been checked for null. Used primarily to inform the verifier * an explicit null check is required for this struct. */ PTR_TO_MEM, /* reg points to valid memory region */ PTR_TO_BUF, /* reg points to a read/write buffer */ PTR_TO_FUNC, /* reg points to a bpf program function */ CONST_PTR_TO_DYNPTR, /* reg points to a const struct bpf_dynptr */ __BPF_REG_TYPE_MAX, /* Extended reg_types. */ PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCKET, PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCK_COMMON, PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | PTR_TO_TCP_SOCK, PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | PTR_TO_BTF_ID, /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. */ __BPF_REG_TYPE_LIMIT = BPF_TYPE_LIMIT, }; static_assert(__BPF_REG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT); /* The information passed from prog-specific *_is_valid_access * back to the verifier. */ struct bpf_insn_access_aux { enum bpf_reg_type reg_type; union { int ctx_field_size; struct { struct btf *btf; u32 btf_id; }; }; struct bpf_verifier_log *log; /* for verbose logs */ }; static inline void bpf_ctx_record_field_size(struct bpf_insn_access_aux *aux, u32 size) { aux->ctx_field_size = size; } static inline bool bpf_pseudo_func(const struct bpf_insn *insn) { return insn->code == (BPF_LD | BPF_IMM | BPF_DW) && insn->src_reg == BPF_PSEUDO_FUNC; } struct bpf_prog_ops { int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); }; struct bpf_reg_state; struct bpf_verifier_ops { /* return eBPF function prototype for verification */ const struct bpf_func_proto * (*get_func_proto)(enum bpf_func_id func_id, const struct bpf_prog *prog); /* return true if 'size' wide access at offset 'off' within bpf_context * with 'type' (read or write) is allowed */ bool (*is_valid_access)(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info); int (*gen_prologue)(struct bpf_insn *insn, bool direct_write, const struct bpf_prog *prog); int (*gen_ld_abs)(const struct bpf_insn *orig, struct bpf_insn *insn_buf); u32 (*convert_ctx_access)(enum bpf_access_type type, const struct bpf_insn *src, struct bpf_insn *dst, struct bpf_prog *prog, u32 *target_size); int (*btf_struct_access)(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size); }; struct bpf_prog_offload_ops { /* verifier basic callbacks */ int (*insn_hook)(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx); int (*finalize)(struct bpf_verifier_env *env); /* verifier optimization callbacks (called after .finalize) */ int (*replace_insn)(struct bpf_verifier_env *env, u32 off, struct bpf_insn *insn); int (*remove_insns)(struct bpf_verifier_env *env, u32 off, u32 cnt); /* program management callbacks */ int (*prepare)(struct bpf_prog *prog); int (*translate)(struct bpf_prog *prog); void (*destroy)(struct bpf_prog *prog); }; struct bpf_prog_offload { struct bpf_prog *prog; struct net_device *netdev; struct bpf_offload_dev *offdev; void *dev_priv; struct list_head offloads; bool dev_state; bool opt_failed; void *jited_image; u32 jited_len; }; enum bpf_cgroup_storage_type { BPF_CGROUP_STORAGE_SHARED, BPF_CGROUP_STORAGE_PERCPU, __BPF_CGROUP_STORAGE_MAX }; #define MAX_BPF_CGROUP_STORAGE_TYPE __BPF_CGROUP_STORAGE_MAX /* The longest tracepoint has 12 args. * See include/trace/bpf_probe.h */ #define MAX_BPF_FUNC_ARGS 12 /* The maximum number of arguments passed through registers * a single function may have. */ #define MAX_BPF_FUNC_REG_ARGS 5 /* The argument is a structure. */ #define BTF_FMODEL_STRUCT_ARG BIT(0) /* The argument is signed. */ #define BTF_FMODEL_SIGNED_ARG BIT(1) struct btf_func_model { u8 ret_size; u8 ret_flags; u8 nr_args; u8 arg_size[MAX_BPF_FUNC_ARGS]; u8 arg_flags[MAX_BPF_FUNC_ARGS]; }; /* Restore arguments before returning from trampoline to let original function * continue executing. This flag is used for fentry progs when there are no * fexit progs. */ #define BPF_TRAMP_F_RESTORE_REGS BIT(0) /* Call original function after fentry progs, but before fexit progs. * Makes sense for fentry/fexit, normal calls and indirect calls. */ #define BPF_TRAMP_F_CALL_ORIG BIT(1) /* Skip current frame and return to parent. Makes sense for fentry/fexit * programs only. Should not be used with normal calls and indirect calls. */ #define BPF_TRAMP_F_SKIP_FRAME BIT(2) /* Store IP address of the caller on the trampoline stack, * so it's available for trampoline's programs. */ #define BPF_TRAMP_F_IP_ARG BIT(3) /* Return the return value of fentry prog. Only used by bpf_struct_ops. */ #define BPF_TRAMP_F_RET_FENTRY_RET BIT(4) /* Get original function from stack instead of from provided direct address. * Makes sense for trampolines with fexit or fmod_ret programs. */ #define BPF_TRAMP_F_ORIG_STACK BIT(5) /* This trampoline is on a function with another ftrace_ops with IPMODIFY, * e.g., a live patch. This flag is set and cleared by ftrace call backs, */ #define BPF_TRAMP_F_SHARE_IPMODIFY BIT(6) /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 * bytes on x86. */ enum { #if defined(__s390x__) BPF_MAX_TRAMP_LINKS = 27, #else BPF_MAX_TRAMP_LINKS = 38, #endif }; struct bpf_tramp_links { struct bpf_tramp_link *links[BPF_MAX_TRAMP_LINKS]; int nr_links; }; struct bpf_tramp_run_ctx; /* Different use cases for BPF trampoline: * 1. replace nop at the function entry (kprobe equivalent) * flags = BPF_TRAMP_F_RESTORE_REGS * fentry = a set of programs to run before returning from trampoline * * 2. replace nop at the function entry (kprobe + kretprobe equivalent) * flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME * orig_call = fentry_ip + MCOUNT_INSN_SIZE * fentry = a set of program to run before calling original function * fexit = a set of program to run after original function * * 3. replace direct call instruction anywhere in the function body * or assign a function pointer for indirect call (like tcp_congestion_ops->cong_avoid) * With flags = 0 * fentry = a set of programs to run before returning from trampoline * With flags = BPF_TRAMP_F_CALL_ORIG * orig_call = original callback addr or direct function addr * fentry = a set of program to run before calling original function * fexit = a set of program to run after original function */ struct bpf_tramp_image; int arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, void *orig_call); u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr); void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr); typedef u64 (*bpf_trampoline_enter_t)(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); typedef void (*bpf_trampoline_exit_t)(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx); bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog); bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog); struct bpf_ksym { unsigned long start; unsigned long end; char name[KSYM_NAME_LEN]; struct list_head lnode; struct latch_tree_node tnode; bool prog; }; enum bpf_tramp_prog_type { BPF_TRAMP_FENTRY, BPF_TRAMP_FEXIT, BPF_TRAMP_MODIFY_RETURN, BPF_TRAMP_MAX, BPF_TRAMP_REPLACE, /* more than MAX */ }; struct bpf_tramp_image { void *image; struct bpf_ksym ksym; struct percpu_ref pcref; void *ip_after_call; void *ip_epilogue; union { struct rcu_head rcu; struct work_struct work; }; }; struct bpf_trampoline { /* hlist for trampoline_table */ struct hlist_node hlist; struct ftrace_ops *fops; /* serializes access to fields of this trampoline */ struct mutex mutex; refcount_t refcnt; u32 flags; u64 key; struct { struct btf_func_model model; void *addr; bool ftrace_managed; } func; /* if !NULL this is BPF_PROG_TYPE_EXT program that extends another BPF * program by replacing one of its functions. func.addr is the address * of the function it replaced. */ struct bpf_prog *extension_prog; /* list of BPF programs using this trampoline */ struct hlist_head progs_hlist[BPF_TRAMP_MAX]; /* Number of attached programs. A counter per kind. */ int progs_cnt[BPF_TRAMP_MAX]; /* Executable image of trampoline */ struct bpf_tramp_image *cur_image; struct module *mod; }; struct bpf_attach_target_info { struct btf_func_model fmodel; long tgt_addr; struct module *tgt_mod; const char *tgt_name; const struct btf_type *tgt_type; }; #define BPF_DISPATCHER_MAX 48 /* Fits in 2048B */ struct bpf_dispatcher_prog { struct bpf_prog *prog; refcount_t users; }; struct bpf_dispatcher { /* dispatcher mutex */ struct mutex mutex; void *func; struct bpf_dispatcher_prog progs[BPF_DISPATCHER_MAX]; int num_progs; void *image; void *rw_image; u32 image_off; struct bpf_ksym ksym; #ifdef CONFIG_HAVE_STATIC_CALL struct static_call_key *sc_key; void *sc_tramp; #endif }; static __always_inline __nocfi unsigned int bpf_dispatcher_nop_func( const void *ctx, const struct bpf_insn *insnsi, bpf_func_t bpf_func) { return bpf_func(ctx, insnsi); } /* the implementation of the opaque uapi struct bpf_dynptr */ struct bpf_dynptr_kern { void *data; /* Size represents the number of usable bytes of dynptr data. * If for example the offset is at 4 for a local dynptr whose data is * of type u64, the number of usable bytes is 4. * * The upper 8 bits are reserved. It is as follows: * Bits 0 - 23 = size * Bits 24 - 30 = dynptr type * Bit 31 = whether dynptr is read-only */ u32 size; u32 offset; } __aligned(8); enum bpf_dynptr_type { BPF_DYNPTR_TYPE_INVALID, /* Points to memory that is local to the bpf program */ BPF_DYNPTR_TYPE_LOCAL, /* Underlying data is a ringbuf record */ BPF_DYNPTR_TYPE_RINGBUF, /* Underlying data is a sk_buff */ BPF_DYNPTR_TYPE_SKB, /* Underlying data is a xdp_buff */ BPF_DYNPTR_TYPE_XDP, }; int bpf_dynptr_check_size(u32 size); u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr); #ifdef CONFIG_BPF_JIT int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr); int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr); struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info); void bpf_trampoline_put(struct bpf_trampoline *tr); int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs); /* * When the architecture supports STATIC_CALL replace the bpf_dispatcher_fn * indirection with a direct call to the bpf program. If the architecture does * not have STATIC_CALL, avoid a double-indirection. */ #ifdef CONFIG_HAVE_STATIC_CALL #define __BPF_DISPATCHER_SC_INIT(_name) \ .sc_key = &STATIC_CALL_KEY(_name), \ .sc_tramp = STATIC_CALL_TRAMP_ADDR(_name), #define __BPF_DISPATCHER_SC(name) \ DEFINE_STATIC_CALL(bpf_dispatcher_##name##_call, bpf_dispatcher_nop_func) #define __BPF_DISPATCHER_CALL(name) \ static_call(bpf_dispatcher_##name##_call)(ctx, insnsi, bpf_func) #define __BPF_DISPATCHER_UPDATE(_d, _new) \ __static_call_update((_d)->sc_key, (_d)->sc_tramp, (_new)) #else #define __BPF_DISPATCHER_SC_INIT(name) #define __BPF_DISPATCHER_SC(name) #define __BPF_DISPATCHER_CALL(name) bpf_func(ctx, insnsi) #define __BPF_DISPATCHER_UPDATE(_d, _new) #endif #define BPF_DISPATCHER_INIT(_name) { \ .mutex = __MUTEX_INITIALIZER(_name.mutex), \ .func = &_name##_func, \ .progs = {}, \ .num_progs = 0, \ .image = NULL, \ .image_off = 0, \ .ksym = { \ .name = #_name, \ .lnode = LIST_HEAD_INIT(_name.ksym.lnode), \ }, \ __BPF_DISPATCHER_SC_INIT(_name##_call) \ } #define DEFINE_BPF_DISPATCHER(name) \ __BPF_DISPATCHER_SC(name); \ noinline __nocfi unsigned int bpf_dispatcher_##name##_func( \ const void *ctx, \ const struct bpf_insn *insnsi, \ bpf_func_t bpf_func) \ { \ return __BPF_DISPATCHER_CALL(name); \ } \ EXPORT_SYMBOL(bpf_dispatcher_##name##_func); \ struct bpf_dispatcher bpf_dispatcher_##name = \ BPF_DISPATCHER_INIT(bpf_dispatcher_##name); #define DECLARE_BPF_DISPATCHER(name) \ unsigned int bpf_dispatcher_##name##_func( \ const void *ctx, \ const struct bpf_insn *insnsi, \ bpf_func_t bpf_func); \ extern struct bpf_dispatcher bpf_dispatcher_##name; #define BPF_DISPATCHER_FUNC(name) bpf_dispatcher_##name##_func #define BPF_DISPATCHER_PTR(name) (&bpf_dispatcher_##name) void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, struct bpf_prog *to); /* Called only from JIT-enabled code, so there's no need for stubs. */ void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym); void bpf_image_ksym_del(struct bpf_ksym *ksym); void bpf_ksym_add(struct bpf_ksym *ksym); void bpf_ksym_del(struct bpf_ksym *ksym); int bpf_jit_charge_modmem(u32 size); void bpf_jit_uncharge_modmem(u32 size); bool bpf_prog_has_trampoline(const struct bpf_prog *prog); #else static inline int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) { return -ENOTSUPP; } static inline int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) { return -ENOTSUPP; } static inline struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info) { return NULL; } static inline void bpf_trampoline_put(struct bpf_trampoline *tr) {} #define DEFINE_BPF_DISPATCHER(name) #define DECLARE_BPF_DISPATCHER(name) #define BPF_DISPATCHER_FUNC(name) bpf_dispatcher_nop_func #define BPF_DISPATCHER_PTR(name) NULL static inline void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, struct bpf_prog *to) {} static inline bool is_bpf_image_address(unsigned long address) { return false; } static inline bool bpf_prog_has_trampoline(const struct bpf_prog *prog) { return false; } #endif struct bpf_func_info_aux { u16 linkage; bool unreliable; }; enum bpf_jit_poke_reason { BPF_POKE_REASON_TAIL_CALL, }; /* Descriptor of pokes pointing /into/ the JITed image. */ struct bpf_jit_poke_descriptor { void *tailcall_target; void *tailcall_bypass; void *bypass_addr; void *aux; union { struct { struct bpf_map *map; u32 key; } tail_call; }; bool tailcall_target_stable; u8 adj_off; u16 reason; u32 insn_idx; }; /* reg_type info for ctx arguments */ struct bpf_ctx_arg_aux { u32 offset; enum bpf_reg_type reg_type; u32 btf_id; }; struct btf_mod_pair { struct btf *btf; struct module *module; }; struct bpf_kfunc_desc_tab; struct bpf_prog_aux { atomic64_t refcnt; u32 used_map_cnt; u32 used_btf_cnt; u32 max_ctx_offset; u32 max_pkt_offset; u32 max_tp_access; u32 stack_depth; u32 id; u32 func_cnt; /* used by non-func prog as the number of func progs */ u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */ u32 attach_btf_id; /* in-kernel BTF type id to attach to */ u32 ctx_arg_info_size; u32 max_rdonly_access; u32 max_rdwr_access; struct btf *attach_btf; const struct bpf_ctx_arg_aux *ctx_arg_info; struct mutex dst_mutex; /* protects dst_* pointers below, *after* prog becomes visible */ struct bpf_prog *dst_prog; struct bpf_trampoline *dst_trampoline; enum bpf_prog_type saved_dst_prog_type; enum bpf_attach_type saved_dst_attach_type; bool verifier_zext; /* Zero extensions has been inserted by verifier. */ bool dev_bound; /* Program is bound to the netdev. */ bool offload_requested; /* Program is bound and offloaded to the netdev. */ bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ bool func_proto_unreliable; bool sleepable; bool tail_call_reachable; bool xdp_has_frags; /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ const struct btf_type *attach_func_proto; /* function name for valid attach_btf_id */ const char *attach_func_name; struct bpf_prog **func; void *jit_data; /* JIT specific data. arch dependent */ struct bpf_jit_poke_descriptor *poke_tab; struct bpf_kfunc_desc_tab *kfunc_tab; struct bpf_kfunc_btf_tab *kfunc_btf_tab; u32 size_poke_tab; struct bpf_ksym ksym; const struct bpf_prog_ops *ops; struct bpf_map **used_maps; struct mutex used_maps_mutex; /* mutex for used_maps and used_map_cnt */ struct btf_mod_pair *used_btfs; struct bpf_prog *prog; struct user_struct *user; u64 load_time; /* ns since boottime */ u32 verified_insns; int cgroup_atype; /* enum cgroup_bpf_attach_type */ struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; char name[BPF_OBJ_NAME_LEN]; #ifdef CONFIG_SECURITY void *security; #endif struct bpf_prog_offload *offload; struct btf *btf; struct bpf_func_info *func_info; struct bpf_func_info_aux *func_info_aux; /* bpf_line_info loaded from userspace. linfo->insn_off * has the xlated insn offset. * Both the main and sub prog share the same linfo. * The subprog can access its first linfo by * using the linfo_idx. */ struct bpf_line_info *linfo; /* jited_linfo is the jited addr of the linfo. It has a * one to one mapping to linfo: * jited_linfo[i] is the jited addr for the linfo[i]->insn_off. * Both the main and sub prog share the same jited_linfo. * The subprog can access its first jited_linfo by * using the linfo_idx. */ void **jited_linfo; u32 func_info_cnt; u32 nr_linfo; /* subprog can use linfo_idx to access its first linfo and * jited_linfo. * main prog always has linfo_idx == 0 */ u32 linfo_idx; struct module *mod; u32 num_exentries; struct exception_table_entry *extable; union { struct work_struct work; struct rcu_head rcu; }; }; struct bpf_prog { u16 pages; /* Number of allocated pages */ u16 jited:1, /* Is our filter JIT'ed? */ jit_requested:1,/* archs need to JIT the prog */ gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? */ dst_needed:1, /* Do we need dst entry? */ blinding_requested:1, /* needs constant blinding */ blinded:1, /* Was blinded */ is_func:1, /* program is a bpf function */ kprobe_override:1, /* Do we override a kprobe? */ has_callchain_buf:1, /* callchain buffer allocated? */ enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */ call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */ call_get_func_ip:1, /* Do we call get_func_ip() */ tstamp_type_access:1; /* Accessed __sk_buff->tstamp_type */ enum bpf_prog_type type; /* Type of BPF program */ enum bpf_attach_type expected_attach_type; /* For some prog types */ u32 len; /* Number of filter blocks */ u32 jited_len; /* Size of jited insns in bytes */ u8 tag[BPF_TAG_SIZE]; struct bpf_prog_stats __percpu *stats; int __percpu *active; unsigned int (*bpf_func)(const void *ctx, const struct bpf_insn *insn); struct bpf_prog_aux *aux; /* Auxiliary fields */ struct sock_fprog_kern *orig_prog; /* Original BPF program */ /* Instructions for interpreter */ union { DECLARE_FLEX_ARRAY(struct sock_filter, insns); DECLARE_FLEX_ARRAY(struct bpf_insn, insnsi); }; }; struct bpf_array_aux { /* Programs with direct jumps into programs part of this array. */ struct list_head poke_progs; struct bpf_map *map; struct mutex poke_mutex; struct work_struct work; }; struct bpf_link { atomic64_t refcnt; u32 id; enum bpf_link_type type; const struct bpf_link_ops *ops; struct bpf_prog *prog; struct work_struct work; }; struct bpf_link_ops { void (*release)(struct bpf_link *link); void (*dealloc)(struct bpf_link *link); int (*detach)(struct bpf_link *link); int (*update_prog)(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog); void (*show_fdinfo)(const struct bpf_link *link, struct seq_file *seq); int (*fill_link_info)(const struct bpf_link *link, struct bpf_link_info *info); int (*update_map)(struct bpf_link *link, struct bpf_map *new_map, struct bpf_map *old_map); }; struct bpf_tramp_link { struct bpf_link link; struct hlist_node tramp_hlist; u64 cookie; }; struct bpf_shim_tramp_link { struct bpf_tramp_link link; struct bpf_trampoline *trampoline; }; struct bpf_tracing_link { struct bpf_tramp_link link; enum bpf_attach_type attach_type; struct bpf_trampoline *trampoline; struct bpf_prog *tgt_prog; }; struct bpf_link_primer { struct bpf_link *link; struct file *file; int fd; u32 id; }; struct bpf_struct_ops_value; struct btf_member; #define BPF_STRUCT_OPS_MAX_NR_MEMBERS 64 /** * struct bpf_struct_ops - A structure of callbacks allowing a subsystem to * define a BPF_MAP_TYPE_STRUCT_OPS map type composed * of BPF_PROG_TYPE_STRUCT_OPS progs. * @verifier_ops: A structure of callbacks that are invoked by the verifier * when determining whether the struct_ops progs in the * struct_ops map are valid. * @init: A callback that is invoked a single time, and before any other * callback, to initialize the structure. A nonzero return value means * the subsystem could not be initialized. * @check_member: When defined, a callback invoked by the verifier to allow * the subsystem to determine if an entry in the struct_ops map * is valid. A nonzero return value means that the map is * invalid and should be rejected by the verifier. * @init_member: A callback that is invoked for each member of the struct_ops * map to allow the subsystem to initialize the member. A nonzero * value means the member could not be initialized. This callback * is exclusive with the @type, @type_id, @value_type, and * @value_id fields. * @reg: A callback that is invoked when the struct_ops map has been * initialized and is being attached to. Zero means the struct_ops map * has been successfully registered and is live. A nonzero return value * means the struct_ops map could not be registered. * @unreg: A callback that is invoked when the struct_ops map should be * unregistered. * @update: A callback that is invoked when the live struct_ops map is being * updated to contain new values. This callback is only invoked when * the struct_ops map is loaded with BPF_F_LINK. If not defined, the * it is assumed that the struct_ops map cannot be updated. * @validate: A callback that is invoked after all of the members have been * initialized. This callback should perform static checks on the * map, meaning that it should either fail or succeed * deterministically. A struct_ops map that has been validated may * not necessarily succeed in being registered if the call to @reg * fails. For example, a valid struct_ops map may be loaded, but * then fail to be registered due to there being another active * struct_ops map on the system in the subsystem already. For this * reason, if this callback is not defined, the check is skipped as * the struct_ops map will have final verification performed in * @reg. * @type: BTF type. * @value_type: Value type. * @name: The name of the struct bpf_struct_ops object. * @func_models: Func models * @type_id: BTF type id. * @value_id: BTF value id. */ struct bpf_struct_ops { const struct bpf_verifier_ops *verifier_ops; int (*init)(struct btf *btf); int (*check_member)(const struct btf_type *t, const struct btf_member *member, const struct bpf_prog *prog); int (*init_member)(const struct btf_type *t, const struct btf_member *member, void *kdata, const void *udata); int (*reg)(void *kdata); void (*unreg)(void *kdata); int (*update)(void *kdata, void *old_kdata); int (*validate)(void *kdata); const struct btf_type *type; const struct btf_type *value_type; const char *name; struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS]; u32 type_id; u32 value_id; }; #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) #define BPF_MODULE_OWNER ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA)) const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id); void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log); bool bpf_struct_ops_get(const void *kdata); void bpf_struct_ops_put(const void *kdata); int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, void *value); int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, struct bpf_tramp_link *link, const struct btf_func_model *model, void *image, void *image_end); static inline bool bpf_try_module_get(const void *data, struct module *owner) { if (owner == BPF_MODULE_OWNER) return bpf_struct_ops_get(data); else return try_module_get(owner); } static inline void bpf_module_put(const void *data, struct module *owner) { if (owner == BPF_MODULE_OWNER) bpf_struct_ops_put(data); else module_put(owner); } int bpf_struct_ops_link_create(union bpf_attr *attr); #ifdef CONFIG_NET /* Define it here to avoid the use of forward declaration */ struct bpf_dummy_ops_state { int val; }; struct bpf_dummy_ops { int (*test_1)(struct bpf_dummy_ops_state *cb); int (*test_2)(struct bpf_dummy_ops_state *cb, int a1, unsigned short a2, char a3, unsigned long a4); int (*test_sleepable)(struct bpf_dummy_ops_state *cb); }; int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); #endif #else static inline const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id) { return NULL; } static inline void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log) { } static inline bool bpf_try_module_get(const void *data, struct module *owner) { return try_module_get(owner); } static inline void bpf_module_put(const void *data, struct module *owner) { module_put(owner); } static inline int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, void *value) { return -EINVAL; } static inline int bpf_struct_ops_link_create(union bpf_attr *attr) { return -EOPNOTSUPP; } #endif #if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM) int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, int cgroup_atype); void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog); #else static inline int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, int cgroup_atype) { return -EOPNOTSUPP; } static inline void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog) { } #endif struct bpf_array { struct bpf_map map; u32 elem_size; u32 index_mask; struct bpf_array_aux *aux; union { DECLARE_FLEX_ARRAY(char, value) __aligned(8); DECLARE_FLEX_ARRAY(void *, ptrs) __aligned(8); DECLARE_FLEX_ARRAY(void __percpu *, pptrs) __aligned(8); }; }; #define BPF_COMPLEXITY_LIMIT_INSNS 1000000 /* yes. 1M insns */ #define MAX_TAIL_CALL_CNT 33 /* Maximum number of loops for bpf_loop and bpf_iter_num. * It's enum to expose it (and thus make it discoverable) through BTF. */ enum { BPF_MAX_LOOPS = 8 * 1024 * 1024, }; #define BPF_F_ACCESS_MASK (BPF_F_RDONLY | \ BPF_F_RDONLY_PROG | \ BPF_F_WRONLY | \ BPF_F_WRONLY_PROG) #define BPF_MAP_CAN_READ BIT(0) #define BPF_MAP_CAN_WRITE BIT(1) /* Maximum number of user-producer ring buffer samples that can be drained in * a call to bpf_user_ringbuf_drain(). */ #define BPF_MAX_USER_RINGBUF_SAMPLES (128 * 1024) static inline u32 bpf_map_flags_to_cap(struct bpf_map *map) { u32 access_flags = map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG); /* Combination of BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG is * not possible. */ if (access_flags & BPF_F_RDONLY_PROG) return BPF_MAP_CAN_READ; else if (access_flags & BPF_F_WRONLY_PROG) return BPF_MAP_CAN_WRITE; else return BPF_MAP_CAN_READ | BPF_MAP_CAN_WRITE; } static inline bool bpf_map_flags_access_ok(u32 access_flags) { return (access_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) != (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG); } struct bpf_event_entry { struct perf_event *event; struct file *perf_file; struct file *map_file; struct rcu_head rcu; }; static inline bool map_type_contains_progs(struct bpf_map *map) { return map->map_type == BPF_MAP_TYPE_PROG_ARRAY || map->map_type == BPF_MAP_TYPE_DEVMAP || map->map_type == BPF_MAP_TYPE_CPUMAP; } bool bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp); int bpf_prog_calc_tag(struct bpf_prog *fp); const struct bpf_func_proto *bpf_get_trace_printk_proto(void); const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void); typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const void *src, unsigned long off, unsigned long len); typedef u32 (*bpf_convert_ctx_access_t)(enum bpf_access_type type, const struct bpf_insn *src, struct bpf_insn *dst, struct bpf_prog *prog, u32 *target_size); u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy); /* an array of programs to be executed under rcu_lock. * * Typical usage: * ret = bpf_prog_run_array(rcu_dereference(&bpf_prog_array), ctx, bpf_prog_run); * * the structure returned by bpf_prog_array_alloc() should be populated * with program pointers and the last pointer must be NULL. * The user has to keep refcnt on the program and make sure the program * is removed from the array before bpf_prog_put(). * The 'struct bpf_prog_array *' should only be replaced with xchg() * since other cpus are walking the array of pointers in parallel. */ struct bpf_prog_array_item { struct bpf_prog *prog; union { struct bpf_cgroup_storage *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; u64 bpf_cookie; }; }; struct bpf_prog_array { struct rcu_head rcu; struct bpf_prog_array_item items[]; }; struct bpf_empty_prog_array { struct bpf_prog_array hdr; struct bpf_prog *null_prog; }; /* to avoid allocating empty bpf_prog_array for cgroups that * don't have bpf program attached use one global 'bpf_empty_prog_array' * It will not be modified the caller of bpf_prog_array_alloc() * (since caller requested prog_cnt == 0) * that pointer should be 'freed' by bpf_prog_array_free() */ extern struct bpf_empty_prog_array bpf_empty_prog_array; struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); void bpf_prog_array_free(struct bpf_prog_array *progs); /* Use when traversal over the bpf_prog_array uses tasks_trace rcu */ void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs); int bpf_prog_array_length(struct bpf_prog_array *progs); bool bpf_prog_array_is_empty(struct bpf_prog_array *array); int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs, __u32 __user *prog_ids, u32 cnt); void bpf_prog_array_delete_safe(struct bpf_prog_array *progs, struct bpf_prog *old_prog); int bpf_prog_array_delete_safe_at(struct bpf_prog_array *array, int index); int bpf_prog_array_update_at(struct bpf_prog_array *array, int index, struct bpf_prog *prog); int bpf_prog_array_copy_info(struct bpf_prog_array *array, u32 *prog_ids, u32 request_cnt, u32 *prog_cnt); int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, u64 bpf_cookie, struct bpf_prog_array **new_array); struct bpf_run_ctx {}; struct bpf_cg_run_ctx { struct bpf_run_ctx run_ctx; const struct bpf_prog_array_item *prog_item; int retval; }; struct bpf_trace_run_ctx { struct bpf_run_ctx run_ctx; u64 bpf_cookie; bool is_uprobe; }; struct bpf_tramp_run_ctx { struct bpf_run_ctx run_ctx; u64 bpf_cookie; struct bpf_run_ctx *saved_run_ctx; }; static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx) { struct bpf_run_ctx *old_ctx = NULL; #ifdef CONFIG_BPF_SYSCALL old_ctx = current->bpf_ctx; current->bpf_ctx = new_ctx; #endif return old_ctx; } static inline void bpf_reset_run_ctx(struct bpf_run_ctx *old_ctx) { #ifdef CONFIG_BPF_SYSCALL current->bpf_ctx = old_ctx; #endif } /* BPF program asks to bypass CAP_NET_BIND_SERVICE in bind. */ #define BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE (1 << 0) /* BPF program asks to set CN on the packet. */ #define BPF_RET_SET_CN (1 << 0) typedef u32 (*bpf_prog_run_fn)(const struct bpf_prog *prog, const void *ctx); static __always_inline u32 bpf_prog_run_array(const struct bpf_prog_array *array, const void *ctx, bpf_prog_run_fn run_prog) { const struct bpf_prog_array_item *item; const struct bpf_prog *prog; struct bpf_run_ctx *old_run_ctx; struct bpf_trace_run_ctx run_ctx; u32 ret = 1; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held"); if (unlikely(!array)) return ret; run_ctx.is_uprobe = false; migrate_disable(); old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); item = &array->items[0]; while ((prog = READ_ONCE(item->prog))) { run_ctx.bpf_cookie = item->bpf_cookie; ret &= run_prog(prog, ctx); item++; } bpf_reset_run_ctx(old_run_ctx); migrate_enable(); return ret; } /* Notes on RCU design for bpf_prog_arrays containing sleepable programs: * * We use the tasks_trace rcu flavor read section to protect the bpf_prog_array * overall. As a result, we must use the bpf_prog_array_free_sleepable * in order to use the tasks_trace rcu grace period. * * When a non-sleepable program is inside the array, we take the rcu read * section and disable preemption for that program alone, so it can access * rcu-protected dynamically sized maps. */ static __always_inline u32 bpf_prog_run_array_uprobe(const struct bpf_prog_array __rcu *array_rcu, const void *ctx, bpf_prog_run_fn run_prog) { const struct bpf_prog_array_item *item; const struct bpf_prog *prog; const struct bpf_prog_array *array; struct bpf_run_ctx *old_run_ctx; struct bpf_trace_run_ctx run_ctx; u32 ret = 1; might_fault(); rcu_read_lock_trace(); migrate_disable(); run_ctx.is_uprobe = true; array = rcu_dereference_check(array_rcu, rcu_read_lock_trace_held()); if (unlikely(!array)) goto out; old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); item = &array->items[0]; while ((prog = READ_ONCE(item->prog))) { if (!prog->aux->sleepable) rcu_read_lock(); run_ctx.bpf_cookie = item->bpf_cookie; ret &= run_prog(prog, ctx); item++; if (!prog->aux->sleepable) rcu_read_unlock(); } bpf_reset_run_ctx(old_run_ctx); out: migrate_enable(); rcu_read_unlock_trace(); return ret; } #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); extern struct mutex bpf_stats_enabled_mutex; /* * Block execution of BPF programs attached to instrumentation (perf, * kprobes, tracepoints) to prevent deadlocks on map operations as any of * these events can happen inside a region which holds a map bucket lock * and can deadlock on it. */ static inline void bpf_disable_instrumentation(void) { migrate_disable(); this_cpu_inc(bpf_prog_active); } static inline void bpf_enable_instrumentation(void) { this_cpu_dec(bpf_prog_active); migrate_enable(); } extern const struct file_operations bpf_map_fops; extern const struct file_operations bpf_prog_fops; extern const struct file_operations bpf_iter_fops; #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ extern const struct bpf_prog_ops _name ## _prog_ops; \ extern const struct bpf_verifier_ops _name ## _verifier_ops; #define BPF_MAP_TYPE(_id, _ops) \ extern const struct bpf_map_ops _ops; #define BPF_LINK_TYPE(_id, _name) #include <linux/bpf_types.h> #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE #undef BPF_LINK_TYPE extern const struct bpf_prog_ops bpf_offload_prog_ops; extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops; extern const struct bpf_verifier_ops xdp_analyzer_ops; struct bpf_prog *bpf_prog_get(u32 ufd); struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, bool attach_drv); void bpf_prog_add(struct bpf_prog *prog, int i); void bpf_prog_sub(struct bpf_prog *prog, int i); void bpf_prog_inc(struct bpf_prog *prog); struct bpf_prog * __must_check bpf_prog_inc_not_zero(struct bpf_prog *prog); void bpf_prog_put(struct bpf_prog *prog); void bpf_prog_free_id(struct bpf_prog *prog); void bpf_map_free_id(struct bpf_map *map); struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset, u32 field_mask); void btf_record_free(struct btf_record *rec); void bpf_map_free_record(struct bpf_map *map); struct btf_record *btf_record_dup(const struct btf_record *rec); bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b); void bpf_obj_free_timer(const struct btf_record *rec, void *obj); void bpf_obj_free_fields(const struct btf_record *rec, void *obj); struct bpf_map *bpf_map_get(u32 ufd); struct bpf_map *bpf_map_get_with_uref(u32 ufd); struct bpf_map *__bpf_map_get(struct fd f); void bpf_map_inc(struct bpf_map *map); void bpf_map_inc_with_uref(struct bpf_map *map); struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref); struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map); void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); void *bpf_map_area_alloc(u64 size, int numa_node); void *bpf_map_area_mmapable_alloc(u64 size, int numa_node); void bpf_map_area_free(void *base); bool bpf_map_write_active(const struct bpf_map *map); void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); int generic_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); int generic_map_update_batch(struct bpf_map *map, struct file *map_file, const union bpf_attr *attr, union bpf_attr __user *uattr); int generic_map_delete_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); struct bpf_map *bpf_map_get_curr_or_next(u32 *id); struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id); #ifdef CONFIG_MEMCG_KMEM void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, int node); void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags); void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size, gfp_t flags); void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align, gfp_t flags); #else static inline void * bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, int node) { return kmalloc_node(size, flags, node); } static inline void * bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) { return kzalloc(size, flags); } static inline void * bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size, gfp_t flags) { return kvcalloc(n, size, flags); } static inline void __percpu * bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align, gfp_t flags) { return __alloc_percpu_gfp(size, align, flags); } #endif static inline int bpf_map_init_elem_count(struct bpf_map *map) { size_t size = sizeof(*map->elem_count), align = size; gfp_t flags = GFP_USER | __GFP_NOWARN; map->elem_count = bpf_map_alloc_percpu(map, size, align, flags); if (!map->elem_count) return -ENOMEM; return 0; } static inline void bpf_map_free_elem_count(struct bpf_map *map) { free_percpu(map->elem_count); } static inline void bpf_map_inc_elem_count(struct bpf_map *map) { this_cpu_inc(*map->elem_count); } static inline void bpf_map_dec_elem_count(struct bpf_map *map) { this_cpu_dec(*map->elem_count); } extern int sysctl_unprivileged_bpf_disabled; static inline bool bpf_allow_ptr_leaks(void) { return perfmon_capable(); } static inline bool bpf_allow_uninit_stack(void) { return perfmon_capable(); } static inline bool bpf_bypass_spec_v1(void) { return perfmon_capable(); } static inline bool bpf_bypass_spec_v4(void) { return perfmon_capable(); } int bpf_map_new_fd(struct bpf_map *map, int flags); int bpf_prog_new_fd(struct bpf_prog *prog); void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog); int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer); int bpf_link_settle(struct bpf_link_primer *primer); void bpf_link_cleanup(struct bpf_link_primer *primer); void bpf_link_inc(struct bpf_link *link); void bpf_link_put(struct bpf_link *link); int bpf_link_new_fd(struct bpf_link *link); struct bpf_link *bpf_link_get_from_fd(u32 ufd); struct bpf_link *bpf_link_get_curr_or_next(u32 *id); int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname); int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags); #define BPF_ITER_FUNC_PREFIX "bpf_iter_" #define DEFINE_BPF_ITER_FUNC(target, args...) \ extern int bpf_iter_ ## target(args); \ int __init bpf_iter_ ## target(args) { return 0; } /* * The task type of iterators. * * For BPF task iterators, they can be parameterized with various * parameters to visit only some of tasks. * * BPF_TASK_ITER_ALL (default) * Iterate over resources of every task. * * BPF_TASK_ITER_TID * Iterate over resources of a task/tid. * * BPF_TASK_ITER_TGID * Iterate over resources of every task of a process / task group. */ enum bpf_iter_task_type { BPF_TASK_ITER_ALL = 0, BPF_TASK_ITER_TID, BPF_TASK_ITER_TGID, }; struct bpf_iter_aux_info { /* for map_elem iter */ struct bpf_map *map; /* for cgroup iter */ struct { struct cgroup *start; /* starting cgroup */ enum bpf_cgroup_iter_order order; } cgroup; struct { enum bpf_iter_task_type type; u32 pid; } task; }; typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, union bpf_iter_link_info *linfo, struct bpf_iter_aux_info *aux); typedef void (*bpf_iter_detach_target_t)(struct bpf_iter_aux_info *aux); typedef void (*bpf_iter_show_fdinfo_t) (const struct bpf_iter_aux_info *aux, struct seq_file *seq); typedef int (*bpf_iter_fill_link_info_t)(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info); typedef const struct bpf_func_proto * (*bpf_iter_get_func_proto_t)(enum bpf_func_id func_id, const struct bpf_prog *prog); enum bpf_iter_feature { BPF_ITER_RESCHED = BIT(0), }; #define BPF_ITER_CTX_ARG_MAX 2 struct bpf_iter_reg { const char *target; bpf_iter_attach_target_t attach_target; bpf_iter_detach_target_t detach_target; bpf_iter_show_fdinfo_t show_fdinfo; bpf_iter_fill_link_info_t fill_link_info; bpf_iter_get_func_proto_t get_func_proto; u32 ctx_arg_info_size; u32 feature; struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX]; const struct bpf_iter_seq_info *seq_info; }; struct bpf_iter_meta { __bpf_md_ptr(struct seq_file *, seq); u64 session_id; u64 seq_num; }; struct bpf_iter__bpf_map_elem { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct bpf_map *, map); __bpf_md_ptr(void *, key); __bpf_md_ptr(void *, value); }; int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info); void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info); bool bpf_iter_prog_supported(struct bpf_prog *prog); const struct bpf_func_proto * bpf_iter_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_prog *prog); int bpf_iter_new_fd(struct bpf_link *link); bool bpf_link_is_iter(struct bpf_link *link); struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop); int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx); void bpf_iter_map_show_fdinfo(const struct bpf_iter_aux_info *aux, struct seq_file *seq); int bpf_iter_map_fill_link_info(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info); int map_set_for_each_callback_args(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee); int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value); int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags); int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value); int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags); int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value); int bpf_get_file_flag(int flags); int bpf_check_uarg_tail_zero(bpfptr_t uaddr, size_t expected_size, size_t actual_size); /* verify correctness of eBPF program */ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size); #ifndef CONFIG_BPF_JIT_ALWAYS_ON void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); #endif struct btf *bpf_get_btf_vmlinux(void); /* Map specifics */ struct xdp_frame; struct sk_buff; struct bpf_dtab_netdev; struct bpf_cpu_map_entry; void __dev_flush(void); int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx); int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, struct net_device *dev_rx); int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_map *map, bool exclude_ingress); int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, struct bpf_prog *xdp_prog); int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, struct bpf_prog *xdp_prog, struct bpf_map *map, bool exclude_ingress); void __cpu_map_flush(void); int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf, struct net_device *dev_rx); int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu, struct sk_buff *skb); /* Return map's numa specified by userspace */ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) { return (attr->map_flags & BPF_F_NUMA_NODE) ? attr->numa_node : NUMA_NO_NODE; } struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type); int array_map_alloc_check(union bpf_attr *attr); int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); int bpf_prog_test_run_tracing(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); int bpf_prog_test_run_nf(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info); static inline bool bpf_tracing_ctx_access(int off, int size, enum bpf_access_type type) { if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) return false; if (type != BPF_READ) return false; if (off % size != 0) return false; return true; } static inline bool bpf_tracing_btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (!bpf_tracing_ctx_access(off, size, type)) return false; return btf_ctx_access(off, size, type, prog, info); } int btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size, enum bpf_access_type atype, u32 *next_btf_id, enum bpf_type_flag *flag, const char **field_name); bool btf_struct_ids_match(struct bpf_verifier_log *log, const struct btf *btf, u32 id, int off, const struct btf *need_btf, u32 need_type_id, bool strict); int btf_distill_func_proto(struct bpf_verifier_log *log, struct btf *btf, const struct btf_type *func_proto, const char *func_name, struct btf_func_model *m); struct bpf_reg_state; int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *reg); int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, struct btf *btf, const struct btf_type *t); struct bpf_prog *bpf_prog_by_id(u32 id); struct bpf_link *bpf_link_by_id(u32 id); const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id); void bpf_task_storage_free(struct task_struct *task); void bpf_cgrp_storage_free(struct cgroup *cgroup); bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog); const struct btf_func_model * bpf_jit_find_kfunc_model(const struct bpf_prog *prog, const struct bpf_insn *insn); int bpf_get_kfunc_addr(const struct bpf_prog *prog, u32 func_id, u16 btf_fd_idx, u8 **func_addr); struct bpf_core_ctx { struct bpf_verifier_log *log; const struct btf *btf; }; bool btf_nested_type_is_trusted(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, const char *field_name, u32 btf_id, const char *suffix); bool btf_type_ids_nocast_alias(struct bpf_verifier_log *log, const struct btf *reg_btf, u32 reg_id, const struct btf *arg_btf, u32 arg_id); int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, int relo_idx, void *insn); static inline bool unprivileged_ebpf_enabled(void) { return !sysctl_unprivileged_bpf_disabled; } /* Not all bpf prog type has the bpf_ctx. * For the bpf prog type that has initialized the bpf_ctx, * this function can be used to decide if a kernel function * is called by a bpf program. */ static inline bool has_current_bpf_ctx(void) { return !!current->bpf_ctx; } void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog); void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type, u32 offset, u32 size); void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr); void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { return ERR_PTR(-EOPNOTSUPP); } static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, bool attach_drv) { return ERR_PTR(-EOPNOTSUPP); } static inline void bpf_prog_add(struct bpf_prog *prog, int i) { } static inline void bpf_prog_sub(struct bpf_prog *prog, int i) { } static inline void bpf_prog_put(struct bpf_prog *prog) { } static inline void bpf_prog_inc(struct bpf_prog *prog) { } static inline struct bpf_prog *__must_check bpf_prog_inc_not_zero(struct bpf_prog *prog) { return ERR_PTR(-EOPNOTSUPP); } static inline void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog) { } static inline int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer) { return -EOPNOTSUPP; } static inline int bpf_link_settle(struct bpf_link_primer *primer) { return -EOPNOTSUPP; } static inline void bpf_link_cleanup(struct bpf_link_primer *primer) { } static inline void bpf_link_inc(struct bpf_link *link) { } static inline void bpf_link_put(struct bpf_link *link) { } static inline int bpf_obj_get_user(const char __user *pathname, int flags) { return -EOPNOTSUPP; } static inline void __dev_flush(void) { } struct xdp_frame; struct bpf_dtab_netdev; struct bpf_cpu_map_entry; static inline int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx) { return 0; } static inline int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, struct net_device *dev_rx) { return 0; } static inline int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_map *map, bool exclude_ingress) { return 0; } struct sk_buff; static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, struct bpf_prog *xdp_prog) { return 0; } static inline int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, struct bpf_prog *xdp_prog, struct bpf_map *map, bool exclude_ingress) { return 0; } static inline void __cpu_map_flush(void) { } static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf, struct net_device *dev_rx) { return 0; } static inline int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu, struct sk_buff *skb) { return -EOPNOTSUPP; } static inline struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type) { return ERR_PTR(-EOPNOTSUPP); } static inline int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { return -ENOTSUPP; } static inline int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { return -ENOTSUPP; } static inline int bpf_prog_test_run_tracing(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { return -ENOTSUPP; } static inline int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { return -ENOTSUPP; } static inline int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { return -ENOTSUPP; } static inline void bpf_map_put(struct bpf_map *map) { } static inline struct bpf_prog *bpf_prog_by_id(u32 id) { return ERR_PTR(-ENOTSUPP); } static inline int btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size, enum bpf_access_type atype, u32 *next_btf_id, enum bpf_type_flag *flag, const char **field_name) { return -EACCES; } static inline const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { return NULL; } static inline void bpf_task_storage_free(struct task_struct *task) { } static inline bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog) { return false; } static inline const struct btf_func_model * bpf_jit_find_kfunc_model(const struct bpf_prog *prog, const struct bpf_insn *insn) { return NULL; } static inline int bpf_get_kfunc_addr(const struct bpf_prog *prog, u32 func_id, u16 btf_fd_idx, u8 **func_addr) { return -ENOTSUPP; } static inline bool unprivileged_ebpf_enabled(void) { return false; } static inline bool has_current_bpf_ctx(void) { return false; } static inline void bpf_prog_inc_misses_counter(struct bpf_prog *prog) { } static inline void bpf_cgrp_storage_free(struct cgroup *cgroup) { } static inline void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type, u32 offset, u32 size) { } static inline void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) { } static inline void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr) { } #endif /* CONFIG_BPF_SYSCALL */ static __always_inline int bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr) { int ret = -EFAULT; if (IS_ENABLED(CONFIG_BPF_EVENTS)) ret = copy_from_kernel_nofault(dst, unsafe_ptr, size); if (unlikely(ret < 0)) memset(dst, 0, size); return ret; } void __bpf_free_used_btfs(struct bpf_prog_aux *aux, struct btf_mod_pair *used_btfs, u32 len); static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) { return bpf_prog_get_type_dev(ufd, type, false); } void __bpf_free_used_maps(struct bpf_prog_aux *aux, struct bpf_map **used_maps, u32 len); bool bpf_prog_get_ok(struct bpf_prog *, enum bpf_prog_type *, bool); int bpf_prog_offload_compile(struct bpf_prog *prog); void bpf_prog_dev_bound_destroy(struct bpf_prog *prog); int bpf_prog_offload_info_fill(struct bpf_prog_info *info, struct bpf_prog *prog); int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map); int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value); int bpf_map_offload_update_elem(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_map_offload_delete_elem(struct bpf_map *map, void *key); int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key); bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map); struct bpf_offload_dev * bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv); void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev); void *bpf_offload_dev_priv(struct bpf_offload_dev *offdev); int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev, struct net_device *netdev); void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev, struct net_device *netdev); bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev); void unpriv_ebpf_notify(int new_state); #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_dev_bound_kfunc_check(struct bpf_verifier_log *log, struct bpf_prog_aux *prog_aux); void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id); int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr); int bpf_prog_dev_bound_inherit(struct bpf_prog *new_prog, struct bpf_prog *old_prog); void bpf_dev_bound_netdev_unregister(struct net_device *dev); static inline bool bpf_prog_is_dev_bound(const struct bpf_prog_aux *aux) { return aux->dev_bound; } static inline bool bpf_prog_is_offloaded(const struct bpf_prog_aux *aux) { return aux->offload_requested; } bool bpf_prog_dev_bound_match(const struct bpf_prog *lhs, const struct bpf_prog *rhs); static inline bool bpf_map_is_offloaded(struct bpf_map *map) { return unlikely(map->ops == &bpf_map_offload_ops); } struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr); void bpf_map_offload_map_free(struct bpf_map *map); u64 bpf_map_offload_map_mem_usage(const struct bpf_map *map); int bpf_prog_test_run_syscall(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog); int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags); int sock_map_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); void sock_map_unhash(struct sock *sk); void sock_map_destroy(struct sock *sk); void sock_map_close(struct sock *sk, long timeout); #else static inline int bpf_dev_bound_kfunc_check(struct bpf_verifier_log *log, struct bpf_prog_aux *prog_aux) { return -EOPNOTSUPP; } static inline void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id) { return NULL; } static inline int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr) { return -EOPNOTSUPP; } static inline int bpf_prog_dev_bound_inherit(struct bpf_prog *new_prog, struct bpf_prog *old_prog) { return -EOPNOTSUPP; } static inline void bpf_dev_bound_netdev_unregister(struct net_device *dev) { } static inline bool bpf_prog_is_dev_bound(const struct bpf_prog_aux *aux) { return false; } static inline bool bpf_prog_is_offloaded(struct bpf_prog_aux *aux) { return false; } static inline bool bpf_prog_dev_bound_match(const struct bpf_prog *lhs, const struct bpf_prog *rhs) { return false; } static inline bool bpf_map_is_offloaded(struct bpf_map *map) { return false; } static inline struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) { return ERR_PTR(-EOPNOTSUPP); } static inline void bpf_map_offload_map_free(struct bpf_map *map) { } static inline u64 bpf_map_offload_map_mem_usage(const struct bpf_map *map) { return 0; } static inline int bpf_prog_test_run_syscall(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { return -ENOTSUPP; } #ifdef CONFIG_BPF_SYSCALL static inline int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) { return -EINVAL; } static inline int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) { return -EOPNOTSUPP; } static inline int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags) { return -EOPNOTSUPP; } static inline int sock_map_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { return -EINVAL; } #endif /* CONFIG_BPF_SYSCALL */ #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ #if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) void bpf_sk_reuseport_detach(struct sock *sk); int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, void *value); int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags); #else static inline void bpf_sk_reuseport_detach(struct sock *sk) { } #ifdef CONFIG_BPF_SYSCALL static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, void *value) { return -EOPNOTSUPP; } static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { return -EOPNOTSUPP; } #endif /* CONFIG_BPF_SYSCALL */ #endif /* defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) */ /* verifier prototypes for helper functions called from eBPF programs */ extern const struct bpf_func_proto bpf_map_lookup_elem_proto; extern const struct bpf_func_proto bpf_map_update_elem_proto; extern const struct bpf_func_proto bpf_map_delete_elem_proto; extern const struct bpf_func_proto bpf_map_push_elem_proto; extern const struct bpf_func_proto bpf_map_pop_elem_proto; extern const struct bpf_func_proto bpf_map_peek_elem_proto; extern const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto; extern const struct bpf_func_proto bpf_get_prandom_u32_proto; extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; extern const struct bpf_func_proto bpf_get_numa_node_id_proto; extern const struct bpf_func_proto bpf_tail_call_proto; extern const struct bpf_func_proto bpf_ktime_get_ns_proto; extern const struct bpf_func_proto bpf_ktime_get_boot_ns_proto; extern const struct bpf_func_proto bpf_ktime_get_tai_ns_proto; extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto; extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; extern const struct bpf_func_proto bpf_get_current_comm_proto; extern const struct bpf_func_proto bpf_get_stackid_proto; extern const struct bpf_func_proto bpf_get_stack_proto; extern const struct bpf_func_proto bpf_get_task_stack_proto; extern const struct bpf_func_proto bpf_get_stackid_proto_pe; extern const struct bpf_func_proto bpf_get_stack_proto_pe; extern const struct bpf_func_proto bpf_sock_map_update_proto; extern const struct bpf_func_proto bpf_sock_hash_update_proto; extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; extern const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto; extern const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto; extern const struct bpf_func_proto bpf_msg_redirect_hash_proto; extern const struct bpf_func_proto bpf_msg_redirect_map_proto; extern const struct bpf_func_proto bpf_sk_redirect_hash_proto; extern const struct bpf_func_proto bpf_sk_redirect_map_proto; extern const struct bpf_func_proto bpf_spin_lock_proto; extern const struct bpf_func_proto bpf_spin_unlock_proto; extern const struct bpf_func_proto bpf_get_local_storage_proto; extern const struct bpf_func_proto bpf_strtol_proto; extern const struct bpf_func_proto bpf_strtoul_proto; extern const struct bpf_func_proto bpf_tcp_sock_proto; extern const struct bpf_func_proto bpf_jiffies64_proto; extern const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto; extern const struct bpf_func_proto bpf_event_output_data_proto; extern const struct bpf_func_proto bpf_ringbuf_output_proto; extern const struct bpf_func_proto bpf_ringbuf_reserve_proto; extern const struct bpf_func_proto bpf_ringbuf_submit_proto; extern const struct bpf_func_proto bpf_ringbuf_discard_proto; extern const struct bpf_func_proto bpf_ringbuf_query_proto; extern const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto; extern const struct bpf_func_proto bpf_ringbuf_submit_dynptr_proto; extern const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto; extern const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; extern const struct bpf_func_proto bpf_skc_to_unix_sock_proto; extern const struct bpf_func_proto bpf_skc_to_mptcp_sock_proto; extern const struct bpf_func_proto bpf_copy_from_user_proto; extern const struct bpf_func_proto bpf_snprintf_btf_proto; extern const struct bpf_func_proto bpf_snprintf_proto; extern const struct bpf_func_proto bpf_per_cpu_ptr_proto; extern const struct bpf_func_proto bpf_this_cpu_ptr_proto; extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto; extern const struct bpf_func_proto bpf_sock_from_file_proto; extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto; extern const struct bpf_func_proto bpf_task_storage_get_recur_proto; extern const struct bpf_func_proto bpf_task_storage_get_proto; extern const struct bpf_func_proto bpf_task_storage_delete_recur_proto; extern const struct bpf_func_proto bpf_task_storage_delete_proto; extern const struct bpf_func_proto bpf_for_each_map_elem_proto; extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto; extern const struct bpf_func_proto bpf_sk_setsockopt_proto; extern const struct bpf_func_proto bpf_sk_getsockopt_proto; extern const struct bpf_func_proto bpf_unlocked_sk_setsockopt_proto; extern const struct bpf_func_proto bpf_unlocked_sk_getsockopt_proto; extern const struct bpf_func_proto bpf_find_vma_proto; extern const struct bpf_func_proto bpf_loop_proto; extern const struct bpf_func_proto bpf_copy_from_user_task_proto; extern const struct bpf_func_proto bpf_set_retval_proto; extern const struct bpf_func_proto bpf_get_retval_proto; extern const struct bpf_func_proto bpf_user_ringbuf_drain_proto; extern const struct bpf_func_proto bpf_cgrp_storage_get_proto; extern const struct bpf_func_proto bpf_cgrp_storage_delete_proto; const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); u64 bpf_get_raw_cpu_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); #if defined(CONFIG_NET) bool bpf_sock_common_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info); bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info); u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size); int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags, struct bpf_dynptr_kern *ptr); #else static inline bool bpf_sock_common_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { return false; } static inline bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { return false; } static inline u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { return 0; } static inline int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags, struct bpf_dynptr_kern *ptr) { return -EOPNOTSUPP; } #endif #ifdef CONFIG_INET struct sk_reuseport_kern { struct sk_buff *skb; struct sock *sk; struct sock *selected_sk; struct sock *migrating_sk; void *data_end; u32 hash; u32 reuseport_id; bool bind_inany; }; bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info); u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size); bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info); u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size); #else static inline bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { return false; } static inline u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { return 0; } static inline bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { return false; } static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { return 0; } #endif /* CONFIG_INET */ enum bpf_text_poke_type { BPF_MOD_CALL, BPF_MOD_JUMP, }; int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *addr1, void *addr2); void *bpf_arch_text_copy(void *dst, void *src, size_t len); int bpf_arch_text_invalidate(void *dst, size_t len); struct btf_id_set; bool btf_id_set_contains(const struct btf_id_set *set, u32 id); #define MAX_BPRINTF_VARARGS 12 #define MAX_BPRINTF_BUF 1024 struct bpf_bprintf_data { u32 *bin_args; char *buf; bool get_bin_args; bool get_buf; }; int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, u32 num_args, struct bpf_bprintf_data *data); void bpf_bprintf_cleanup(struct bpf_bprintf_data *data); #ifdef CONFIG_BPF_LSM void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype); void bpf_cgroup_atype_put(int cgroup_atype); #else static inline void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype) {} static inline void bpf_cgroup_atype_put(int cgroup_atype) {} #endif /* CONFIG_BPF_LSM */ struct key; #ifdef CONFIG_KEYS struct bpf_key { struct key *key; bool has_ref; }; #endif /* CONFIG_KEYS */ static inline bool type_is_alloc(u32 type) { return type & MEM_ALLOC; } static inline gfp_t bpf_memcg_flags(gfp_t flags) { if (memcg_bpf_enabled()) return flags | __GFP_ACCOUNT; return flags; } #endif /* _LINUX_BPF_H */
5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 /* * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright 2008-2011 Luis R. Rodriguez <mcgrof@qca.qualcomm.com> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2017 Intel Deutschland GmbH * Copyright (C) 2018 - 2023 Intel Corporation * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ /** * DOC: Wireless regulatory infrastructure * * The usual implementation is for a driver to read a device EEPROM to * determine which regulatory domain it should be operating under, then * looking up the allowable channels in a driver-local table and finally * registering those channels in the wiphy structure. * * Another set of compliance enforcement is for drivers to use their * own compliance limits which can be stored on the EEPROM. The host * driver or firmware may ensure these are used. * * In addition to all this we provide an extra layer of regulatory * conformance. For drivers which do not have any regulatory * information CRDA provides the complete regulatory solution. * For others it provides a community effort on further restrictions * to enhance compliance. * * Note: When number of rules --> infinity we will not be able to * index on alpha2 any more, instead we'll probably have to * rely on some SHA1 checksum of the regdomain for example. * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/ctype.h> #include <linux/nl80211.h> #include <linux/platform_device.h> #include <linux/verification.h> #include <linux/moduleparam.h> #include <linux/firmware.h> #include <net/cfg80211.h> #include "core.h" #include "reg.h" #include "rdev-ops.h" #include "nl80211.h" /* * Grace period we give before making sure all current interfaces reside on * channels allowed by the current regulatory domain. */ #define REG_ENFORCE_GRACE_MS 60000 /** * enum reg_request_treatment - regulatory request treatment * * @REG_REQ_OK: continue processing the regulatory request * @REG_REQ_IGNORE: ignore the regulatory request * @REG_REQ_INTERSECT: the regulatory domain resulting from this request should * be intersected with the current one. * @REG_REQ_ALREADY_SET: the regulatory request will not change the current * regulatory settings, and no further processing is required. */ enum reg_request_treatment { REG_REQ_OK, REG_REQ_IGNORE, REG_REQ_INTERSECT, REG_REQ_ALREADY_SET, }; static struct regulatory_request core_request_world = { .initiator = NL80211_REGDOM_SET_BY_CORE, .alpha2[0] = '0', .alpha2[1] = '0', .intersect = false, .processed = true, .country_ie_env = ENVIRON_ANY, }; /* * Receipt of information from last regulatory request, * protected by RTNL (and can be accessed with RCU protection) */ static struct regulatory_request __rcu *last_request = (void __force __rcu *)&core_request_world; /* To trigger userspace events and load firmware */ static struct platform_device *reg_pdev; /* * Central wireless core regulatory domains, we only need two, * the current one and a world regulatory domain in case we have no * information to give us an alpha2. * (protected by RTNL, can be read under RCU) */ const struct ieee80211_regdomain __rcu *cfg80211_regdomain; /* * Number of devices that registered to the core * that support cellular base station regulatory hints * (protected by RTNL) */ static int reg_num_devs_support_basehint; /* * State variable indicating if the platform on which the devices * are attached is operating in an indoor environment. The state variable * is relevant for all registered devices. */ static bool reg_is_indoor; static DEFINE_SPINLOCK(reg_indoor_lock); /* Used to track the userspace process controlling the indoor setting */ static u32 reg_is_indoor_portid; static void restore_regulatory_settings(bool reset_user, bool cached); static void print_regdomain(const struct ieee80211_regdomain *rd); static void reg_process_hint(struct regulatory_request *reg_request); static const struct ieee80211_regdomain *get_cfg80211_regdom(void) { return rcu_dereference_rtnl(cfg80211_regdomain); } /* * Returns the regulatory domain associated with the wiphy. * * Requires any of RTNL, wiphy mutex or RCU protection. */ const struct ieee80211_regdomain *get_wiphy_regdom(struct wiphy *wiphy) { return rcu_dereference_check(wiphy->regd, lockdep_is_held(&wiphy->mtx) || lockdep_rtnl_is_held()); } EXPORT_SYMBOL(get_wiphy_regdom); static const char *reg_dfs_region_str(enum nl80211_dfs_regions dfs_region) { switch (dfs_region) { case NL80211_DFS_UNSET: return "unset"; case NL80211_DFS_FCC: return "FCC"; case NL80211_DFS_ETSI: return "ETSI"; case NL80211_DFS_JP: return "JP"; } return "Unknown"; } enum nl80211_dfs_regions reg_get_dfs_region(struct wiphy *wiphy) { const struct ieee80211_regdomain *regd = NULL; const struct ieee80211_regdomain *wiphy_regd = NULL; enum nl80211_dfs_regions dfs_region; rcu_read_lock(); regd = get_cfg80211_regdom(); dfs_region = regd->dfs_region; if (!wiphy) goto out; wiphy_regd = get_wiphy_regdom(wiphy); if (!wiphy_regd) goto out; if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) { dfs_region = wiphy_regd->dfs_region; goto out; } if (wiphy_regd->dfs_region == regd->dfs_region) goto out; pr_debug("%s: device specific dfs_region (%s) disagrees with cfg80211's central dfs_region (%s)\n", dev_name(&wiphy->dev), reg_dfs_region_str(wiphy_regd->dfs_region), reg_dfs_region_str(regd->dfs_region)); out: rcu_read_unlock(); return dfs_region; } static void rcu_free_regdom(const struct ieee80211_regdomain *r) { if (!r) return; kfree_rcu((struct ieee80211_regdomain *)r, rcu_head); } static struct regulatory_request *get_last_request(void) { return rcu_dereference_rtnl(last_request); } /* Used to queue up regulatory hints */ static LIST_HEAD(reg_requests_list); static DEFINE_SPINLOCK(reg_requests_lock); /* Used to queue up beacon hints for review */ static LIST_HEAD(reg_pending_beacons); static DEFINE_SPINLOCK(reg_pending_beacons_lock); /* Used to keep track of processed beacon hints */ static LIST_HEAD(reg_beacon_list); struct reg_beacon { struct list_head list; struct ieee80211_channel chan; }; static void reg_check_chans_work(struct work_struct *work); static DECLARE_DELAYED_WORK(reg_check_chans, reg_check_chans_work); static void reg_todo(struct work_struct *work); static DECLARE_WORK(reg_work, reg_todo); /* We keep a static world regulatory domain in case of the absence of CRDA */ static const struct ieee80211_regdomain world_regdom = { .n_reg_rules = 8, .alpha2 = "00", .reg_rules = { /* IEEE 802.11b/g, channels 1..11 */ REG_RULE(2412-10, 2462+10, 40, 6, 20, 0), /* IEEE 802.11b/g, channels 12..13. */ REG_RULE(2467-10, 2472+10, 20, 6, 20, NL80211_RRF_NO_IR | NL80211_RRF_AUTO_BW), /* IEEE 802.11 channel 14 - Only JP enables * this and for 802.11b only */ REG_RULE(2484-10, 2484+10, 20, 6, 20, NL80211_RRF_NO_IR | NL80211_RRF_NO_OFDM), /* IEEE 802.11a, channel 36..48 */ REG_RULE(5180-10, 5240+10, 80, 6, 20, NL80211_RRF_NO_IR | NL80211_RRF_AUTO_BW), /* IEEE 802.11a, channel 52..64 - DFS required */ REG_RULE(5260-10, 5320+10, 80, 6, 20, NL80211_RRF_NO_IR | NL80211_RRF_AUTO_BW | NL80211_RRF_DFS), /* IEEE 802.11a, channel 100..144 - DFS required */ REG_RULE(5500-10, 5720+10, 160, 6, 20, NL80211_RRF_NO_IR | NL80211_RRF_DFS), /* IEEE 802.11a, channel 149..165 */ REG_RULE(5745-10, 5825+10, 80, 6, 20, NL80211_RRF_NO_IR), /* IEEE 802.11ad (60GHz), channels 1..3 */ REG_RULE(56160+2160*1-1080, 56160+2160*3+1080, 2160, 0, 0, 0), } }; /* protected by RTNL */ static const struct ieee80211_regdomain *cfg80211_world_regdom = &world_regdom; static char *ieee80211_regdom = "00"; static char user_alpha2[2]; static const struct ieee80211_regdomain *cfg80211_user_regdom; module_param(ieee80211_regdom, charp, 0444); MODULE_PARM_DESC(ieee80211_regdom, "IEEE 802.11 regulatory domain code"); static void reg_free_request(struct regulatory_request *request) { if (request == &core_request_world) return; if (request != get_last_request()) kfree(request); } static void reg_free_last_request(void) { struct regulatory_request *lr = get_last_request(); if (lr != &core_request_world && lr) kfree_rcu(lr, rcu_head); } static void reg_update_last_request(struct regulatory_request *request) { struct regulatory_request *lr; lr = get_last_request(); if (lr == request) return; reg_free_last_request(); rcu_assign_pointer(last_request, request); } static void reset_regdomains(bool full_reset, const struct ieee80211_regdomain *new_regdom) { const struct ieee80211_regdomain *r; ASSERT_RTNL(); r = get_cfg80211_regdom(); /* avoid freeing static information or freeing something twice */ if (r == cfg80211_world_regdom) r = NULL; if (cfg80211_world_regdom == &world_regdom) cfg80211_world_regdom = NULL; if (r == &world_regdom) r = NULL; rcu_free_regdom(r); rcu_free_regdom(cfg80211_world_regdom); cfg80211_world_regdom = &world_regdom; rcu_assign_pointer(cfg80211_regdomain, new_regdom); if (!full_reset) return; reg_update_last_request(&core_request_world); } /* * Dynamic world regulatory domain requested by the wireless * core upon initialization */ static void update_world_regdomain(const struct ieee80211_regdomain *rd) { struct regulatory_request *lr; lr = get_last_request(); WARN_ON(!lr); reset_regdomains(false, rd); cfg80211_world_regdom = rd; } bool is_world_regdom(const char *alpha2) { if (!alpha2) return false; return alpha2[0] == '0' && alpha2[1] == '0'; } static bool is_alpha2_set(const char *alpha2) { if (!alpha2) return false; return alpha2[0] && alpha2[1]; } static bool is_unknown_alpha2(const char *alpha2) { if (!alpha2) return false; /* * Special case where regulatory domain was built by driver * but a specific alpha2 cannot be determined */ return alpha2[0] == '9' && alpha2[1] == '9'; } static bool is_intersected_alpha2(const char *alpha2) { if (!alpha2) return false; /* * Special case where regulatory domain is the * result of an intersection between two regulatory domain * structures */ return alpha2[0] == '9' && alpha2[1] == '8'; } static bool is_an_alpha2(const char *alpha2) { if (!alpha2) return false; return isalpha(alpha2[0]) && isalpha(alpha2[1]); } static bool alpha2_equal(const char *alpha2_x, const char *alpha2_y) { if (!alpha2_x || !alpha2_y) return false; return alpha2_x[0] == alpha2_y[0] && alpha2_x[1] == alpha2_y[1]; } static bool regdom_changes(const char *alpha2) { const struct ieee80211_regdomain *r = get_cfg80211_regdom(); if (!r) return true; return !alpha2_equal(r->alpha2, alpha2); } /* * The NL80211_REGDOM_SET_BY_USER regdom alpha2 is cached, this lets * you know if a valid regulatory hint with NL80211_REGDOM_SET_BY_USER * has ever been issued. */ static bool is_user_regdom_saved(void) { if (user_alpha2[0] == '9' && user_alpha2[1] == '7') return false; /* This would indicate a mistake on the design */ if (WARN(!is_world_regdom(user_alpha2) && !is_an_alpha2(user_alpha2), "Unexpected user alpha2: %c%c\n", user_alpha2[0], user_alpha2[1])) return false; return true; } static const struct ieee80211_regdomain * reg_copy_regd(const struct ieee80211_regdomain *src_regd) { struct ieee80211_regdomain *regd; unsigned int i; regd = kzalloc(struct_size(regd, reg_rules, src_regd->n_reg_rules), GFP_KERNEL); if (!regd) return ERR_PTR(-ENOMEM); memcpy(regd, src_regd, sizeof(struct ieee80211_regdomain)); for (i = 0; i < src_regd->n_reg_rules; i++) memcpy(&regd->reg_rules[i], &src_regd->reg_rules[i], sizeof(struct ieee80211_reg_rule)); return regd; } static void cfg80211_save_user_regdom(const struct ieee80211_regdomain *rd) { ASSERT_RTNL(); if (!IS_ERR(cfg80211_user_regdom)) kfree(cfg80211_user_regdom); cfg80211_user_regdom = reg_copy_regd(rd); } struct reg_regdb_apply_request { struct list_head list; const struct ieee80211_regdomain *regdom; }; static LIST_HEAD(reg_regdb_apply_list); static DEFINE_MUTEX(reg_regdb_apply_mutex); static void reg_regdb_apply(struct work_struct *work) { struct reg_regdb_apply_request *request; rtnl_lock(); mutex_lock(&reg_regdb_apply_mutex); while (!list_empty(&reg_regdb_apply_list)) { request = list_first_entry(&reg_regdb_apply_list, struct reg_regdb_apply_request, list); list_del(&request->list); set_regdom(request->regdom, REGD_SOURCE_INTERNAL_DB); kfree(request); } mutex_unlock(&reg_regdb_apply_mutex); rtnl_unlock(); } static DECLARE_WORK(reg_regdb_work, reg_regdb_apply); static int reg_schedule_apply(const struct ieee80211_regdomain *regdom) { struct reg_regdb_apply_request *request; request = kzalloc(sizeof(struct reg_regdb_apply_request), GFP_KERNEL); if (!request) { kfree(regdom); return -ENOMEM; } request->regdom = regdom; mutex_lock(&reg_regdb_apply_mutex); list_add_tail(&request->list, &reg_regdb_apply_list); mutex_unlock(&reg_regdb_apply_mutex); schedule_work(&reg_regdb_work); return 0; } #ifdef CONFIG_CFG80211_CRDA_SUPPORT /* Max number of consecutive attempts to communicate with CRDA */ #define REG_MAX_CRDA_TIMEOUTS 10 static u32 reg_crda_timeouts; static void crda_timeout_work(struct work_struct *work); static DECLARE_DELAYED_WORK(crda_timeout, crda_timeout_work); static void crda_timeout_work(struct work_struct *work) { pr_debug("Timeout while waiting for CRDA to reply, restoring regulatory settings\n"); rtnl_lock(); reg_crda_timeouts++; restore_regulatory_settings(true, false); rtnl_unlock(); } static void cancel_crda_timeout(void) { cancel_delayed_work(&crda_timeout); } static void cancel_crda_timeout_sync(void) { cancel_delayed_work_sync(&crda_timeout); } static void reset_crda_timeouts(void) { reg_crda_timeouts = 0; } /* * This lets us keep regulatory code which is updated on a regulatory * basis in userspace. */ static int call_crda(const char *alpha2) { char country[12]; char *env[] = { country, NULL }; int ret; snprintf(country, sizeof(country), "COUNTRY=%c%c", alpha2[0], alpha2[1]); if (reg_crda_timeouts > REG_MAX_CRDA_TIMEOUTS) { pr_debug("Exceeded CRDA call max attempts. Not calling CRDA\n"); return -EINVAL; } if (!is_world_regdom((char *) alpha2)) pr_debug("Calling CRDA for country: %c%c\n", alpha2[0], alpha2[1]); else pr_debug("Calling CRDA to update world regulatory domain\n"); ret = kobject_uevent_env(&reg_pdev->dev.kobj, KOBJ_CHANGE, env); if (ret) return ret; queue_delayed_work(system_power_efficient_wq, &crda_timeout, msecs_to_jiffies(3142)); return 0; } #else static inline void cancel_crda_timeout(void) {} static inline void cancel_crda_timeout_sync(void) {} static inline void reset_crda_timeouts(void) {} static inline int call_crda(const char *alpha2) { return -ENODATA; } #endif /* CONFIG_CFG80211_CRDA_SUPPORT */ /* code to directly load a firmware database through request_firmware */ static const struct fwdb_header *regdb; struct fwdb_country { u8 alpha2[2]; __be16 coll_ptr; /* this struct cannot be extended */ } __packed __aligned(4); struct fwdb_collection { u8 len; u8 n_rules; u8 dfs_region; /* no optional data yet */ /* aligned to 2, then followed by __be16 array of rule pointers */ } __packed __aligned(4); enum fwdb_flags { FWDB_FLAG_NO_OFDM = BIT(0), FWDB_FLAG_NO_OUTDOOR = BIT(1), FWDB_FLAG_DFS = BIT(2), FWDB_FLAG_NO_IR = BIT(3), FWDB_FLAG_AUTO_BW = BIT(4), }; struct fwdb_wmm_ac { u8 ecw; u8 aifsn; __be16 cot; } __packed; struct fwdb_wmm_rule { struct fwdb_wmm_ac client[IEEE80211_NUM_ACS]; struct fwdb_wmm_ac ap[IEEE80211_NUM_ACS]; } __packed; struct fwdb_rule { u8 len; u8 flags; __be16 max_eirp; __be32 start, end, max_bw; /* start of optional data */ __be16 cac_timeout; __be16 wmm_ptr; } __packed __aligned(4); #define FWDB_MAGIC 0x52474442 #define FWDB_VERSION 20 struct fwdb_header { __be32 magic; __be32 version; struct fwdb_country country[]; } __packed __aligned(4); static int ecw2cw(int ecw) { return (1 << ecw) - 1; } static bool valid_wmm(struct fwdb_wmm_rule *rule) { struct fwdb_wmm_ac *ac = (struct fwdb_wmm_ac *)rule; int i; for (i = 0; i < IEEE80211_NUM_ACS * 2; i++) { u16 cw_min = ecw2cw((ac[i].ecw & 0xf0) >> 4); u16 cw_max = ecw2cw(ac[i].ecw & 0x0f); u8 aifsn = ac[i].aifsn; if (cw_min >= cw_max) return false; if (aifsn < 1) return false; } return true; } static bool valid_rule(const u8 *data, unsigned int size, u16 rule_ptr) { struct fwdb_rule *rule = (void *)(data + (rule_ptr << 2)); if ((u8 *)rule + sizeof(rule->len) > data + size) return false; /* mandatory fields */ if (rule->len < offsetofend(struct fwdb_rule, max_bw)) return false; if (rule->len >= offsetofend(struct fwdb_rule, wmm_ptr)) { u32 wmm_ptr = be16_to_cpu(rule->wmm_ptr) << 2; struct fwdb_wmm_rule *wmm; if (wmm_ptr + sizeof(struct fwdb_wmm_rule) > size) return false; wmm = (void *)(data + wmm_ptr); if (!valid_wmm(wmm)) return false; } return true; } static bool valid_country(const u8 *data, unsigned int size, const struct fwdb_country *country) { unsigned int ptr = be16_to_cpu(country->coll_ptr) << 2; struct fwdb_collection *coll = (void *)(data + ptr); __be16 *rules_ptr; unsigned int i; /* make sure we can read len/n_rules */ if ((u8 *)coll + offsetofend(typeof(*coll), n_rules) > data + size) return false; /* make sure base struct and all rules fit */ if ((u8 *)coll + ALIGN(coll->len, 2) + (coll->n_rules * 2) > data + size) return false; /* mandatory fields must exist */ if (coll->len < offsetofend(struct fwdb_collection, dfs_region)) return false; rules_ptr = (void *)((u8 *)coll + ALIGN(coll->len, 2)); for (i = 0; i < coll->n_rules; i++) { u16 rule_ptr = be16_to_cpu(rules_ptr[i]); if (!valid_rule(data, size, rule_ptr)) return false; } return true; } #ifdef CONFIG_CFG80211_REQUIRE_SIGNED_REGDB #include <keys/asymmetric-type.h> static struct key *builtin_regdb_keys; static int __init load_builtin_regdb_keys(void) { builtin_regdb_keys = keyring_alloc(".builtin_regdb_keys", KUIDT_INIT(0), KGIDT_INIT(0), current_cred(), ((KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH), KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(builtin_regdb_keys)) return PTR_ERR(builtin_regdb_keys); pr_notice("Loading compiled-in X.509 certificates for regulatory database\n"); #ifdef CONFIG_CFG80211_USE_KERNEL_REGDB_KEYS x509_load_certificate_list(shipped_regdb_certs, shipped_regdb_certs_len, builtin_regdb_keys); #endif #ifdef CONFIG_CFG80211_EXTRA_REGDB_KEYDIR if (CONFIG_CFG80211_EXTRA_REGDB_KEYDIR[0] != '\0') x509_load_certificate_list(extra_regdb_certs, extra_regdb_certs_len, builtin_regdb_keys); #endif return 0; } MODULE_FIRMWARE("regulatory.db.p7s"); static bool regdb_has_valid_signature(const u8 *data, unsigned int size) { const struct firmware *sig; bool result; if (request_firmware(&sig, "regulatory.db.p7s", &reg_pdev->dev)) return false; result = verify_pkcs7_signature(data, size, sig->data, sig->size, builtin_regdb_keys, VERIFYING_UNSPECIFIED_SIGNATURE, NULL, NULL) == 0; release_firmware(sig); return result; } static void free_regdb_keyring(void) { key_put(builtin_regdb_keys); } #else static int load_builtin_regdb_keys(void) { return 0; } static bool regdb_has_valid_signature(const u8 *data, unsigned int size) { return true; } static void free_regdb_keyring(void) { } #endif /* CONFIG_CFG80211_REQUIRE_SIGNED_REGDB */ static bool valid_regdb(const u8 *data, unsigned int size) { const struct fwdb_header *hdr = (void *)data; const struct fwdb_country *country; if (size < sizeof(*hdr)) return false; if (hdr->magic != cpu_to_be32(FWDB_MAGIC)) return false; if (hdr->version != cpu_to_be32(FWDB_VERSION)) return false; if (!regdb_has_valid_signature(data, size)) return false; country = &hdr->country[0]; while ((u8 *)(country + 1) <= data + size) { if (!country->coll_ptr) break; if (!valid_country(data, size, country)) return false; country++; } return true; } static void set_wmm_rule(const struct fwdb_header *db, const struct fwdb_country *country, const struct fwdb_rule *rule, struct ieee80211_reg_rule *rrule) { struct ieee80211_wmm_rule *wmm_rule = &rrule->wmm_rule; struct fwdb_wmm_rule *wmm; unsigned int i, wmm_ptr; wmm_ptr = be16_to_cpu(rule->wmm_ptr) << 2; wmm = (void *)((u8 *)db + wmm_ptr); if (!valid_wmm(wmm)) { pr_err("Invalid regulatory WMM rule %u-%u in domain %c%c\n", be32_to_cpu(rule->start), be32_to_cpu(rule->end), country->alpha2[0], country->alpha2[1]); return; } for (i = 0; i < IEEE80211_NUM_ACS; i++) { wmm_rule->client[i].cw_min = ecw2cw((wmm->client[i].ecw & 0xf0) >> 4); wmm_rule->client[i].cw_max = ecw2cw(wmm->client[i].ecw & 0x0f); wmm_rule->client[i].aifsn = wmm->client[i].aifsn; wmm_rule->client[i].cot = 1000 * be16_to_cpu(wmm->client[i].cot); wmm_rule->ap[i].cw_min = ecw2cw((wmm->ap[i].ecw & 0xf0) >> 4); wmm_rule->ap[i].cw_max = ecw2cw(wmm->ap[i].ecw & 0x0f); wmm_rule->ap[i].aifsn = wmm->ap[i].aifsn; wmm_rule->ap[i].cot = 1000 * be16_to_cpu(wmm->ap[i].cot); } rrule->has_wmm = true; } static int __regdb_query_wmm(const struct fwdb_header *db, const struct fwdb_country *country, int freq, struct ieee80211_reg_rule *rrule) { unsigned int ptr = be16_to_cpu(country->coll_ptr) << 2; struct fwdb_collection *coll = (void *)((u8 *)db + ptr); int i; for (i = 0; i < coll->n_rules; i++) { __be16 *rules_ptr = (void *)((u8 *)coll + ALIGN(coll->len, 2)); unsigned int rule_ptr = be16_to_cpu(rules_ptr[i]) << 2; struct fwdb_rule *rule = (void *)((u8 *)db + rule_ptr); if (rule->len < offsetofend(struct fwdb_rule, wmm_ptr)) continue; if (freq >= KHZ_TO_MHZ(be32_to_cpu(rule->start)) && freq <= KHZ_TO_MHZ(be32_to_cpu(rule->end))) { set_wmm_rule(db, country, rule, rrule); return 0; } } return -ENODATA; } int reg_query_regdb_wmm(char *alpha2, int freq, struct ieee80211_reg_rule *rule) { const struct fwdb_header *hdr = regdb; const struct fwdb_country *country; if (!regdb) return -ENODATA; if (IS_ERR(regdb)) return PTR_ERR(regdb); country = &hdr->country[0]; while (country->coll_ptr) { if (alpha2_equal(alpha2, country->alpha2)) return __regdb_query_wmm(regdb, country, freq, rule); country++; } return -ENODATA; } EXPORT_SYMBOL(reg_query_regdb_wmm); static int regdb_query_country(const struct fwdb_header *db, const struct fwdb_country *country) { unsigned int ptr = be16_to_cpu(country->coll_ptr) << 2; struct fwdb_collection *coll = (void *)((u8 *)db + ptr); struct ieee80211_regdomain *regdom; unsigned int i; regdom = kzalloc(struct_size(regdom, reg_rules, coll->n_rules), GFP_KERNEL); if (!regdom) return -ENOMEM; regdom->n_reg_rules = coll->n_rules; regdom->alpha2[0] = country->alpha2[0]; regdom->alpha2[1] = country->alpha2[1]; regdom->dfs_region = coll->dfs_region; for (i = 0; i < regdom->n_reg_rules; i++) { __be16 *rules_ptr = (void *)((u8 *)coll + ALIGN(coll->len, 2)); unsigned int rule_ptr = be16_to_cpu(rules_ptr[i]) << 2; struct fwdb_rule *rule = (void *)((u8 *)db + rule_ptr); struct ieee80211_reg_rule *rrule = &regdom->reg_rules[i]; rrule->freq_range.start_freq_khz = be32_to_cpu(rule->start); rrule->freq_range.end_freq_khz = be32_to_cpu(rule->end); rrule->freq_range.max_bandwidth_khz = be32_to_cpu(rule->max_bw); rrule->power_rule.max_antenna_gain = 0; rrule->power_rule.max_eirp = be16_to_cpu(rule->max_eirp); rrule->flags = 0; if (rule->flags & FWDB_FLAG_NO_OFDM) rrule->flags |= NL80211_RRF_NO_OFDM; if (rule->flags & FWDB_FLAG_NO_OUTDOOR) rrule->flags |= NL80211_RRF_NO_OUTDOOR; if (rule->flags & FWDB_FLAG_DFS) rrule->flags |= NL80211_RRF_DFS; if (rule->flags & FWDB_FLAG_NO_IR) rrule->flags |= NL80211_RRF_NO_IR; if (rule->flags & FWDB_FLAG_AUTO_BW) rrule->flags |= NL80211_RRF_AUTO_BW; rrule->dfs_cac_ms = 0; /* handle optional data */ if (rule->len >= offsetofend(struct fwdb_rule, cac_timeout)) rrule->dfs_cac_ms = 1000 * be16_to_cpu(rule->cac_timeout); if (rule->len >= offsetofend(struct fwdb_rule, wmm_ptr)) set_wmm_rule(db, country, rule, rrule); } return reg_schedule_apply(regdom); } static int query_regdb(const char *alpha2) { const struct fwdb_header *hdr = regdb; const struct fwdb_country *country; ASSERT_RTNL(); if (IS_ERR(regdb)) return PTR_ERR(regdb); country = &hdr->country[0]; while (country->coll_ptr) { if (alpha2_equal(alpha2, country->alpha2)) return regdb_query_country(regdb, country); country++; } return -ENODATA; } static void regdb_fw_cb(const struct firmware *fw, void *context) { int set_error = 0; bool restore = true; void *db; if (!fw) { pr_info("failed to load regulatory.db\n"); set_error = -ENODATA; } else if (!valid_regdb(fw->data, fw->size)) { pr_info("loaded regulatory.db is malformed or signature is missing/invalid\n"); set_error = -EINVAL; } rtnl_lock(); if (regdb && !IS_ERR(regdb)) { /* negative case - a bug * positive case - can happen due to race in case of multiple cb's in * queue, due to usage of asynchronous callback * * Either case, just restore and free new db. */ } else if (set_error) { regdb = ERR_PTR(set_error); } else if (fw) { db = kmemdup(fw->data, fw->size, GFP_KERNEL); if (db) { regdb = db; restore = context && query_regdb(context); } else { restore = true; } } if (restore) restore_regulatory_settings(true, false); rtnl_unlock(); kfree(context); release_firmware(fw); } MODULE_FIRMWARE("regulatory.db"); static int query_regdb_file(const char *alpha2) { int err; ASSERT_RTNL(); if (regdb) return query_regdb(alpha2); alpha2 = kmemdup(alpha2, 2, GFP_KERNEL); if (!alpha2) return -ENOMEM; err = request_firmware_nowait(THIS_MODULE, true, "regulatory.db", &reg_pdev->dev, GFP_KERNEL, (void *)alpha2, regdb_fw_cb); if (err) kfree(alpha2); return err; } int reg_reload_regdb(void) { const struct firmware *fw; void *db; int err; const struct ieee80211_regdomain *current_regdomain; struct regulatory_request *request; err = request_firmware(&fw, "regulatory.db", &reg_pdev->dev); if (err) return err; if (!valid_regdb(fw->data, fw->size)) { err = -ENODATA; goto out; } db = kmemdup(fw->data, fw->size, GFP_KERNEL); if (!db) { err = -ENOMEM; goto out; } rtnl_lock(); if (!IS_ERR_OR_NULL(regdb)) kfree(regdb); regdb = db; /* reset regulatory domain */ current_regdomain = get_cfg80211_regdom(); request = kzalloc(sizeof(*request), GFP_KERNEL); if (!request) { err = -ENOMEM; goto out_unlock; } request->wiphy_idx = WIPHY_IDX_INVALID; request->alpha2[0] = current_regdomain->alpha2[0]; request->alpha2[1] = current_regdomain->alpha2[1]; request->initiator = NL80211_REGDOM_SET_BY_CORE; request->user_reg_hint_type = NL80211_USER_REG_HINT_USER; reg_process_hint(request); out_unlock: rtnl_unlock(); out: release_firmware(fw); return err; } static bool reg_query_database(struct regulatory_request *request) { if (query_regdb_file(request->alpha2) == 0) return true; if (call_crda(request->alpha2) == 0) return true; return false; } bool reg_is_valid_request(const char *alpha2) { struct regulatory_request *lr = get_last_request(); if (!lr || lr->processed) return false; return alpha2_equal(lr->alpha2, alpha2); } static const struct ieee80211_regdomain *reg_get_regdomain(struct wiphy *wiphy) { struct regulatory_request *lr = get_last_request(); /* * Follow the driver's regulatory domain, if present, unless a country * IE has been processed or a user wants to help complaince further */ if (lr->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE && lr->initiator != NL80211_REGDOM_SET_BY_USER && wiphy->regd) return get_wiphy_regdom(wiphy); return get_cfg80211_regdom(); } static unsigned int reg_get_max_bandwidth_from_range(const struct ieee80211_regdomain *rd, const struct ieee80211_reg_rule *rule) { const struct ieee80211_freq_range *freq_range = &rule->freq_range; const struct ieee80211_freq_range *freq_range_tmp; const struct ieee80211_reg_rule *tmp; u32 start_freq, end_freq, idx, no; for (idx = 0; idx < rd->n_reg_rules; idx++) if (rule == &rd->reg_rules[idx]) break; if (idx == rd->n_reg_rules) return 0; /* get start_freq */ no = idx; while (no) { tmp = &rd->reg_rules[--no]; freq_range_tmp = &tmp->freq_range; if (freq_range_tmp->end_freq_khz < freq_range->start_freq_khz) break; freq_range = freq_range_tmp; } start_freq = freq_range->start_freq_khz; /* get end_freq */ freq_range = &rule->freq_range; no = idx; while (no < rd->n_reg_rules - 1) { tmp = &rd->reg_rules[++no]; freq_range_tmp = &tmp->freq_range; if (freq_range_tmp->start_freq_khz > freq_range->end_freq_khz) break; freq_range = freq_range_tmp; } end_freq = freq_range->end_freq_khz; return end_freq - start_freq; } unsigned int reg_get_max_bandwidth(const struct ieee80211_regdomain *rd, const struct ieee80211_reg_rule *rule) { unsigned int bw = reg_get_max_bandwidth_from_range(rd, rule); if (rule->flags & NL80211_RRF_NO_320MHZ) bw = min_t(unsigned int, bw, MHZ_TO_KHZ(160)); if (rule->flags & NL80211_RRF_NO_160MHZ) bw = min_t(unsigned int, bw, MHZ_TO_KHZ(80)); if (rule->flags & NL80211_RRF_NO_80MHZ) bw = min_t(unsigned int, bw, MHZ_TO_KHZ(40)); /* * HT40+/HT40- limits are handled per-channel. Only limit BW if both * are not allowed. */ if (rule->flags & NL80211_RRF_NO_HT40MINUS && rule->flags & NL80211_RRF_NO_HT40PLUS) bw = min_t(unsigned int, bw, MHZ_TO_KHZ(20)); return bw; } /* Sanity check on a regulatory rule */ static bool is_valid_reg_rule(const struct ieee80211_reg_rule *rule) { const struct ieee80211_freq_range *freq_range = &rule->freq_range; u32 freq_diff; if (freq_range->start_freq_khz <= 0 || freq_range->end_freq_khz <= 0) return false; if (freq_range->start_freq_khz > freq_range->end_freq_khz) return false; freq_diff = freq_range->end_freq_khz - freq_range->start_freq_khz; if (freq_range->end_freq_khz <= freq_range->start_freq_khz || freq_range->max_bandwidth_khz > freq_diff) return false; return true; } static bool is_valid_rd(const struct ieee80211_regdomain *rd) { const struct ieee80211_reg_rule *reg_rule = NULL; unsigned int i; if (!rd->n_reg_rules) return false; if (WARN_ON(rd->n_reg_rules > NL80211_MAX_SUPP_REG_RULES)) return false; for (i = 0; i < rd->n_reg_rules; i++) { reg_rule = &rd->reg_rules[i]; if (!is_valid_reg_rule(reg_rule)) return false; } return true; } /** * freq_in_rule_band - tells us if a frequency is in a frequency band * @freq_range: frequency rule we want to query * @freq_khz: frequency we are inquiring about * * This lets us know if a specific frequency rule is or is not relevant to * a specific frequency's band. Bands are device specific and artificial * definitions (the "2.4 GHz band", the "5 GHz band" and the "60GHz band"), * however it is safe for now to assume that a frequency rule should not be * part of a frequency's band if the start freq or end freq are off by more * than 2 GHz for the 2.4 and 5 GHz bands, and by more than 20 GHz for the * 60 GHz band. * This resolution can be lowered and should be considered as we add * regulatory rule support for other "bands". **/ static bool freq_in_rule_band(const struct ieee80211_freq_range *freq_range, u32 freq_khz) { #define ONE_GHZ_IN_KHZ 1000000 /* * From 802.11ad: directional multi-gigabit (DMG): * Pertaining to operation in a frequency band containing a channel * with the Channel starting frequency above 45 GHz. */ u32 limit = freq_khz > 45 * ONE_GHZ_IN_KHZ ? 20 * ONE_GHZ_IN_KHZ : 2 * ONE_GHZ_IN_KHZ; if (abs(freq_khz - freq_range->start_freq_khz) <= limit) return true; if (abs(freq_khz - freq_range->end_freq_khz) <= limit) return true; return false; #undef ONE_GHZ_IN_KHZ } /* * Later on we can perhaps use the more restrictive DFS * region but we don't have information for that yet so * for now simply disallow conflicts. */ static enum nl80211_dfs_regions reg_intersect_dfs_region(const enum nl80211_dfs_regions dfs_region1, const enum nl80211_dfs_regions dfs_region2) { if (dfs_region1 != dfs_region2) return NL80211_DFS_UNSET; return dfs_region1; } static void reg_wmm_rules_intersect(const struct ieee80211_wmm_ac *wmm_ac1, const struct ieee80211_wmm_ac *wmm_ac2, struct ieee80211_wmm_ac *intersect) { intersect->cw_min = max_t(u16, wmm_ac1->cw_min, wmm_ac2->cw_min); intersect->cw_max = max_t(u16, wmm_ac1->cw_max, wmm_ac2->cw_max); intersect->cot = min_t(u16, wmm_ac1->cot, wmm_ac2->cot); intersect->aifsn = max_t(u8, wmm_ac1->aifsn, wmm_ac2->aifsn); } /* * Helper for regdom_intersect(), this does the real * mathematical intersection fun */ static int reg_rules_intersect(const struct ieee80211_regdomain *rd1, const struct ieee80211_regdomain *rd2, const struct ieee80211_reg_rule *rule1, const struct ieee80211_reg_rule *rule2, struct ieee80211_reg_rule *intersected_rule) { const struct ieee80211_freq_range *freq_range1, *freq_range2; struct ieee80211_freq_range *freq_range; const struct ieee80211_power_rule *power_rule1, *power_rule2; struct ieee80211_power_rule *power_rule; const struct ieee80211_wmm_rule *wmm_rule1, *wmm_rule2; struct ieee80211_wmm_rule *wmm_rule; u32 freq_diff, max_bandwidth1, max_bandwidth2; freq_range1 = &rule1->freq_range; freq_range2 = &rule2->freq_range; freq_range = &intersected_rule->freq_range; power_rule1 = &rule1->power_rule; power_rule2 = &rule2->power_rule; power_rule = &intersected_rule->power_rule; wmm_rule1 = &rule1->wmm_rule; wmm_rule2 = &rule2->wmm_rule; wmm_rule = &intersected_rule->wmm_rule; freq_range->start_freq_khz = max(freq_range1->start_freq_khz, freq_range2->start_freq_khz); freq_range->end_freq_khz = min(freq_range1->end_freq_khz, freq_range2->end_freq_khz); max_bandwidth1 = freq_range1->max_bandwidth_khz; max_bandwidth2 = freq_range2->max_bandwidth_khz; if (rule1->flags & NL80211_RRF_AUTO_BW) max_bandwidth1 = reg_get_max_bandwidth(rd1, rule1); if (rule2->flags & NL80211_RRF_AUTO_BW) max_bandwidth2 = reg_get_max_bandwidth(rd2, rule2); freq_range->max_bandwidth_khz = min(max_bandwidth1, max_bandwidth2); intersected_rule->flags = rule1->flags | rule2->flags; /* * In case NL80211_RRF_AUTO_BW requested for both rules * set AUTO_BW in intersected rule also. Next we will * calculate BW correctly in handle_channel function. * In other case remove AUTO_BW flag while we calculate * maximum bandwidth correctly and auto calculation is * not required. */ if ((rule1->flags & NL80211_RRF_AUTO_BW) && (rule2->flags & NL80211_RRF_AUTO_BW)) intersected_rule->flags |= NL80211_RRF_AUTO_BW; else intersected_rule->flags &= ~NL80211_RRF_AUTO_BW; freq_diff = freq_range->end_freq_khz - freq_range->start_freq_khz; if (freq_range->max_bandwidth_khz > freq_diff) freq_range->max_bandwidth_khz = freq_diff; power_rule->max_eirp = min(power_rule1->max_eirp, power_rule2->max_eirp); power_rule->max_antenna_gain = min(power_rule1->max_antenna_gain, power_rule2->max_antenna_gain); intersected_rule->dfs_cac_ms = max(rule1->dfs_cac_ms, rule2->dfs_cac_ms); if (rule1->has_wmm && rule2->has_wmm) { u8 ac; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { reg_wmm_rules_intersect(&wmm_rule1->client[ac], &wmm_rule2->client[ac], &wmm_rule->client[ac]); reg_wmm_rules_intersect(&wmm_rule1->ap[ac], &wmm_rule2->ap[ac], &wmm_rule->ap[ac]); } intersected_rule->has_wmm = true; } else if (rule1->has_wmm) { *wmm_rule = *wmm_rule1; intersected_rule->has_wmm = true; } else if (rule2->has_wmm) { *wmm_rule = *wmm_rule2; intersected_rule->has_wmm = true; } else { intersected_rule->has_wmm = false; } if (!is_valid_reg_rule(intersected_rule)) return -EINVAL; return 0; } /* check whether old rule contains new rule */ static bool rule_contains(struct ieee80211_reg_rule *r1, struct ieee80211_reg_rule *r2) { /* for simplicity, currently consider only same flags */ if (r1->flags != r2->flags) return false; /* verify r1 is more restrictive */ if ((r1->power_rule.max_antenna_gain > r2->power_rule.max_antenna_gain) || r1->power_rule.max_eirp > r2->power_rule.max_eirp) return false; /* make sure r2's range is contained within r1 */ if (r1->freq_range.start_freq_khz > r2->freq_range.start_freq_khz || r1->freq_range.end_freq_khz < r2->freq_range.end_freq_khz) return false; /* and finally verify that r1.max_bw >= r2.max_bw */ if (r1->freq_range.max_bandwidth_khz < r2->freq_range.max_bandwidth_khz) return false; return true; } /* add or extend current rules. do nothing if rule is already contained */ static void add_rule(struct ieee80211_reg_rule *rule, struct ieee80211_reg_rule *reg_rules, u32 *n_rules) { struct ieee80211_reg_rule *tmp_rule; int i; for (i = 0; i < *n_rules; i++) { tmp_rule = &reg_rules[i]; /* rule is already contained - do nothing */ if (rule_contains(tmp_rule, rule)) return; /* extend rule if possible */ if (rule_contains(rule, tmp_rule)) { memcpy(tmp_rule, rule, sizeof(*rule)); return; } } memcpy(&reg_rules[*n_rules], rule, sizeof(*rule)); (*n_rules)++; } /** * regdom_intersect - do the intersection between two regulatory domains * @rd1: first regulatory domain * @rd2: second regulatory domain * * Use this function to get the intersection between two regulatory domains. * Once completed we will mark the alpha2 for the rd as intersected, "98", * as no one single alpha2 can represent this regulatory domain. * * Returns a pointer to the regulatory domain structure which will hold the * resulting intersection of rules between rd1 and rd2. We will * kzalloc() this structure for you. */ static struct ieee80211_regdomain * regdom_intersect(const struct ieee80211_regdomain *rd1, const struct ieee80211_regdomain *rd2) { int r; unsigned int x, y; unsigned int num_rules = 0; const struct ieee80211_reg_rule *rule1, *rule2; struct ieee80211_reg_rule intersected_rule; struct ieee80211_regdomain *rd; if (!rd1 || !rd2) return NULL; /* * First we get a count of the rules we'll need, then we actually * build them. This is to so we can malloc() and free() a * regdomain once. The reason we use reg_rules_intersect() here * is it will return -EINVAL if the rule computed makes no sense. * All rules that do check out OK are valid. */ for (x = 0; x < rd1->n_reg_rules; x++) { rule1 = &rd1->reg_rules[x]; for (y = 0; y < rd2->n_reg_rules; y++) { rule2 = &rd2->reg_rules[y]; if (!reg_rules_intersect(rd1, rd2, rule1, rule2, &intersected_rule)) num_rules++; } } if (!num_rules) return NULL; rd = kzalloc(struct_size(rd, reg_rules, num_rules), GFP_KERNEL); if (!rd) return NULL; for (x = 0; x < rd1->n_reg_rules; x++) { rule1 = &rd1->reg_rules[x]; for (y = 0; y < rd2->n_reg_rules; y++) { rule2 = &rd2->reg_rules[y]; r = reg_rules_intersect(rd1, rd2, rule1, rule2, &intersected_rule); /* * No need to memset here the intersected rule here as * we're not using the stack anymore */ if (r) continue; add_rule(&intersected_rule, rd->reg_rules, &rd->n_reg_rules); } } rd->alpha2[0] = '9'; rd->alpha2[1] = '8'; rd->dfs_region = reg_intersect_dfs_region(rd1->dfs_region, rd2->dfs_region); return rd; } /* * XXX: add support for the rest of enum nl80211_reg_rule_flags, we may * want to just have the channel structure use these */ static u32 map_regdom_flags(u32 rd_flags) { u32 channel_flags = 0; if (rd_flags & NL80211_RRF_NO_IR_ALL) channel_flags |= IEEE80211_CHAN_NO_IR; if (rd_flags & NL80211_RRF_DFS) channel_flags |= IEEE80211_CHAN_RADAR; if (rd_flags & NL80211_RRF_NO_OFDM) channel_flags |= IEEE80211_CHAN_NO_OFDM; if (rd_flags & NL80211_RRF_NO_OUTDOOR) channel_flags |= IEEE80211_CHAN_INDOOR_ONLY; if (rd_flags & NL80211_RRF_IR_CONCURRENT) channel_flags |= IEEE80211_CHAN_IR_CONCURRENT; if (rd_flags & NL80211_RRF_NO_HT40MINUS) channel_flags |= IEEE80211_CHAN_NO_HT40MINUS; if (rd_flags & NL80211_RRF_NO_HT40PLUS) channel_flags |= IEEE80211_CHAN_NO_HT40PLUS; if (rd_flags & NL80211_RRF_NO_80MHZ) channel_flags |= IEEE80211_CHAN_NO_80MHZ; if (rd_flags & NL80211_RRF_NO_160MHZ) channel_flags |= IEEE80211_CHAN_NO_160MHZ; if (rd_flags & NL80211_RRF_NO_HE) channel_flags |= IEEE80211_CHAN_NO_HE; if (rd_flags & NL80211_RRF_NO_320MHZ) channel_flags |= IEEE80211_CHAN_NO_320MHZ; if (rd_flags & NL80211_RRF_NO_EHT) channel_flags |= IEEE80211_CHAN_NO_EHT; return channel_flags; } static const struct ieee80211_reg_rule * freq_reg_info_regd(u32 center_freq, const struct ieee80211_regdomain *regd, u32 bw) { int i; bool band_rule_found = false; bool bw_fits = false; if (!regd) return ERR_PTR(-EINVAL); for (i = 0; i < regd->n_reg_rules; i++) { const struct ieee80211_reg_rule *rr; const struct ieee80211_freq_range *fr = NULL; rr = &regd->reg_rules[i]; fr = &rr->freq_range; /* * We only need to know if one frequency rule was * in center_freq's band, that's enough, so let's * not overwrite it once found */ if (!band_rule_found) band_rule_found = freq_in_rule_band(fr, center_freq); bw_fits = cfg80211_does_bw_fit_range(fr, center_freq, bw); if (band_rule_found && bw_fits) return rr; } if (!band_rule_found) return ERR_PTR(-ERANGE); return ERR_PTR(-EINVAL); } static const struct ieee80211_reg_rule * __freq_reg_info(struct wiphy *wiphy, u32 center_freq, u32 min_bw) { const struct ieee80211_regdomain *regd = reg_get_regdomain(wiphy); static const u32 bws[] = {0, 1, 2, 4, 5, 8, 10, 16, 20}; const struct ieee80211_reg_rule *reg_rule = ERR_PTR(-ERANGE); int i = ARRAY_SIZE(bws) - 1; u32 bw; for (bw = MHZ_TO_KHZ(bws[i]); bw >= min_bw; bw = MHZ_TO_KHZ(bws[i--])) { reg_rule = freq_reg_info_regd(center_freq, regd, bw); if (!IS_ERR(reg_rule)) return reg_rule; } return reg_rule; } const struct ieee80211_reg_rule *freq_reg_info(struct wiphy *wiphy, u32 center_freq) { u32 min_bw = center_freq < MHZ_TO_KHZ(1000) ? 1 : 20; return __freq_reg_info(wiphy, center_freq, MHZ_TO_KHZ(min_bw)); } EXPORT_SYMBOL(freq_reg_info); const char *reg_initiator_name(enum nl80211_reg_initiator initiator) { switch (initiator) { case NL80211_REGDOM_SET_BY_CORE: return "core"; case NL80211_REGDOM_SET_BY_USER: return "user"; case NL80211_REGDOM_SET_BY_DRIVER: return "driver"; case NL80211_REGDOM_SET_BY_COUNTRY_IE: return "country element"; default: WARN_ON(1); return "bug"; } } EXPORT_SYMBOL(reg_initiator_name); static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd, const struct ieee80211_reg_rule *reg_rule, const struct ieee80211_channel *chan) { const struct ieee80211_freq_range *freq_range = NULL; u32 max_bandwidth_khz, center_freq_khz, bw_flags = 0; bool is_s1g = chan->band == NL80211_BAND_S1GHZ; freq_range = &reg_rule->freq_range; max_bandwidth_khz = freq_range->max_bandwidth_khz; center_freq_khz = ieee80211_channel_to_khz(chan); /* Check if auto calculation requested */ if (reg_rule->flags & NL80211_RRF_AUTO_BW) max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule); /* If we get a reg_rule we can assume that at least 5Mhz fit */ if (!cfg80211_does_bw_fit_range(freq_range, center_freq_khz, MHZ_TO_KHZ(10))) bw_flags |= IEEE80211_CHAN_NO_10MHZ; if (!cfg80211_does_bw_fit_range(freq_range, center_freq_khz, MHZ_TO_KHZ(20))) bw_flags |= IEEE80211_CHAN_NO_20MHZ; if (is_s1g) { /* S1G is strict about non overlapping channels. We can * calculate which bandwidth is allowed per channel by finding * the largest bandwidth which cleanly divides the freq_range. */ int edge_offset; int ch_bw = max_bandwidth_khz; while (ch_bw) { edge_offset = (center_freq_khz - ch_bw / 2) - freq_range->start_freq_khz; if (edge_offset % ch_bw == 0) { switch (KHZ_TO_MHZ(ch_bw)) { case 1: bw_flags |= IEEE80211_CHAN_1MHZ; break; case 2: bw_flags |= IEEE80211_CHAN_2MHZ; break; case 4: bw_flags |= IEEE80211_CHAN_4MHZ; break; case 8: bw_flags |= IEEE80211_CHAN_8MHZ; break; case 16: bw_flags |= IEEE80211_CHAN_16MHZ; break; default: /* If we got here, no bandwidths fit on * this frequency, ie. band edge. */ bw_flags |= IEEE80211_CHAN_DISABLED; break; } break; } ch_bw /= 2; } } else { if (max_bandwidth_khz < MHZ_TO_KHZ(10)) bw_flags |= IEEE80211_CHAN_NO_10MHZ; if (max_bandwidth_khz < MHZ_TO_KHZ(20)) bw_flags |= IEEE80211_CHAN_NO_20MHZ; if (max_bandwidth_khz < MHZ_TO_KHZ(40)) bw_flags |= IEEE80211_CHAN_NO_HT40; if (max_bandwidth_khz < MHZ_TO_KHZ(80)) bw_flags |= IEEE80211_CHAN_NO_80MHZ; if (max_bandwidth_khz < MHZ_TO_KHZ(160)) bw_flags |= IEEE80211_CHAN_NO_160MHZ; if (max_bandwidth_khz < MHZ_TO_KHZ(320)) bw_flags |= IEEE80211_CHAN_NO_320MHZ; } return bw_flags; } static void handle_channel_single_rule(struct wiphy *wiphy, enum nl80211_reg_initiator initiator, struct ieee80211_channel *chan, u32 flags, struct regulatory_request *lr, struct wiphy *request_wiphy, const struct ieee80211_reg_rule *reg_rule) { u32 bw_flags = 0; const struct ieee80211_power_rule *power_rule = NULL; const struct ieee80211_regdomain *regd; regd = reg_get_regdomain(wiphy); power_rule = &reg_rule->power_rule; bw_flags = reg_rule_to_chan_bw_flags(regd, reg_rule, chan); if (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER && request_wiphy && request_wiphy == wiphy && request_wiphy->regulatory_flags & REGULATORY_STRICT_REG) { /* * This guarantees the driver's requested regulatory domain * will always be used as a base for further regulatory * settings */ chan->flags = chan->orig_flags = map_regdom_flags(reg_rule->flags) | bw_flags; chan->max_antenna_gain = chan->orig_mag = (int) MBI_TO_DBI(power_rule->max_antenna_gain); chan->max_reg_power = chan->max_power = chan->orig_mpwr = (int) MBM_TO_DBM(power_rule->max_eirp); if (chan->flags & IEEE80211_CHAN_RADAR) { chan->dfs_cac_ms = IEEE80211_DFS_MIN_CAC_TIME_MS; if (reg_rule->dfs_cac_ms) chan->dfs_cac_ms = reg_rule->dfs_cac_ms; } return; } chan->dfs_state = NL80211_DFS_USABLE; chan->dfs_state_entered = jiffies; chan->beacon_found = false; chan->flags = flags | bw_flags | map_regdom_flags(reg_rule->flags); chan->max_antenna_gain = min_t(int, chan->orig_mag, MBI_TO_DBI(power_rule->max_antenna_gain)); chan->max_reg_power = (int) MBM_TO_DBM(power_rule->max_eirp); if (chan->flags & IEEE80211_CHAN_RADAR) { if (reg_rule->dfs_cac_ms) chan->dfs_cac_ms = reg_rule->dfs_cac_ms; else chan->dfs_cac_ms = IEEE80211_DFS_MIN_CAC_TIME_MS; } if (chan->orig_mpwr) { /* * Devices that use REGULATORY_COUNTRY_IE_FOLLOW_POWER * will always follow the passed country IE power settings. */ if (initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE && wiphy->regulatory_flags & REGULATORY_COUNTRY_IE_FOLLOW_POWER) chan->max_power = chan->max_reg_power; else chan->max_power = min(chan->orig_mpwr, chan->max_reg_power); } else chan->max_power = chan->max_reg_power; } static void handle_channel_adjacent_rules(struct wiphy *wiphy, enum nl80211_reg_initiator initiator, struct ieee80211_channel *chan, u32 flags, struct regulatory_request *lr, struct wiphy *request_wiphy, const struct ieee80211_reg_rule *rrule1, const struct ieee80211_reg_rule *rrule2, struct ieee80211_freq_range *comb_range) { u32 bw_flags1 = 0; u32 bw_flags2 = 0; const struct ieee80211_power_rule *power_rule1 = NULL; const struct ieee80211_power_rule *power_rule2 = NULL; const struct ieee80211_regdomain *regd; regd = reg_get_regdomain(wiphy); power_rule1 = &rrule1->power_rule; power_rule2 = &rrule2->power_rule; bw_flags1 = reg_rule_to_chan_bw_flags(regd, rrule1, chan); bw_flags2 = reg_rule_to_chan_bw_flags(regd, rrule2, chan); if (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER && request_wiphy && request_wiphy == wiphy && request_wiphy->regulatory_flags & REGULATORY_STRICT_REG) { /* This guarantees the driver's requested regulatory domain * will always be used as a base for further regulatory * settings */ chan->flags = map_regdom_flags(rrule1->flags) | map_regdom_flags(rrule2->flags) | bw_flags1 | bw_flags2; chan->orig_flags = chan->flags; chan->max_antenna_gain = min_t(int, MBI_TO_DBI(power_rule1->max_antenna_gain), MBI_TO_DBI(power_rule2->max_antenna_gain)); chan->orig_mag = chan->max_antenna_gain; chan->max_reg_power = min_t(int, MBM_TO_DBM(power_rule1->max_eirp), MBM_TO_DBM(power_rule2->max_eirp)); chan->max_power = chan->max_reg_power; chan->orig_mpwr = chan->max_reg_power; if (chan->flags & IEEE80211_CHAN_RADAR) { chan->dfs_cac_ms = IEEE80211_DFS_MIN_CAC_TIME_MS; if (rrule1->dfs_cac_ms || rrule2->dfs_cac_ms) chan->dfs_cac_ms = max_t(unsigned int, rrule1->dfs_cac_ms, rrule2->dfs_cac_ms); } return; } chan->dfs_state = NL80211_DFS_USABLE; chan->dfs_state_entered = jiffies; chan->beacon_found = false; chan->flags = flags | bw_flags1 | bw_flags2 | map_regdom_flags(rrule1->flags) | map_regdom_flags(rrule2->flags); /* reg_rule_to_chan_bw_flags may forbids 10 and forbids 20 MHz * (otherwise no adj. rule case), recheck therefore */ if (cfg80211_does_bw_fit_range(comb_range, ieee80211_channel_to_khz(chan), MHZ_TO_KHZ(10))) chan->flags &= ~IEEE80211_CHAN_NO_10MHZ; if (cfg80211_does_bw_fit_range(comb_range, ieee80211_channel_to_khz(chan), MHZ_TO_KHZ(20))) chan->flags &= ~IEEE80211_CHAN_NO_20MHZ; chan->max_antenna_gain = min_t(int, chan->orig_mag, min_t(int, MBI_TO_DBI(power_rule1->max_antenna_gain), MBI_TO_DBI(power_rule2->max_antenna_gain))); chan->max_reg_power = min_t(int, MBM_TO_DBM(power_rule1->max_eirp), MBM_TO_DBM(power_rule2->max_eirp)); if (chan->flags & IEEE80211_CHAN_RADAR) { if (rrule1->dfs_cac_ms || rrule2->dfs_cac_ms) chan->dfs_cac_ms = max_t(unsigned int, rrule1->dfs_cac_ms, rrule2->dfs_cac_ms); else chan->dfs_cac_ms = IEEE80211_DFS_MIN_CAC_TIME_MS; } if (chan->orig_mpwr) { /* Devices that use REGULATORY_COUNTRY_IE_FOLLOW_POWER * will always follow the passed country IE power settings. */ if (initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE && wiphy->regulatory_flags & REGULATORY_COUNTRY_IE_FOLLOW_POWER) chan->max_power = chan->max_reg_power; else chan->max_power = min(chan->orig_mpwr, chan->max_reg_power); } else { chan->max_power = chan->max_reg_power; } } /* Note that right now we assume the desired channel bandwidth * is always 20 MHz for each individual channel (HT40 uses 20 MHz * per channel, the primary and the extension channel). */ static void handle_channel(struct wiphy *wiphy, enum nl80211_reg_initiator initiator, struct ieee80211_channel *chan) { const u32 orig_chan_freq = ieee80211_channel_to_khz(chan); struct regulatory_request *lr = get_last_request(); struct wiphy *request_wiphy = wiphy_idx_to_wiphy(lr->wiphy_idx); const struct ieee80211_reg_rule *rrule = NULL; const struct ieee80211_reg_rule *rrule1 = NULL; const struct ieee80211_reg_rule *rrule2 = NULL; u32 flags = chan->orig_flags; rrule = freq_reg_info(wiphy, orig_chan_freq); if (IS_ERR(rrule)) { /* check for adjacent match, therefore get rules for * chan - 20 MHz and chan + 20 MHz and test * if reg rules are adjacent */ rrule1 = freq_reg_info(wiphy, orig_chan_freq - MHZ_TO_KHZ(20)); rrule2 = freq_reg_info(wiphy, orig_chan_freq + MHZ_TO_KHZ(20)); if (!IS_ERR(rrule1) && !IS_ERR(rrule2)) { struct ieee80211_freq_range comb_range; if (rrule1->freq_range.end_freq_khz != rrule2->freq_range.start_freq_khz) goto disable_chan; comb_range.start_freq_khz = rrule1->freq_range.start_freq_khz; comb_range.end_freq_khz = rrule2->freq_range.end_freq_khz; comb_range.max_bandwidth_khz = min_t(u32, rrule1->freq_range.max_bandwidth_khz, rrule2->freq_range.max_bandwidth_khz); if (!cfg80211_does_bw_fit_range(&comb_range, orig_chan_freq, MHZ_TO_KHZ(20))) goto disable_chan; handle_channel_adjacent_rules(wiphy, initiator, chan, flags, lr, request_wiphy, rrule1, rrule2, &comb_range); return; } disable_chan: /* We will disable all channels that do not match our * received regulatory rule unless the hint is coming * from a Country IE and the Country IE had no information * about a band. The IEEE 802.11 spec allows for an AP * to send only a subset of the regulatory rules allowed, * so an AP in the US that only supports 2.4 GHz may only send * a country IE with information for the 2.4 GHz band * while 5 GHz is still supported. */ if (initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE && PTR_ERR(rrule) == -ERANGE) return; if (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER && request_wiphy && request_wiphy == wiphy && request_wiphy->regulatory_flags & REGULATORY_STRICT_REG) { pr_debug("Disabling freq %d.%03d MHz for good\n", chan->center_freq, chan->freq_offset); chan->orig_flags |= IEEE80211_CHAN_DISABLED; chan->flags = chan->orig_flags; } else { pr_debug("Disabling freq %d.%03d MHz\n", chan->center_freq, chan->freq_offset); chan->flags |= IEEE80211_CHAN_DISABLED; } return; } handle_channel_single_rule(wiphy, initiator, chan, flags, lr, request_wiphy, rrule); } static void handle_band(struct wiphy *wiphy, enum nl80211_reg_initiator initiator, struct ieee80211_supported_band *sband) { unsigned int i; if (!sband) return; for (i = 0; i < sband->n_channels; i++) handle_channel(wiphy, initiator, &sband->channels[i]); } static bool reg_request_cell_base(struct regulatory_request *request) { if (request->initiator != NL80211_REGDOM_SET_BY_USER) return false; return request->user_reg_hint_type == NL80211_USER_REG_HINT_CELL_BASE; } bool reg_last_request_cell_base(void) { return reg_request_cell_base(get_last_request()); } #ifdef CONFIG_CFG80211_REG_CELLULAR_HINTS /* Core specific check */ static enum reg_request_treatment reg_ignore_cell_hint(struct regulatory_request *pending_request) { struct regulatory_request *lr = get_last_request(); if (!reg_num_devs_support_basehint) return REG_REQ_IGNORE; if (reg_request_cell_base(lr) && !regdom_changes(pending_request->alpha2)) return REG_REQ_ALREADY_SET; return REG_REQ_OK; } /* Device specific check */ static bool reg_dev_ignore_cell_hint(struct wiphy *wiphy) { return !(wiphy->features & NL80211_FEATURE_CELL_BASE_REG_HINTS); } #else static enum reg_request_treatment reg_ignore_cell_hint(struct regulatory_request *pending_request) { return REG_REQ_IGNORE; } static bool reg_dev_ignore_cell_hint(struct wiphy *wiphy) { return true; } #endif static bool wiphy_strict_alpha2_regd(struct wiphy *wiphy) { if (wiphy->regulatory_flags & REGULATORY_STRICT_REG && !(wiphy->regulatory_flags & REGULATORY_CUSTOM_REG)) return true; return false; } static bool ignore_reg_update(struct wiphy *wiphy, enum nl80211_reg_initiator initiator) { struct regulatory_request *lr = get_last_request(); if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) return true; if (!lr) { pr_debug("Ignoring regulatory request set by %s since last_request is not set\n", reg_initiator_name(initiator)); return true; } if (initiator == NL80211_REGDOM_SET_BY_CORE && wiphy->regulatory_flags & REGULATORY_CUSTOM_REG) { pr_debug("Ignoring regulatory request set by %s since the driver uses its own custom regulatory domain\n", reg_initiator_name(initiator)); return true; } /* * wiphy->regd will be set once the device has its own * desired regulatory domain set */ if (wiphy_strict_alpha2_regd(wiphy) && !wiphy->regd && initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE && !is_world_regdom(lr->alpha2)) { pr_debug("Ignoring regulatory request set by %s since the driver requires its own regulatory domain to be set first\n", reg_initiator_name(initiator)); return true; } if (reg_request_cell_base(lr)) return reg_dev_ignore_cell_hint(wiphy); return false; } static bool reg_is_world_roaming(struct wiphy *wiphy) { const struct ieee80211_regdomain *cr = get_cfg80211_regdom(); const struct ieee80211_regdomain *wr = get_wiphy_regdom(wiphy); struct regulatory_request *lr = get_last_request(); if (is_world_regdom(cr->alpha2) || (wr && is_world_regdom(wr->alpha2))) return true; if (lr && lr->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE && wiphy->regulatory_flags & REGULATORY_CUSTOM_REG) return true; return false; } static void handle_reg_beacon(struct wiphy *wiphy, unsigned int chan_idx, struct reg_beacon *reg_beacon) { struct ieee80211_supported_band *sband; struct ieee80211_channel *chan; bool channel_changed = false; struct ieee80211_channel chan_before; sband = wiphy->bands[reg_beacon->chan.band]; chan = &sband->channels[chan_idx]; if (likely(!ieee80211_channel_equal(chan, &reg_beacon->chan))) return; if (chan->beacon_found) return; chan->beacon_found = true; if (!reg_is_world_roaming(wiphy)) return; if (wiphy->regulatory_flags & REGULATORY_DISABLE_BEACON_HINTS) return; chan_before = *chan; if (chan->flags & IEEE80211_CHAN_NO_IR) { chan->flags &= ~IEEE80211_CHAN_NO_IR; channel_changed = true; } if (channel_changed) nl80211_send_beacon_hint_event(wiphy, &chan_before, chan); } /* * Called when a scan on a wiphy finds a beacon on * new channel */ static void wiphy_update_new_beacon(struct wiphy *wiphy, struct reg_beacon *reg_beacon) { unsigned int i; struct ieee80211_supported_band *sband; if (!wiphy->bands[reg_beacon->chan.band]) return; sband = wiphy->bands[reg_beacon->chan.band]; for (i = 0; i < sband->n_channels; i++) handle_reg_beacon(wiphy, i, reg_beacon); } /* * Called upon reg changes or a new wiphy is added */ static void wiphy_update_beacon_reg(struct wiphy *wiphy) { unsigned int i; struct ieee80211_supported_band *sband; struct reg_beacon *reg_beacon; list_for_each_entry(reg_beacon, &reg_beacon_list, list) { if (!wiphy->bands[reg_beacon->chan.band]) continue; sband = wiphy->bands[reg_beacon->chan.band]; for (i = 0; i < sband->n_channels; i++) handle_reg_beacon(wiphy, i, reg_beacon); } } /* Reap the advantages of previously found beacons */ static void reg_process_beacons(struct wiphy *wiphy) { /* * Means we are just firing up cfg80211, so no beacons would * have been processed yet. */ if (!last_request) return; wiphy_update_beacon_reg(wiphy); } static bool is_ht40_allowed(struct ieee80211_channel *chan) { if (!chan) return false; if (chan->flags & IEEE80211_CHAN_DISABLED) return false; /* This would happen when regulatory rules disallow HT40 completely */ if ((chan->flags & IEEE80211_CHAN_NO_HT40) == IEEE80211_CHAN_NO_HT40) return false; return true; } static void reg_process_ht_flags_channel(struct wiphy *wiphy, struct ieee80211_channel *channel) { struct ieee80211_supported_band *sband = wiphy->bands[channel->band]; struct ieee80211_channel *channel_before = NULL, *channel_after = NULL; const struct ieee80211_regdomain *regd; unsigned int i; u32 flags; if (!is_ht40_allowed(channel)) { channel->flags |= IEEE80211_CHAN_NO_HT40; return; } /* * We need to ensure the extension channels exist to * be able to use HT40- or HT40+, this finds them (or not) */ for (i = 0; i < sband->n_channels; i++) { struct ieee80211_channel *c = &sband->channels[i]; if (c->center_freq == (channel->center_freq - 20)) channel_before = c; if (c->center_freq == (channel->center_freq + 20)) channel_after = c; } flags = 0; regd = get_wiphy_regdom(wiphy); if (regd) { const struct ieee80211_reg_rule *reg_rule = freq_reg_info_regd(MHZ_TO_KHZ(channel->center_freq), regd, MHZ_TO_KHZ(20)); if (!IS_ERR(reg_rule)) flags = reg_rule->flags; } /* * Please note that this assumes target bandwidth is 20 MHz, * if that ever changes we also need to change the below logic * to include that as well. */ if (!is_ht40_allowed(channel_before) || flags & NL80211_RRF_NO_HT40MINUS) channel->flags |= IEEE80211_CHAN_NO_HT40MINUS; else channel->flags &= ~IEEE80211_CHAN_NO_HT40MINUS; if (!is_ht40_allowed(channel_after) || flags & NL80211_RRF_NO_HT40PLUS) channel->flags |= IEEE80211_CHAN_NO_HT40PLUS; else channel->flags &= ~IEEE80211_CHAN_NO_HT40PLUS; } static void reg_process_ht_flags_band(struct wiphy *wiphy, struct ieee80211_supported_band *sband) { unsigned int i; if (!sband) return; for (i = 0; i < sband->n_channels; i++) reg_process_ht_flags_channel(wiphy, &sband->channels[i]); } static void reg_process_ht_flags(struct wiphy *wiphy) { enum nl80211_band band; if (!wiphy) return; for (band = 0; band < NUM_NL80211_BANDS; band++) reg_process_ht_flags_band(wiphy, wiphy->bands[band]); } static void reg_call_notifier(struct wiphy *wiphy, struct regulatory_request *request) { if (wiphy->reg_notifier) wiphy->reg_notifier(wiphy, request); } static bool reg_wdev_chan_valid(struct wiphy *wiphy, struct wireless_dev *wdev) { struct cfg80211_chan_def chandef = {}; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); enum nl80211_iftype iftype; bool ret; int link; wdev_lock(wdev); iftype = wdev->iftype; /* make sure the interface is active */ if (!wdev->netdev || !netif_running(wdev->netdev)) goto wdev_inactive_unlock; for (link = 0; link < ARRAY_SIZE(wdev->links); link++) { struct ieee80211_channel *chan; if (!wdev->valid_links && link > 0) break; if (wdev->valid_links && !(wdev->valid_links & BIT(link))) continue; switch (iftype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: if (!wdev->links[link].ap.beacon_interval) continue; chandef = wdev->links[link].ap.chandef; break; case NL80211_IFTYPE_MESH_POINT: if (!wdev->u.mesh.beacon_interval) continue; chandef = wdev->u.mesh.chandef; break; case NL80211_IFTYPE_ADHOC: if (!wdev->u.ibss.ssid_len) continue; chandef = wdev->u.ibss.chandef; break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: /* Maybe we could consider disabling that link only? */ if (!wdev->links[link].client.current_bss) continue; chan = wdev->links[link].client.current_bss->pub.channel; if (!chan) continue; if (!rdev->ops->get_channel || rdev_get_channel(rdev, wdev, link, &chandef)) cfg80211_chandef_create(&chandef, chan, NL80211_CHAN_NO_HT); break; case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_P2P_DEVICE: /* no enforcement required */ break; case NL80211_IFTYPE_OCB: if (!wdev->u.ocb.chandef.chan) continue; chandef = wdev->u.ocb.chandef; break; case NL80211_IFTYPE_NAN: /* we have no info, but NAN is also pretty universal */ continue; default: /* others not implemented for now */ WARN_ON_ONCE(1); break; } wdev_unlock(wdev); switch (iftype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_ADHOC: case NL80211_IFTYPE_MESH_POINT: ret = cfg80211_reg_can_beacon_relax(wiphy, &chandef, iftype); if (!ret) return ret; break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: ret = cfg80211_chandef_usable(wiphy, &chandef, IEEE80211_CHAN_DISABLED); if (!ret) return ret; break; default: break; } wdev_lock(wdev); } wdev_unlock(wdev); return true; wdev_inactive_unlock: wdev_unlock(wdev); return true; } static void reg_leave_invalid_chans(struct wiphy *wiphy) { struct wireless_dev *wdev; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); wiphy_lock(wiphy); list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) if (!reg_wdev_chan_valid(wiphy, wdev)) cfg80211_leave(rdev, wdev); wiphy_unlock(wiphy); } static void reg_check_chans_work(struct work_struct *work) { struct cfg80211_registered_device *rdev; pr_debug("Verifying active interfaces after reg change\n"); rtnl_lock(); list_for_each_entry(rdev, &cfg80211_rdev_list, list) reg_leave_invalid_chans(&rdev->wiphy); rtnl_unlock(); } static void reg_check_channels(void) { /* * Give usermode a chance to do something nicer (move to another * channel, orderly disconnection), before forcing a disconnection. */ mod_delayed_work(system_power_efficient_wq, &reg_check_chans, msecs_to_jiffies(REG_ENFORCE_GRACE_MS)); } static void wiphy_update_regulatory(struct wiphy *wiphy, enum nl80211_reg_initiator initiator) { enum nl80211_band band; struct regulatory_request *lr = get_last_request(); if (ignore_reg_update(wiphy, initiator)) { /* * Regulatory updates set by CORE are ignored for custom * regulatory cards. Let us notify the changes to the driver, * as some drivers used this to restore its orig_* reg domain. */ if (initiator == NL80211_REGDOM_SET_BY_CORE && wiphy->regulatory_flags & REGULATORY_CUSTOM_REG && !(wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED)) reg_call_notifier(wiphy, lr); return; } lr->dfs_region = get_cfg80211_regdom()->dfs_region; for (band = 0; band < NUM_NL80211_BANDS; band++) handle_band(wiphy, initiator, wiphy->bands[band]); reg_process_beacons(wiphy); reg_process_ht_flags(wiphy); reg_call_notifier(wiphy, lr); } static void update_all_wiphy_regulatory(enum nl80211_reg_initiator initiator) { struct cfg80211_registered_device *rdev; struct wiphy *wiphy; ASSERT_RTNL(); list_for_each_entry(rdev, &cfg80211_rdev_list, list) { wiphy = &rdev->wiphy; wiphy_update_regulatory(wiphy, initiator); } reg_check_channels(); } static void handle_channel_custom(struct wiphy *wiphy, struct ieee80211_channel *chan, const struct ieee80211_regdomain *regd, u32 min_bw) { u32 bw_flags = 0; const struct ieee80211_reg_rule *reg_rule = NULL; const struct ieee80211_power_rule *power_rule = NULL; u32 bw, center_freq_khz; center_freq_khz = ieee80211_channel_to_khz(chan); for (bw = MHZ_TO_KHZ(20); bw >= min_bw; bw = bw / 2) { reg_rule = freq_reg_info_regd(center_freq_khz, regd, bw); if (!IS_ERR(reg_rule)) break; } if (IS_ERR_OR_NULL(reg_rule)) { pr_debug("Disabling freq %d.%03d MHz as custom regd has no rule that fits it\n", chan->center_freq, chan->freq_offset); if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) { chan->flags |= IEEE80211_CHAN_DISABLED; } else { chan->orig_flags |= IEEE80211_CHAN_DISABLED; chan->flags = chan->orig_flags; } return; } power_rule = &reg_rule->power_rule; bw_flags = reg_rule_to_chan_bw_flags(regd, reg_rule, chan); chan->dfs_state_entered = jiffies; chan->dfs_state = NL80211_DFS_USABLE; chan->beacon_found = false; if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) chan->flags = chan->orig_flags | bw_flags | map_regdom_flags(reg_rule->flags); else chan->flags |= map_regdom_flags(reg_rule->flags) | bw_flags; chan->max_antenna_gain = (int) MBI_TO_DBI(power_rule->max_antenna_gain); chan->max_reg_power = chan->max_power = (int) MBM_TO_DBM(power_rule->max_eirp); if (chan->flags & IEEE80211_CHAN_RADAR) { if (reg_rule->dfs_cac_ms) chan->dfs_cac_ms = reg_rule->dfs_cac_ms; else chan->dfs_cac_ms = IEEE80211_DFS_MIN_CAC_TIME_MS; } chan->max_power = chan->max_reg_power; } static void handle_band_custom(struct wiphy *wiphy, struct ieee80211_supported_band *sband, const struct ieee80211_regdomain *regd) { unsigned int i; if (!sband) return; /* * We currently assume that you always want at least 20 MHz, * otherwise channel 12 might get enabled if this rule is * compatible to US, which permits 2402 - 2472 MHz. */ for (i = 0; i < sband->n_channels; i++) handle_channel_custom(wiphy, &sband->channels[i], regd, MHZ_TO_KHZ(20)); } /* Used by drivers prior to wiphy registration */ void wiphy_apply_custom_regulatory(struct wiphy *wiphy, const struct ieee80211_regdomain *regd) { const struct ieee80211_regdomain *new_regd, *tmp; enum nl80211_band band; unsigned int bands_set = 0; WARN(!(wiphy->regulatory_flags & REGULATORY_CUSTOM_REG), "wiphy should have REGULATORY_CUSTOM_REG\n"); wiphy->regulatory_flags |= REGULATORY_CUSTOM_REG; for (band = 0; band < NUM_NL80211_BANDS; band++) { if (!wiphy->bands[band]) continue; handle_band_custom(wiphy, wiphy->bands[band], regd); bands_set++; } /* * no point in calling this if it won't have any effect * on your device's supported bands. */ WARN_ON(!bands_set); new_regd = reg_copy_regd(regd); if (IS_ERR(new_regd)) return; rtnl_lock(); wiphy_lock(wiphy); tmp = get_wiphy_regdom(wiphy); rcu_assign_pointer(wiphy->regd, new_regd); rcu_free_regdom(tmp); wiphy_unlock(wiphy); rtnl_unlock(); } EXPORT_SYMBOL(wiphy_apply_custom_regulatory); static void reg_set_request_processed(void) { bool need_more_processing = false; struct regulatory_request *lr = get_last_request(); lr->processed = true; spin_lock(&reg_requests_lock); if (!list_empty(&reg_requests_list)) need_more_processing = true; spin_unlock(&reg_requests_lock); cancel_crda_timeout(); if (need_more_processing) schedule_work(&reg_work); } /** * reg_process_hint_core - process core regulatory requests * @core_request: a pending core regulatory request * * The wireless subsystem can use this function to process * a regulatory request issued by the regulatory core. */ static enum reg_request_treatment reg_process_hint_core(struct regulatory_request *core_request) { if (reg_query_database(core_request)) { core_request->intersect = false; core_request->processed = false; reg_update_last_request(core_request); return REG_REQ_OK; } return REG_REQ_IGNORE; } static enum reg_request_treatment __reg_process_hint_user(struct regulatory_request *user_request) { struct regulatory_request *lr = get_last_request(); if (reg_request_cell_base(user_request)) return reg_ignore_cell_hint(user_request); if (reg_request_cell_base(lr)) return REG_REQ_IGNORE; if (lr->initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE) return REG_REQ_INTERSECT; /* * If the user knows better the user should set the regdom * to their country before the IE is picked up */ if (lr->initiator == NL80211_REGDOM_SET_BY_USER && lr->intersect) return REG_REQ_IGNORE; /* * Process user requests only after previous user/driver/core * requests have been processed */ if ((lr->initiator == NL80211_REGDOM_SET_BY_CORE || lr->initiator == NL80211_REGDOM_SET_BY_DRIVER || lr->initiator == NL80211_REGDOM_SET_BY_USER) && regdom_changes(lr->alpha2)) return REG_REQ_IGNORE; if (!regdom_changes(user_request->alpha2)) return REG_REQ_ALREADY_SET; return REG_REQ_OK; } /** * reg_process_hint_user - process user regulatory requests * @user_request: a pending user regulatory request * * The wireless subsystem can use this function to process * a regulatory request initiated by userspace. */ static enum reg_request_treatment reg_process_hint_user(struct regulatory_request *user_request) { enum reg_request_treatment treatment; treatment = __reg_process_hint_user(user_request); if (treatment == REG_REQ_IGNORE || treatment == REG_REQ_ALREADY_SET) return REG_REQ_IGNORE; user_request->intersect = treatment == REG_REQ_INTERSECT; user_request->processed = false; if (reg_query_database(user_request)) { reg_update_last_request(user_request); user_alpha2[0] = user_request->alpha2[0]; user_alpha2[1] = user_request->alpha2[1]; return REG_REQ_OK; } return REG_REQ_IGNORE; } static enum reg_request_treatment __reg_process_hint_driver(struct regulatory_request *driver_request) { struct regulatory_request *lr = get_last_request(); if (lr->initiator == NL80211_REGDOM_SET_BY_CORE) { if (regdom_changes(driver_request->alpha2)) return REG_REQ_OK; return REG_REQ_ALREADY_SET; } /* * This would happen if you unplug and plug your card * back in or if you add a new device for which the previously * loaded card also agrees on the regulatory domain. */ if (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER && !regdom_changes(driver_request->alpha2)) return REG_REQ_ALREADY_SET; return REG_REQ_INTERSECT; } /** * reg_process_hint_driver - process driver regulatory requests * @wiphy: the wireless device for the regulatory request * @driver_request: a pending driver regulatory request * * The wireless subsystem can use this function to process * a regulatory request issued by an 802.11 driver. * * Returns one of the different reg request treatment values. */ static enum reg_request_treatment reg_process_hint_driver(struct wiphy *wiphy, struct regulatory_request *driver_request) { const struct ieee80211_regdomain *regd, *tmp; enum reg_request_treatment treatment; treatment = __reg_process_hint_driver(driver_request); switch (treatment) { case REG_REQ_OK: break; case REG_REQ_IGNORE: return REG_REQ_IGNORE; case REG_REQ_INTERSECT: case REG_REQ_ALREADY_SET: regd = reg_copy_regd(get_cfg80211_regdom()); if (IS_ERR(regd)) return REG_REQ_IGNORE; tmp = get_wiphy_regdom(wiphy); ASSERT_RTNL(); wiphy_lock(wiphy); rcu_assign_pointer(wiphy->regd, regd); wiphy_unlock(wiphy); rcu_free_regdom(tmp); } driver_request->intersect = treatment == REG_REQ_INTERSECT; driver_request->processed = false; /* * Since CRDA will not be called in this case as we already * have applied the requested regulatory domain before we just * inform userspace we have processed the request */ if (treatment == REG_REQ_ALREADY_SET) { nl80211_send_reg_change_event(driver_request); reg_update_last_request(driver_request); reg_set_request_processed(); return REG_REQ_ALREADY_SET; } if (reg_query_database(driver_request)) { reg_update_last_request(driver_request); return REG_REQ_OK; } return REG_REQ_IGNORE; } static enum reg_request_treatment __reg_process_hint_country_ie(struct wiphy *wiphy, struct regulatory_request *country_ie_request) { struct wiphy *last_wiphy = NULL; struct regulatory_request *lr = get_last_request(); if (reg_request_cell_base(lr)) { /* Trust a Cell base station over the AP's country IE */ if (regdom_changes(country_ie_request->alpha2)) return REG_REQ_IGNORE; return REG_REQ_ALREADY_SET; } else { if (wiphy->regulatory_flags & REGULATORY_COUNTRY_IE_IGNORE) return REG_REQ_IGNORE; } if (unlikely(!is_an_alpha2(country_ie_request->alpha2))) return -EINVAL; if (lr->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE) return REG_REQ_OK; last_wiphy = wiphy_idx_to_wiphy(lr->wiphy_idx); if (last_wiphy != wiphy) { /* * Two cards with two APs claiming different * Country IE alpha2s. We could * intersect them, but that seems unlikely * to be correct. Reject second one for now. */ if (regdom_changes(country_ie_request->alpha2)) return REG_REQ_IGNORE; return REG_REQ_ALREADY_SET; } if (regdom_changes(country_ie_request->alpha2)) return REG_REQ_OK; return REG_REQ_ALREADY_SET; } /** * reg_process_hint_country_ie - process regulatory requests from country IEs * @wiphy: the wireless device for the regulatory request * @country_ie_request: a regulatory request from a country IE * * The wireless subsystem can use this function to process * a regulatory request issued by a country Information Element. * * Returns one of the different reg request treatment values. */ static enum reg_request_treatment reg_process_hint_country_ie(struct wiphy *wiphy, struct regulatory_request *country_ie_request) { enum reg_request_treatment treatment; treatment = __reg_process_hint_country_ie(wiphy, country_ie_request); switch (treatment) { case REG_REQ_OK: break; case REG_REQ_IGNORE: return REG_REQ_IGNORE; case REG_REQ_ALREADY_SET: reg_free_request(country_ie_request); return REG_REQ_ALREADY_SET; case REG_REQ_INTERSECT: /* * This doesn't happen yet, not sure we * ever want to support it for this case. */ WARN_ONCE(1, "Unexpected intersection for country elements"); return REG_REQ_IGNORE; } country_ie_request->intersect = false; country_ie_request->processed = false; if (reg_query_database(country_ie_request)) { reg_update_last_request(country_ie_request); return REG_REQ_OK; } return REG_REQ_IGNORE; } bool reg_dfs_domain_same(struct wiphy *wiphy1, struct wiphy *wiphy2) { const struct ieee80211_regdomain *wiphy1_regd = NULL; const struct ieee80211_regdomain *wiphy2_regd = NULL; const struct ieee80211_regdomain *cfg80211_regd = NULL; bool dfs_domain_same; rcu_read_lock(); cfg80211_regd = rcu_dereference(cfg80211_regdomain); wiphy1_regd = rcu_dereference(wiphy1->regd); if (!wiphy1_regd) wiphy1_regd = cfg80211_regd; wiphy2_regd = rcu_dereference(wiphy2->regd); if (!wiphy2_regd) wiphy2_regd = cfg80211_regd; dfs_domain_same = wiphy1_regd->dfs_region == wiphy2_regd->dfs_region; rcu_read_unlock(); return dfs_domain_same; } static void reg_copy_dfs_chan_state(struct ieee80211_channel *dst_chan, struct ieee80211_channel *src_chan) { if (!(dst_chan->flags & IEEE80211_CHAN_RADAR) || !(src_chan->flags & IEEE80211_CHAN_RADAR)) return; if (dst_chan->flags & IEEE80211_CHAN_DISABLED || src_chan->flags & IEEE80211_CHAN_DISABLED) return; if (src_chan->center_freq == dst_chan->center_freq && dst_chan->dfs_state == NL80211_DFS_USABLE) { dst_chan->dfs_state = src_chan->dfs_state; dst_chan->dfs_state_entered = src_chan->dfs_state_entered; } } static void wiphy_share_dfs_chan_state(struct wiphy *dst_wiphy, struct wiphy *src_wiphy) { struct ieee80211_supported_band *src_sband, *dst_sband; struct ieee80211_channel *src_chan, *dst_chan; int i, j, band; if (!reg_dfs_domain_same(dst_wiphy, src_wiphy)) return; for (band = 0; band < NUM_NL80211_BANDS; band++) { dst_sband = dst_wiphy->bands[band]; src_sband = src_wiphy->bands[band]; if (!dst_sband || !src_sband) continue; for (i = 0; i < dst_sband->n_channels; i++) { dst_chan = &dst_sband->channels[i]; for (j = 0; j < src_sband->n_channels; j++) { src_chan = &src_sband->channels[j]; reg_copy_dfs_chan_state(dst_chan, src_chan); } } } } static void wiphy_all_share_dfs_chan_state(struct wiphy *wiphy) { struct cfg80211_registered_device *rdev; ASSERT_RTNL(); list_for_each_entry(rdev, &cfg80211_rdev_list, list) { if (wiphy == &rdev->wiphy) continue; wiphy_share_dfs_chan_state(wiphy, &rdev->wiphy); } } /* This processes *all* regulatory hints */ static void reg_process_hint(struct regulatory_request *reg_request) { struct wiphy *wiphy = NULL; enum reg_request_treatment treatment; enum nl80211_reg_initiator initiator = reg_request->initiator; if (reg_request->wiphy_idx != WIPHY_IDX_INVALID) wiphy = wiphy_idx_to_wiphy(reg_request->wiphy_idx); switch (initiator) { case NL80211_REGDOM_SET_BY_CORE: treatment = reg_process_hint_core(reg_request); break; case NL80211_REGDOM_SET_BY_USER: treatment = reg_process_hint_user(reg_request); break; case NL80211_REGDOM_SET_BY_DRIVER: if (!wiphy) goto out_free; treatment = reg_process_hint_driver(wiphy, reg_request); break; case NL80211_REGDOM_SET_BY_COUNTRY_IE: if (!wiphy) goto out_free; treatment = reg_process_hint_country_ie(wiphy, reg_request); break; default: WARN(1, "invalid initiator %d\n", initiator); goto out_free; } if (treatment == REG_REQ_IGNORE) goto out_free; WARN(treatment != REG_REQ_OK && treatment != REG_REQ_ALREADY_SET, "unexpected treatment value %d\n", treatment); /* This is required so that the orig_* parameters are saved. * NOTE: treatment must be set for any case that reaches here! */ if (treatment == REG_REQ_ALREADY_SET && wiphy && wiphy->regulatory_flags & REGULATORY_STRICT_REG) { wiphy_update_regulatory(wiphy, initiator); wiphy_all_share_dfs_chan_state(wiphy); reg_check_channels(); } return; out_free: reg_free_request(reg_request); } static void notify_self_managed_wiphys(struct regulatory_request *request) { struct cfg80211_registered_device *rdev; struct wiphy *wiphy; list_for_each_entry(rdev, &cfg80211_rdev_list, list) { wiphy = &rdev->wiphy; if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED && request->initiator == NL80211_REGDOM_SET_BY_USER) reg_call_notifier(wiphy, request); } } /* * Processes regulatory hints, this is all the NL80211_REGDOM_SET_BY_* * Regulatory hints come on a first come first serve basis and we * must process each one atomically. */ static void reg_process_pending_hints(void) { struct regulatory_request *reg_request, *lr; lr = get_last_request(); /* When last_request->processed becomes true this will be rescheduled */ if (lr && !lr->processed) { pr_debug("Pending regulatory request, waiting for it to be processed...\n"); return; } spin_lock(&reg_requests_lock); if (list_empty(&reg_requests_list)) { spin_unlock(&reg_requests_lock); return; } reg_request = list_first_entry(&reg_requests_list, struct regulatory_request, list); list_del_init(&reg_request->list); spin_unlock(&reg_requests_lock); notify_self_managed_wiphys(reg_request); reg_process_hint(reg_request); lr = get_last_request(); spin_lock(&reg_requests_lock); if (!list_empty(&reg_requests_list) && lr && lr->processed) schedule_work(&reg_work); spin_unlock(&reg_requests_lock); } /* Processes beacon hints -- this has nothing to do with country IEs */ static void reg_process_pending_beacon_hints(void) { struct cfg80211_registered_device *rdev; struct reg_beacon *pending_beacon, *tmp; /* This goes through the _pending_ beacon list */ spin_lock_bh(&reg_pending_beacons_lock); list_for_each_entry_safe(pending_beacon, tmp, &reg_pending_beacons, list) { list_del_init(&pending_beacon->list); /* Applies the beacon hint to current wiphys */ list_for_each_entry(rdev, &cfg80211_rdev_list, list) wiphy_update_new_beacon(&rdev->wiphy, pending_beacon); /* Remembers the beacon hint for new wiphys or reg changes */ list_add_tail(&pending_beacon->list, &reg_beacon_list); } spin_unlock_bh(&reg_pending_beacons_lock); } static void reg_process_self_managed_hint(struct wiphy *wiphy) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); const struct ieee80211_regdomain *tmp; const struct ieee80211_regdomain *regd; enum nl80211_band band; struct regulatory_request request = {}; ASSERT_RTNL(); lockdep_assert_wiphy(wiphy); spin_lock(&reg_requests_lock); regd = rdev->requested_regd; rdev->requested_regd = NULL; spin_unlock(&reg_requests_lock); if (!regd) return; tmp = get_wiphy_regdom(wiphy); rcu_assign_pointer(wiphy->regd, regd); rcu_free_regdom(tmp); for (band = 0; band < NUM_NL80211_BANDS; band++) handle_band_custom(wiphy, wiphy->bands[band], regd); reg_process_ht_flags(wiphy); request.wiphy_idx = get_wiphy_idx(wiphy); request.alpha2[0] = regd->alpha2[0]; request.alpha2[1] = regd->alpha2[1]; request.initiator = NL80211_REGDOM_SET_BY_DRIVER; if (wiphy->flags & WIPHY_FLAG_NOTIFY_REGDOM_BY_DRIVER) reg_call_notifier(wiphy, &request); nl80211_send_wiphy_reg_change_event(&request); } static void reg_process_self_managed_hints(void) { struct cfg80211_registered_device *rdev; ASSERT_RTNL(); list_for_each_entry(rdev, &cfg80211_rdev_list, list) { wiphy_lock(&rdev->wiphy); reg_process_self_managed_hint(&rdev->wiphy); wiphy_unlock(&rdev->wiphy); } reg_check_channels(); } static void reg_todo(struct work_struct *work) { rtnl_lock(); reg_process_pending_hints(); reg_process_pending_beacon_hints(); reg_process_self_managed_hints(); rtnl_unlock(); } static void queue_regulatory_request(struct regulatory_request *request) { request->alpha2[0] = toupper(request->alpha2[0]); request->alpha2[1] = toupper(request->alpha2[1]); spin_lock(&reg_requests_lock); list_add_tail(&request->list, &reg_requests_list); spin_unlock(&reg_requests_lock); schedule_work(&reg_work); } /* * Core regulatory hint -- happens during cfg80211_init() * and when we restore regulatory settings. */ static int regulatory_hint_core(const char *alpha2) { struct regulatory_request *request; request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL); if (!request) return -ENOMEM; request->alpha2[0] = alpha2[0]; request->alpha2[1] = alpha2[1]; request->initiator = NL80211_REGDOM_SET_BY_CORE; request->wiphy_idx = WIPHY_IDX_INVALID; queue_regulatory_request(request); return 0; } /* User hints */ int regulatory_hint_user(const char *alpha2, enum nl80211_user_reg_hint_type user_reg_hint_type) { struct regulatory_request *request; if (WARN_ON(!alpha2)) return -EINVAL; if (!is_world_regdom(alpha2) && !is_an_alpha2(alpha2)) return -EINVAL; request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL); if (!request) return -ENOMEM; request->wiphy_idx = WIPHY_IDX_INVALID; request->alpha2[0] = alpha2[0]; request->alpha2[1] = alpha2[1]; request->initiator = NL80211_REGDOM_SET_BY_USER; request->user_reg_hint_type = user_reg_hint_type; /* Allow calling CRDA again */ reset_crda_timeouts(); queue_regulatory_request(request); return 0; } int regulatory_hint_indoor(bool is_indoor, u32 portid) { spin_lock(&reg_indoor_lock); /* It is possible that more than one user space process is trying to * configure the indoor setting. To handle such cases, clear the indoor * setting in case that some process does not think that the device * is operating in an indoor environment. In addition, if a user space * process indicates that it is controlling the indoor setting, save its * portid, i.e., make it the owner. */ reg_is_indoor = is_indoor; if (reg_is_indoor) { if (!reg_is_indoor_portid) reg_is_indoor_portid = portid; } else { reg_is_indoor_portid = 0; } spin_unlock(&reg_indoor_lock); if (!is_indoor) reg_check_channels(); return 0; } void regulatory_netlink_notify(u32 portid) { spin_lock(&reg_indoor_lock); if (reg_is_indoor_portid != portid) { spin_unlock(&reg_indoor_lock); return; } reg_is_indoor = false; reg_is_indoor_portid = 0; spin_unlock(&reg_indoor_lock); reg_check_channels(); } /* Driver hints */ int regulatory_hint(struct wiphy *wiphy, const char *alpha2) { struct regulatory_request *request; if (WARN_ON(!alpha2 || !wiphy)) return -EINVAL; wiphy->regulatory_flags &= ~REGULATORY_CUSTOM_REG; request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL); if (!request) return -ENOMEM; request->wiphy_idx = get_wiphy_idx(wiphy); request->alpha2[0] = alpha2[0]; request->alpha2[1] = alpha2[1]; request->initiator = NL80211_REGDOM_SET_BY_DRIVER; /* Allow calling CRDA again */ reset_crda_timeouts(); queue_regulatory_request(request); return 0; } EXPORT_SYMBOL(regulatory_hint); void regulatory_hint_country_ie(struct wiphy *wiphy, enum nl80211_band band, const u8 *country_ie, u8 country_ie_len) { char alpha2[2]; enum environment_cap env = ENVIRON_ANY; struct regulatory_request *request = NULL, *lr; /* IE len must be evenly divisible by 2 */ if (country_ie_len & 0x01) return; if (country_ie_len < IEEE80211_COUNTRY_IE_MIN_LEN) return; request = kzalloc(sizeof(*request), GFP_KERNEL); if (!request) return; alpha2[0] = country_ie[0]; alpha2[1] = country_ie[1]; if (country_ie[2] == 'I') env = ENVIRON_INDOOR; else if (country_ie[2] == 'O') env = ENVIRON_OUTDOOR; rcu_read_lock(); lr = get_last_request(); if (unlikely(!lr)) goto out; /* * We will run this only upon a successful connection on cfg80211. * We leave conflict resolution to the workqueue, where can hold * the RTNL. */ if (lr->initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE && lr->wiphy_idx != WIPHY_IDX_INVALID) goto out; request->wiphy_idx = get_wiphy_idx(wiphy); request->alpha2[0] = alpha2[0]; request->alpha2[1] = alpha2[1]; request->initiator = NL80211_REGDOM_SET_BY_COUNTRY_IE; request->country_ie_env = env; /* Allow calling CRDA again */ reset_crda_timeouts(); queue_regulatory_request(request); request = NULL; out: kfree(request); rcu_read_unlock(); } static void restore_alpha2(char *alpha2, bool reset_user) { /* indicates there is no alpha2 to consider for restoration */ alpha2[0] = '9'; alpha2[1] = '7'; /* The user setting has precedence over the module parameter */ if (is_user_regdom_saved()) { /* Unless we're asked to ignore it and reset it */ if (reset_user) { pr_debug("Restoring regulatory settings including user preference\n"); user_alpha2[0] = '9'; user_alpha2[1] = '7'; /* * If we're ignoring user settings, we still need to * check the module parameter to ensure we put things * back as they were for a full restore. */ if (!is_world_regdom(ieee80211_regdom)) { pr_debug("Keeping preference on module parameter ieee80211_regdom: %c%c\n", ieee80211_regdom[0], ieee80211_regdom[1]); alpha2[0] = ieee80211_regdom[0]; alpha2[1] = ieee80211_regdom[1]; } } else { pr_debug("Restoring regulatory settings while preserving user preference for: %c%c\n", user_alpha2[0], user_alpha2[1]); alpha2[0] = user_alpha2[0]; alpha2[1] = user_alpha2[1]; } } else if (!is_world_regdom(ieee80211_regdom)) { pr_debug("Keeping preference on module parameter ieee80211_regdom: %c%c\n", ieee80211_regdom[0], ieee80211_regdom[1]); alpha2[0] = ieee80211_regdom[0]; alpha2[1] = ieee80211_regdom[1]; } else pr_debug("Restoring regulatory settings\n"); } static void restore_custom_reg_settings(struct wiphy *wiphy) { struct ieee80211_supported_band *sband; enum nl80211_band band; struct ieee80211_channel *chan; int i; for (band = 0; band < NUM_NL80211_BANDS; band++) { sband = wiphy->bands[band]; if (!sband) continue; for (i = 0; i < sband->n_channels; i++) { chan = &sband->channels[i]; chan->flags = chan->orig_flags; chan->max_antenna_gain = chan->orig_mag; chan->max_power = chan->orig_mpwr; chan->beacon_found = false; } } } /* * Restoring regulatory settings involves ignoring any * possibly stale country IE information and user regulatory * settings if so desired, this includes any beacon hints * learned as we could have traveled outside to another country * after disconnection. To restore regulatory settings we do * exactly what we did at bootup: * * - send a core regulatory hint * - send a user regulatory hint if applicable * * Device drivers that send a regulatory hint for a specific country * keep their own regulatory domain on wiphy->regd so that does * not need to be remembered. */ static void restore_regulatory_settings(bool reset_user, bool cached) { char alpha2[2]; char world_alpha2[2]; struct reg_beacon *reg_beacon, *btmp; LIST_HEAD(tmp_reg_req_list); struct cfg80211_registered_device *rdev; ASSERT_RTNL(); /* * Clear the indoor setting in case that it is not controlled by user * space, as otherwise there is no guarantee that the device is still * operating in an indoor environment. */ spin_lock(&reg_indoor_lock); if (reg_is_indoor && !reg_is_indoor_portid) { reg_is_indoor = false; reg_check_channels(); } spin_unlock(&reg_indoor_lock); reset_regdomains(true, &world_regdom); restore_alpha2(alpha2, reset_user); /* * If there's any pending requests we simply * stash them to a temporary pending queue and * add then after we've restored regulatory * settings. */ spin_lock(&reg_requests_lock); list_splice_tail_init(&reg_requests_list, &tmp_reg_req_list); spin_unlock(&reg_requests_lock); /* Clear beacon hints */ spin_lock_bh(&reg_pending_beacons_lock); list_for_each_entry_safe(reg_beacon, btmp, &reg_pending_beacons, list) { list_del(&reg_beacon->list); kfree(reg_beacon); } spin_unlock_bh(&reg_pending_beacons_lock); list_for_each_entry_safe(reg_beacon, btmp, &reg_beacon_list, list) { list_del(&reg_beacon->list); kfree(reg_beacon); } /* First restore to the basic regulatory settings */ world_alpha2[0] = cfg80211_world_regdom->alpha2[0]; world_alpha2[1] = cfg80211_world_regdom->alpha2[1]; list_for_each_entry(rdev, &cfg80211_rdev_list, list) { if (rdev->wiphy.regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) continue; if (rdev->wiphy.regulatory_flags & REGULATORY_CUSTOM_REG) restore_custom_reg_settings(&rdev->wiphy); } if (cached && (!is_an_alpha2(alpha2) || !IS_ERR_OR_NULL(cfg80211_user_regdom))) { reset_regdomains(false, cfg80211_world_regdom); update_all_wiphy_regulatory(NL80211_REGDOM_SET_BY_CORE); print_regdomain(get_cfg80211_regdom()); nl80211_send_reg_change_event(&core_request_world); reg_set_request_processed(); if (is_an_alpha2(alpha2) && !regulatory_hint_user(alpha2, NL80211_USER_REG_HINT_USER)) { struct regulatory_request *ureq; spin_lock(&reg_requests_lock); ureq = list_last_entry(&reg_requests_list, struct regulatory_request, list); list_del(&ureq->list); spin_unlock(&reg_requests_lock); notify_self_managed_wiphys(ureq); reg_update_last_request(ureq); set_regdom(reg_copy_regd(cfg80211_user_regdom), REGD_SOURCE_CACHED); } } else { regulatory_hint_core(world_alpha2); /* * This restores the ieee80211_regdom module parameter * preference or the last user requested regulatory * settings, user regulatory settings takes precedence. */ if (is_an_alpha2(alpha2)) regulatory_hint_user(alpha2, NL80211_USER_REG_HINT_USER); } spin_lock(&reg_requests_lock); list_splice_tail_init(&tmp_reg_req_list, &reg_requests_list); spin_unlock(&reg_requests_lock); pr_debug("Kicking the queue\n"); schedule_work(&reg_work); } static bool is_wiphy_all_set_reg_flag(enum ieee80211_regulatory_flags flag) { struct cfg80211_registered_device *rdev; struct wireless_dev *wdev; list_for_each_entry(rdev, &cfg80211_rdev_list, list) { list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { wdev_lock(wdev); if (!(wdev->wiphy->regulatory_flags & flag)) { wdev_unlock(wdev); return false; } wdev_unlock(wdev); } } return true; } void regulatory_hint_disconnect(void) { /* Restore of regulatory settings is not required when wiphy(s) * ignore IE from connected access point but clearance of beacon hints * is required when wiphy(s) supports beacon hints. */ if (is_wiphy_all_set_reg_flag(REGULATORY_COUNTRY_IE_IGNORE)) { struct reg_beacon *reg_beacon, *btmp; if (is_wiphy_all_set_reg_flag(REGULATORY_DISABLE_BEACON_HINTS)) return; spin_lock_bh(&reg_pending_beacons_lock); list_for_each_entry_safe(reg_beacon, btmp, &reg_pending_beacons, list) { list_del(&reg_beacon->list); kfree(reg_beacon); } spin_unlock_bh(&reg_pending_beacons_lock); list_for_each_entry_safe(reg_beacon, btmp, &reg_beacon_list, list) { list_del(&reg_beacon->list); kfree(reg_beacon); } return; } pr_debug("All devices are disconnected, going to restore regulatory settings\n"); restore_regulatory_settings(false, true); } static bool freq_is_chan_12_13_14(u32 freq) { if (freq == ieee80211_channel_to_frequency(12, NL80211_BAND_2GHZ) || freq == ieee80211_channel_to_frequency(13, NL80211_BAND_2GHZ) || freq == ieee80211_channel_to_frequency(14, NL80211_BAND_2GHZ)) return true; return false; } static bool pending_reg_beacon(struct ieee80211_channel *beacon_chan) { struct reg_beacon *pending_beacon; list_for_each_entry(pending_beacon, &reg_pending_beacons, list) if (ieee80211_channel_equal(beacon_chan, &pending_beacon->chan)) return true; return false; } int regulatory_hint_found_beacon(struct wiphy *wiphy, struct ieee80211_channel *beacon_chan, gfp_t gfp) { struct reg_beacon *reg_beacon; bool processing; if (beacon_chan->beacon_found || beacon_chan->flags & IEEE80211_CHAN_RADAR || (beacon_chan->band == NL80211_BAND_2GHZ && !freq_is_chan_12_13_14(beacon_chan->center_freq))) return 0; spin_lock_bh(&reg_pending_beacons_lock); processing = pending_reg_beacon(beacon_chan); spin_unlock_bh(&reg_pending_beacons_lock); if (processing) return 0; reg_beacon = kzalloc(sizeof(struct reg_beacon), gfp); if (!reg_beacon) return -ENOMEM; pr_debug("Found new beacon on frequency: %d.%03d MHz (Ch %d) on %s\n", beacon_chan->center_freq, beacon_chan->freq_offset, ieee80211_freq_khz_to_channel( ieee80211_channel_to_khz(beacon_chan)), wiphy_name(wiphy)); memcpy(&reg_beacon->chan, beacon_chan, sizeof(struct ieee80211_channel)); /* * Since we can be called from BH or and non-BH context * we must use spin_lock_bh() */ spin_lock_bh(&reg_pending_beacons_lock); list_add_tail(&reg_beacon->list, &reg_pending_beacons); spin_unlock_bh(&reg_pending_beacons_lock); schedule_work(&reg_work); return 0; } static void print_rd_rules(const struct ieee80211_regdomain *rd) { unsigned int i; const struct ieee80211_reg_rule *reg_rule = NULL; const struct ieee80211_freq_range *freq_range = NULL; const struct ieee80211_power_rule *power_rule = NULL; char bw[32], cac_time[32]; pr_debug(" (start_freq - end_freq @ bandwidth), (max_antenna_gain, max_eirp), (dfs_cac_time)\n"); for (i = 0; i < rd->n_reg_rules; i++) { reg_rule = &rd->reg_rules[i]; freq_range = &reg_rule->freq_range; power_rule = &reg_rule->power_rule; if (reg_rule->flags & NL80211_RRF_AUTO_BW) snprintf(bw, sizeof(bw), "%d KHz, %u KHz AUTO", freq_range->max_bandwidth_khz, reg_get_max_bandwidth(rd, reg_rule)); else snprintf(bw, sizeof(bw), "%d KHz", freq_range->max_bandwidth_khz); if (reg_rule->flags & NL80211_RRF_DFS) scnprintf(cac_time, sizeof(cac_time), "%u s", reg_rule->dfs_cac_ms/1000); else scnprintf(cac_time, sizeof(cac_time), "N/A"); /* * There may not be documentation for max antenna gain * in certain regions */ if (power_rule->max_antenna_gain) pr_debug(" (%d KHz - %d KHz @ %s), (%d mBi, %d mBm), (%s)\n", freq_range->start_freq_khz, freq_range->end_freq_khz, bw, power_rule->max_antenna_gain, power_rule->max_eirp, cac_time); else pr_debug(" (%d KHz - %d KHz @ %s), (N/A, %d mBm), (%s)\n", freq_range->start_freq_khz, freq_range->end_freq_khz, bw, power_rule->max_eirp, cac_time); } } bool reg_supported_dfs_region(enum nl80211_dfs_regions dfs_region) { switch (dfs_region) { case NL80211_DFS_UNSET: case NL80211_DFS_FCC: case NL80211_DFS_ETSI: case NL80211_DFS_JP: return true; default: pr_debug("Ignoring unknown DFS master region: %d\n", dfs_region); return false; } } static void print_regdomain(const struct ieee80211_regdomain *rd) { struct regulatory_request *lr = get_last_request(); if (is_intersected_alpha2(rd->alpha2)) { if (lr->initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE) { struct cfg80211_registered_device *rdev; rdev = cfg80211_rdev_by_wiphy_idx(lr->wiphy_idx); if (rdev) { pr_debug("Current regulatory domain updated by AP to: %c%c\n", rdev->country_ie_alpha2[0], rdev->country_ie_alpha2[1]); } else pr_debug("Current regulatory domain intersected:\n"); } else pr_debug("Current regulatory domain intersected:\n"); } else if (is_world_regdom(rd->alpha2)) { pr_debug("World regulatory domain updated:\n"); } else { if (is_unknown_alpha2(rd->alpha2)) pr_debug("Regulatory domain changed to driver built-in settings (unknown country)\n"); else { if (reg_request_cell_base(lr)) pr_debug("Regulatory domain changed to country: %c%c by Cell Station\n", rd->alpha2[0], rd->alpha2[1]); else pr_debug("Regulatory domain changed to country: %c%c\n", rd->alpha2[0], rd->alpha2[1]); } } pr_debug(" DFS Master region: %s", reg_dfs_region_str(rd->dfs_region)); print_rd_rules(rd); } static void print_regdomain_info(const struct ieee80211_regdomain *rd) { pr_debug("Regulatory domain: %c%c\n", rd->alpha2[0], rd->alpha2[1]); print_rd_rules(rd); } static int reg_set_rd_core(const struct ieee80211_regdomain *rd) { if (!is_world_regdom(rd->alpha2)) return -EINVAL; update_world_regdomain(rd); return 0; } static int reg_set_rd_user(const struct ieee80211_regdomain *rd, struct regulatory_request *user_request) { const struct ieee80211_regdomain *intersected_rd = NULL; if (!regdom_changes(rd->alpha2)) return -EALREADY; if (!is_valid_rd(rd)) { pr_err("Invalid regulatory domain detected: %c%c\n", rd->alpha2[0], rd->alpha2[1]); print_regdomain_info(rd); return -EINVAL; } if (!user_request->intersect) { reset_regdomains(false, rd); return 0; } intersected_rd = regdom_intersect(rd, get_cfg80211_regdom()); if (!intersected_rd) return -EINVAL; kfree(rd); rd = NULL; reset_regdomains(false, intersected_rd); return 0; } static int reg_set_rd_driver(const struct ieee80211_regdomain *rd, struct regulatory_request *driver_request) { const struct ieee80211_regdomain *regd; const struct ieee80211_regdomain *intersected_rd = NULL; const struct ieee80211_regdomain *tmp; struct wiphy *request_wiphy; if (is_world_regdom(rd->alpha2)) return -EINVAL; if (!regdom_changes(rd->alpha2)) return -EALREADY; if (!is_valid_rd(rd)) { pr_err("Invalid regulatory domain detected: %c%c\n", rd->alpha2[0], rd->alpha2[1]); print_regdomain_info(rd); return -EINVAL; } request_wiphy = wiphy_idx_to_wiphy(driver_request->wiphy_idx); if (!request_wiphy) return -ENODEV; if (!driver_request->intersect) { ASSERT_RTNL(); wiphy_lock(request_wiphy); if (request_wiphy->regd) { wiphy_unlock(request_wiphy); return -EALREADY; } regd = reg_copy_regd(rd); if (IS_ERR(regd)) { wiphy_unlock(request_wiphy); return PTR_ERR(regd); } rcu_assign_pointer(request_wiphy->regd, regd); wiphy_unlock(request_wiphy); reset_regdomains(false, rd); return 0; } intersected_rd = regdom_intersect(rd, get_cfg80211_regdom()); if (!intersected_rd) return -EINVAL; /* * We can trash what CRDA provided now. * However if a driver requested this specific regulatory * domain we keep it for its private use */ tmp = get_wiphy_regdom(request_wiphy); rcu_assign_pointer(request_wiphy->regd, rd); rcu_free_regdom(tmp); rd = NULL; reset_regdomains(false, intersected_rd); return 0; } static int reg_set_rd_country_ie(const struct ieee80211_regdomain *rd, struct regulatory_request *country_ie_request) { struct wiphy *request_wiphy; if (!is_alpha2_set(rd->alpha2) && !is_an_alpha2(rd->alpha2) && !is_unknown_alpha2(rd->alpha2)) return -EINVAL; /* * Lets only bother proceeding on the same alpha2 if the current * rd is non static (it means CRDA was present and was used last) * and the pending request came in from a country IE */ if (!is_valid_rd(rd)) { pr_err("Invalid regulatory domain detected: %c%c\n", rd->alpha2[0], rd->alpha2[1]); print_regdomain_info(rd); return -EINVAL; } request_wiphy = wiphy_idx_to_wiphy(country_ie_request->wiphy_idx); if (!request_wiphy) return -ENODEV; if (country_ie_request->intersect) return -EINVAL; reset_regdomains(false, rd); return 0; } /* * Use this call to set the current regulatory domain. Conflicts with * multiple drivers can be ironed out later. Caller must've already * kmalloc'd the rd structure. */ int set_regdom(const struct ieee80211_regdomain *rd, enum ieee80211_regd_source regd_src) { struct regulatory_request *lr; bool user_reset = false; int r; if (IS_ERR_OR_NULL(rd)) return -ENODATA; if (!reg_is_valid_request(rd->alpha2)) { kfree(rd); return -EINVAL; } if (regd_src == REGD_SOURCE_CRDA) reset_crda_timeouts(); lr = get_last_request(); /* Note that this doesn't update the wiphys, this is done below */ switch (lr->initiator) { case NL80211_REGDOM_SET_BY_CORE: r = reg_set_rd_core(rd); break; case NL80211_REGDOM_SET_BY_USER: cfg80211_save_user_regdom(rd); r = reg_set_rd_user(rd, lr); user_reset = true; break; case NL80211_REGDOM_SET_BY_DRIVER: r = reg_set_rd_driver(rd, lr); break; case NL80211_REGDOM_SET_BY_COUNTRY_IE: r = reg_set_rd_country_ie(rd, lr); break; default: WARN(1, "invalid initiator %d\n", lr->initiator); kfree(rd); return -EINVAL; } if (r) { switch (r) { case -EALREADY: reg_set_request_processed(); break; default: /* Back to world regulatory in case of errors */ restore_regulatory_settings(user_reset, false); } kfree(rd); return r; } /* This would make this whole thing pointless */ if (WARN_ON(!lr->intersect && rd != get_cfg80211_regdom())) return -EINVAL; /* update all wiphys now with the new established regulatory domain */ update_all_wiphy_regulatory(lr->initiator); print_regdomain(get_cfg80211_regdom()); nl80211_send_reg_change_event(lr); reg_set_request_processed(); return 0; } static int __regulatory_set_wiphy_regd(struct wiphy *wiphy, struct ieee80211_regdomain *rd) { const struct ieee80211_regdomain *regd; const struct ieee80211_regdomain *prev_regd; struct cfg80211_registered_device *rdev; if (WARN_ON(!wiphy || !rd)) return -EINVAL; if (WARN(!(wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED), "wiphy should have REGULATORY_WIPHY_SELF_MANAGED\n")) return -EPERM; if (WARN(!is_valid_rd(rd), "Invalid regulatory domain detected: %c%c\n", rd->alpha2[0], rd->alpha2[1])) { print_regdomain_info(rd); return -EINVAL; } regd = reg_copy_regd(rd); if (IS_ERR(regd)) return PTR_ERR(regd); rdev = wiphy_to_rdev(wiphy); spin_lock(&reg_requests_lock); prev_regd = rdev->requested_regd; rdev->requested_regd = regd; spin_unlock(&reg_requests_lock); kfree(prev_regd); return 0; } int regulatory_set_wiphy_regd(struct wiphy *wiphy, struct ieee80211_regdomain *rd) { int ret = __regulatory_set_wiphy_regd(wiphy, rd); if (ret) return ret; schedule_work(&reg_work); return 0; } EXPORT_SYMBOL(regulatory_set_wiphy_regd); int regulatory_set_wiphy_regd_sync(struct wiphy *wiphy, struct ieee80211_regdomain *rd) { int ret; ASSERT_RTNL(); ret = __regulatory_set_wiphy_regd(wiphy, rd); if (ret) return ret; /* process the request immediately */ reg_process_self_managed_hint(wiphy); reg_check_channels(); return 0; } EXPORT_SYMBOL(regulatory_set_wiphy_regd_sync); void wiphy_regulatory_register(struct wiphy *wiphy) { struct regulatory_request *lr = get_last_request(); /* self-managed devices ignore beacon hints and country IE */ if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) { wiphy->regulatory_flags |= REGULATORY_DISABLE_BEACON_HINTS | REGULATORY_COUNTRY_IE_IGNORE; /* * The last request may have been received before this * registration call. Call the driver notifier if * initiator is USER. */ if (lr->initiator == NL80211_REGDOM_SET_BY_USER) reg_call_notifier(wiphy, lr); } if (!reg_dev_ignore_cell_hint(wiphy)) reg_num_devs_support_basehint++; wiphy_update_regulatory(wiphy, lr->initiator); wiphy_all_share_dfs_chan_state(wiphy); reg_process_self_managed_hints(); } void wiphy_regulatory_deregister(struct wiphy *wiphy) { struct wiphy *request_wiphy = NULL; struct regulatory_request *lr; lr = get_last_request(); if (!reg_dev_ignore_cell_hint(wiphy)) reg_num_devs_support_basehint--; rcu_free_regdom(get_wiphy_regdom(wiphy)); RCU_INIT_POINTER(wiphy->regd, NULL); if (lr) request_wiphy = wiphy_idx_to_wiphy(lr->wiphy_idx); if (!request_wiphy || request_wiphy != wiphy) return; lr->wiphy_idx = WIPHY_IDX_INVALID; lr->country_ie_env = ENVIRON_ANY; } /* * See FCC notices for UNII band definitions * 5GHz: https://www.fcc.gov/document/5-ghz-unlicensed-spectrum-unii * 6GHz: https://www.fcc.gov/document/fcc-proposes-more-spectrum-unlicensed-use-0 */ int cfg80211_get_unii(int freq) { /* UNII-1 */ if (freq >= 5150 && freq <= 5250) return 0; /* UNII-2A */ if (freq > 5250 && freq <= 5350) return 1; /* UNII-2B */ if (freq > 5350 && freq <= 5470) return 2; /* UNII-2C */ if (freq > 5470 && freq <= 5725) return 3; /* UNII-3 */ if (freq > 5725 && freq <= 5825) return 4; /* UNII-5 */ if (freq > 5925 && freq <= 6425) return 5; /* UNII-6 */ if (freq > 6425 && freq <= 6525) return 6; /* UNII-7 */ if (freq > 6525 && freq <= 6875) return 7; /* UNII-8 */ if (freq > 6875 && freq <= 7125) return 8; return -EINVAL; } bool regulatory_indoor_allowed(void) { return reg_is_indoor; } bool regulatory_pre_cac_allowed(struct wiphy *wiphy) { const struct ieee80211_regdomain *regd = NULL; const struct ieee80211_regdomain *wiphy_regd = NULL; bool pre_cac_allowed = false; rcu_read_lock(); regd = rcu_dereference(cfg80211_regdomain); wiphy_regd = rcu_dereference(wiphy->regd); if (!wiphy_regd) { if (regd->dfs_region == NL80211_DFS_ETSI) pre_cac_allowed = true; rcu_read_unlock(); return pre_cac_allowed; } if (regd->dfs_region == wiphy_regd->dfs_region && wiphy_regd->dfs_region == NL80211_DFS_ETSI) pre_cac_allowed = true; rcu_read_unlock(); return pre_cac_allowed; } EXPORT_SYMBOL(regulatory_pre_cac_allowed); static void cfg80211_check_and_end_cac(struct cfg80211_registered_device *rdev) { struct wireless_dev *wdev; /* If we finished CAC or received radar, we should end any * CAC running on the same channels. * the check !cfg80211_chandef_dfs_usable contain 2 options: * either all channels are available - those the CAC_FINISHED * event has effected another wdev state, or there is a channel * in unavailable state in wdev chandef - those the RADAR_DETECTED * event has effected another wdev state. * In both cases we should end the CAC on the wdev. */ list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { struct cfg80211_chan_def *chandef; if (!wdev->cac_started) continue; /* FIXME: radar detection is tied to link 0 for now */ chandef = wdev_chandef(wdev, 0); if (!chandef) continue; if (!cfg80211_chandef_dfs_usable(&rdev->wiphy, chandef)) rdev_end_cac(rdev, wdev->netdev); } } void regulatory_propagate_dfs_state(struct wiphy *wiphy, struct cfg80211_chan_def *chandef, enum nl80211_dfs_state dfs_state, enum nl80211_radar_event event) { struct cfg80211_registered_device *rdev; ASSERT_RTNL(); if (WARN_ON(!cfg80211_chandef_valid(chandef))) return; list_for_each_entry(rdev, &cfg80211_rdev_list, list) { if (wiphy == &rdev->wiphy) continue; if (!reg_dfs_domain_same(wiphy, &rdev->wiphy)) continue; if (!ieee80211_get_channel(&rdev->wiphy, chandef->chan->center_freq)) continue; cfg80211_set_dfs_state(&rdev->wiphy, chandef, dfs_state); if (event == NL80211_RADAR_DETECTED || event == NL80211_RADAR_CAC_FINISHED) { cfg80211_sched_dfs_chan_update(rdev); cfg80211_check_and_end_cac(rdev); } nl80211_radar_notify(rdev, chandef, event, NULL, GFP_KERNEL); } } static int __init regulatory_init_db(void) { int err; /* * It's possible that - due to other bugs/issues - cfg80211 * never called regulatory_init() below, or that it failed; * in that case, don't try to do any further work here as * it's doomed to lead to crashes. */ if (IS_ERR_OR_NULL(reg_pdev)) return -EINVAL; err = load_builtin_regdb_keys(); if (err) { platform_device_unregister(reg_pdev); return err; } /* We always try to get an update for the static regdomain */ err = regulatory_hint_core(cfg80211_world_regdom->alpha2); if (err) { if (err == -ENOMEM) { platform_device_unregister(reg_pdev); return err; } /* * N.B. kobject_uevent_env() can fail mainly for when we're out * memory which is handled and propagated appropriately above * but it can also fail during a netlink_broadcast() or during * early boot for call_usermodehelper(). For now treat these * errors as non-fatal. */ pr_err("kobject_uevent_env() was unable to call CRDA during init\n"); } /* * Finally, if the user set the module parameter treat it * as a user hint. */ if (!is_world_regdom(ieee80211_regdom)) regulatory_hint_user(ieee80211_regdom, NL80211_USER_REG_HINT_USER); return 0; } #ifndef MODULE late_initcall(regulatory_init_db); #endif int __init regulatory_init(void) { reg_pdev = platform_device_register_simple("regulatory", 0, NULL, 0); if (IS_ERR(reg_pdev)) return PTR_ERR(reg_pdev); rcu_assign_pointer(cfg80211_regdomain, cfg80211_world_regdom); user_alpha2[0] = '9'; user_alpha2[1] = '7'; #ifdef MODULE return regulatory_init_db(); #else return 0; #endif } void regulatory_exit(void) { struct regulatory_request *reg_request, *tmp; struct reg_beacon *reg_beacon, *btmp; cancel_work_sync(&reg_work); cancel_crda_timeout_sync(); cancel_delayed_work_sync(&reg_check_chans); /* Lock to suppress warnings */ rtnl_lock(); reset_regdomains(true, NULL); rtnl_unlock(); dev_set_uevent_suppress(&reg_pdev->dev, true); platform_device_unregister(reg_pdev); list_for_each_entry_safe(reg_beacon, btmp, &reg_pending_beacons, list) { list_del(&reg_beacon->list); kfree(reg_beacon); } list_for_each_entry_safe(reg_beacon, btmp, &reg_beacon_list, list) { list_del(&reg_beacon->list); kfree(reg_beacon); } list_for_each_entry_safe(reg_request, tmp, &reg_requests_list, list) { list_del(&reg_request->list); kfree(reg_request); } if (!IS_ERR_OR_NULL(regdb)) kfree(regdb); if (!IS_ERR_OR_NULL(cfg80211_user_regdom)) kfree(cfg80211_user_regdom); free_regdb_keyring(); }
2 2 2 2 2 836 7 831 557 486 836 3 3 7 4 7 7 3 4 7 7 7 3 3 3 3 3 7 7 7 7 7 2 1 1 1 1 1 2 1 1 1 1 1 1 6 6 6 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 // SPDX-License-Identifier: GPL-2.0-only /* * net/ipv6/fib6_rules.c IPv6 Routing Policy Rules * * Copyright (C)2003-2006 Helsinki University of Technology * Copyright (C)2003-2006 USAGI/WIDE Project * * Authors * Thomas Graf <tgraf@suug.ch> * Ville Nuorvala <vnuorval@tcs.hut.fi> */ #include <linux/netdevice.h> #include <linux/notifier.h> #include <linux/export.h> #include <linux/indirect_call_wrapper.h> #include <net/fib_rules.h> #include <net/inet_dscp.h> #include <net/ipv6.h> #include <net/addrconf.h> #include <net/ip6_route.h> #include <net/netlink.h> struct fib6_rule { struct fib_rule common; struct rt6key src; struct rt6key dst; dscp_t dscp; }; static bool fib6_rule_matchall(const struct fib_rule *rule) { struct fib6_rule *r = container_of(rule, struct fib6_rule, common); if (r->dst.plen || r->src.plen || r->dscp) return false; return fib_rule_matchall(rule); } bool fib6_rule_default(const struct fib_rule *rule) { if (!fib6_rule_matchall(rule) || rule->action != FR_ACT_TO_TBL || rule->l3mdev) return false; if (rule->table != RT6_TABLE_LOCAL && rule->table != RT6_TABLE_MAIN) return false; return true; } EXPORT_SYMBOL_GPL(fib6_rule_default); int fib6_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return fib_rules_dump(net, nb, AF_INET6, extack); } unsigned int fib6_rules_seq_read(struct net *net) { return fib_rules_seq_read(net, AF_INET6); } /* called with rcu lock held; no reference taken on fib6_info */ int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, struct fib6_result *res, int flags) { int err; if (net->ipv6.fib6_has_custom_rules) { struct fib_lookup_arg arg = { .lookup_ptr = fib6_table_lookup, .lookup_data = &oif, .result = res, .flags = FIB_LOOKUP_NOREF, }; l3mdev_update_flow(net, flowi6_to_flowi(fl6)); err = fib_rules_lookup(net->ipv6.fib6_rules_ops, flowi6_to_flowi(fl6), flags, &arg); } else { err = fib6_table_lookup(net, net->ipv6.fib6_local_tbl, oif, fl6, res, flags); if (err || res->f6i == net->ipv6.fib6_null_entry) err = fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, res, flags); } return err; } struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags, pol_lookup_t lookup) { if (net->ipv6.fib6_has_custom_rules) { struct fib6_result res = {}; struct fib_lookup_arg arg = { .lookup_ptr = lookup, .lookup_data = skb, .result = &res, .flags = FIB_LOOKUP_NOREF, }; /* update flow if oif or iif point to device enslaved to l3mdev */ l3mdev_update_flow(net, flowi6_to_flowi(fl6)); fib_rules_lookup(net->ipv6.fib6_rules_ops, flowi6_to_flowi(fl6), flags, &arg); if (res.rt6) return &res.rt6->dst; } else { struct rt6_info *rt; rt = pol_lookup_func(lookup, net, net->ipv6.fib6_local_tbl, fl6, skb, flags); if (rt != net->ipv6.ip6_null_entry && rt->dst.error != -EAGAIN) return &rt->dst; ip6_rt_put_flags(rt, flags); rt = pol_lookup_func(lookup, net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error != -EAGAIN) return &rt->dst; ip6_rt_put_flags(rt, flags); } if (!(flags & RT6_LOOKUP_F_DST_NOREF)) dst_hold(&net->ipv6.ip6_null_entry->dst); return &net->ipv6.ip6_null_entry->dst; } static int fib6_rule_saddr(struct net *net, struct fib_rule *rule, int flags, struct flowi6 *flp6, const struct net_device *dev) { struct fib6_rule *r = (struct fib6_rule *)rule; /* If we need to find a source address for this traffic, * we check the result if it meets requirement of the rule. */ if ((rule->flags & FIB_RULE_FIND_SADDR) && r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) { struct in6_addr saddr; if (ipv6_dev_get_saddr(net, dev, &flp6->daddr, rt6_flags2srcprefs(flags), &saddr)) return -EAGAIN; if (!ipv6_prefix_equal(&saddr, &r->src.addr, r->src.plen)) return -EAGAIN; flp6->saddr = saddr; } return 0; } static int fib6_rule_action_alt(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg) { struct fib6_result *res = arg->result; struct flowi6 *flp6 = &flp->u.ip6; struct net *net = rule->fr_net; struct fib6_table *table; int err, *oif; u32 tb_id; switch (rule->action) { case FR_ACT_TO_TBL: break; case FR_ACT_UNREACHABLE: return -ENETUNREACH; case FR_ACT_PROHIBIT: return -EACCES; case FR_ACT_BLACKHOLE: default: return -EINVAL; } tb_id = fib_rule_get_table(rule, arg); table = fib6_get_table(net, tb_id); if (!table) return -EAGAIN; oif = (int *)arg->lookup_data; err = fib6_table_lookup(net, table, *oif, flp6, res, flags); if (!err && res->f6i != net->ipv6.fib6_null_entry) err = fib6_rule_saddr(net, rule, flags, flp6, res->nh->fib_nh_dev); else err = -EAGAIN; return err; } static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg) { struct fib6_result *res = arg->result; struct flowi6 *flp6 = &flp->u.ip6; struct rt6_info *rt = NULL; struct fib6_table *table; struct net *net = rule->fr_net; pol_lookup_t lookup = arg->lookup_ptr; int err = 0; u32 tb_id; switch (rule->action) { case FR_ACT_TO_TBL: break; case FR_ACT_UNREACHABLE: err = -ENETUNREACH; rt = net->ipv6.ip6_null_entry; goto discard_pkt; default: case FR_ACT_BLACKHOLE: err = -EINVAL; rt = net->ipv6.ip6_blk_hole_entry; goto discard_pkt; case FR_ACT_PROHIBIT: err = -EACCES; rt = net->ipv6.ip6_prohibit_entry; goto discard_pkt; } tb_id = fib_rule_get_table(rule, arg); table = fib6_get_table(net, tb_id); if (!table) { err = -EAGAIN; goto out; } rt = pol_lookup_func(lookup, net, table, flp6, arg->lookup_data, flags); if (rt != net->ipv6.ip6_null_entry) { err = fib6_rule_saddr(net, rule, flags, flp6, ip6_dst_idev(&rt->dst)->dev); if (err == -EAGAIN) goto again; err = rt->dst.error; if (err != -EAGAIN) goto out; } again: ip6_rt_put_flags(rt, flags); err = -EAGAIN; rt = NULL; goto out; discard_pkt: if (!(flags & RT6_LOOKUP_F_DST_NOREF)) dst_hold(&rt->dst); out: res->rt6 = rt; return err; } INDIRECT_CALLABLE_SCOPE int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg) { if (arg->lookup_ptr == fib6_table_lookup) return fib6_rule_action_alt(rule, flp, flags, arg); return __fib6_rule_action(rule, flp, flags, arg); } INDIRECT_CALLABLE_SCOPE bool fib6_rule_suppress(struct fib_rule *rule, int flags, struct fib_lookup_arg *arg) { struct fib6_result *res = arg->result; struct rt6_info *rt = res->rt6; struct net_device *dev = NULL; if (!rt) return false; if (rt->rt6i_idev) dev = rt->rt6i_idev->dev; /* do not accept result if the route does * not meet the required prefix length */ if (rt->rt6i_dst.plen <= rule->suppress_prefixlen) goto suppress_route; /* do not accept result if the route uses a device * belonging to a forbidden interface group */ if (rule->suppress_ifgroup != -1 && dev && dev->group == rule->suppress_ifgroup) goto suppress_route; return false; suppress_route: ip6_rt_put_flags(rt, flags); return true; } INDIRECT_CALLABLE_SCOPE int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) { struct fib6_rule *r = (struct fib6_rule *) rule; struct flowi6 *fl6 = &fl->u.ip6; if (r->dst.plen && !ipv6_prefix_equal(&fl6->daddr, &r->dst.addr, r->dst.plen)) return 0; /* * If FIB_RULE_FIND_SADDR is set and we do not have a * source address for the traffic, we defer check for * source address. */ if (r->src.plen) { if (flags & RT6_LOOKUP_F_HAS_SADDR) { if (!ipv6_prefix_equal(&fl6->saddr, &r->src.addr, r->src.plen)) return 0; } else if (!(r->common.flags & FIB_RULE_FIND_SADDR)) return 0; } if (r->dscp && r->dscp != ip6_dscp(fl6->flowlabel)) return 0; if (rule->ip_proto && (rule->ip_proto != fl6->flowi6_proto)) return 0; if (fib_rule_port_range_set(&rule->sport_range) && !fib_rule_port_inrange(&rule->sport_range, fl6->fl6_sport)) return 0; if (fib_rule_port_range_set(&rule->dport_range) && !fib_rule_port_inrange(&rule->dport_range, fl6->fl6_dport)) return 0; return 1; } static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh, struct nlattr **tb, struct netlink_ext_ack *extack) { int err = -EINVAL; struct net *net = sock_net(skb->sk); struct fib6_rule *rule6 = (struct fib6_rule *) rule; if (!inet_validate_dscp(frh->tos)) { NL_SET_ERR_MSG(extack, "Invalid dsfield (tos): ECN bits must be 0"); goto errout; } rule6->dscp = inet_dsfield_to_dscp(frh->tos); if (rule->action == FR_ACT_TO_TBL && !rule->l3mdev) { if (rule->table == RT6_TABLE_UNSPEC) { NL_SET_ERR_MSG(extack, "Invalid table"); goto errout; } if (fib6_new_table(net, rule->table) == NULL) { err = -ENOBUFS; goto errout; } } if (frh->src_len) rule6->src.addr = nla_get_in6_addr(tb[FRA_SRC]); if (frh->dst_len) rule6->dst.addr = nla_get_in6_addr(tb[FRA_DST]); rule6->src.plen = frh->src_len; rule6->dst.plen = frh->dst_len; if (fib_rule_requires_fldissect(rule)) net->ipv6.fib6_rules_require_fldissect++; net->ipv6.fib6_has_custom_rules = true; err = 0; errout: return err; } static int fib6_rule_delete(struct fib_rule *rule) { struct net *net = rule->fr_net; if (net->ipv6.fib6_rules_require_fldissect && fib_rule_requires_fldissect(rule)) net->ipv6.fib6_rules_require_fldissect--; return 0; } static int fib6_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, struct nlattr **tb) { struct fib6_rule *rule6 = (struct fib6_rule *) rule; if (frh->src_len && (rule6->src.plen != frh->src_len)) return 0; if (frh->dst_len && (rule6->dst.plen != frh->dst_len)) return 0; if (frh->tos && inet_dscp_to_dsfield(rule6->dscp) != frh->tos) return 0; if (frh->src_len && nla_memcmp(tb[FRA_SRC], &rule6->src.addr, sizeof(struct in6_addr))) return 0; if (frh->dst_len && nla_memcmp(tb[FRA_DST], &rule6->dst.addr, sizeof(struct in6_addr))) return 0; return 1; } static int fib6_rule_fill(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh) { struct fib6_rule *rule6 = (struct fib6_rule *) rule; frh->dst_len = rule6->dst.plen; frh->src_len = rule6->src.plen; frh->tos = inet_dscp_to_dsfield(rule6->dscp); if ((rule6->dst.plen && nla_put_in6_addr(skb, FRA_DST, &rule6->dst.addr)) || (rule6->src.plen && nla_put_in6_addr(skb, FRA_SRC, &rule6->src.addr))) goto nla_put_failure; return 0; nla_put_failure: return -ENOBUFS; } static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule) { return nla_total_size(16) /* dst */ + nla_total_size(16); /* src */ } static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = { .family = AF_INET6, .rule_size = sizeof(struct fib6_rule), .addr_size = sizeof(struct in6_addr), .action = fib6_rule_action, .match = fib6_rule_match, .suppress = fib6_rule_suppress, .configure = fib6_rule_configure, .delete = fib6_rule_delete, .compare = fib6_rule_compare, .fill = fib6_rule_fill, .nlmsg_payload = fib6_rule_nlmsg_payload, .nlgroup = RTNLGRP_IPV6_RULE, .owner = THIS_MODULE, .fro_net = &init_net, }; static int __net_init fib6_rules_net_init(struct net *net) { struct fib_rules_ops *ops; int err; ops = fib_rules_register(&fib6_rules_ops_template, net); if (IS_ERR(ops)) return PTR_ERR(ops); err = fib_default_rule_add(ops, 0, RT6_TABLE_LOCAL, 0); if (err) goto out_fib6_rules_ops; err = fib_default_rule_add(ops, 0x7FFE, RT6_TABLE_MAIN, 0); if (err) goto out_fib6_rules_ops; net->ipv6.fib6_rules_ops = ops; net->ipv6.fib6_rules_require_fldissect = 0; out: return err; out_fib6_rules_ops: fib_rules_unregister(ops); goto out; } static void __net_exit fib6_rules_net_exit_batch(struct list_head *net_list) { struct net *net; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) { fib_rules_unregister(net->ipv6.fib6_rules_ops); cond_resched(); } rtnl_unlock(); } static struct pernet_operations fib6_rules_net_ops = { .init = fib6_rules_net_init, .exit_batch = fib6_rules_net_exit_batch, }; int __init fib6_rules_init(void) { return register_pernet_subsys(&fib6_rules_net_ops); } void fib6_rules_cleanup(void) { unregister_pernet_subsys(&fib6_rules_net_ops); }
2777 7740 15 7746 1424 26 1424 1898 6837 219 6839 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PERCPU_RWSEM_H #define _LINUX_PERCPU_RWSEM_H #include <linux/atomic.h> #include <linux/percpu.h> #include <linux/rcuwait.h> #include <linux/wait.h> #include <linux/rcu_sync.h> #include <linux/lockdep.h> struct percpu_rw_semaphore { struct rcu_sync rss; unsigned int __percpu *read_count; struct rcuwait writer; wait_queue_head_t waiters; atomic_t block; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif }; #ifdef CONFIG_DEBUG_LOCK_ALLOC #define __PERCPU_RWSEM_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }, #else #define __PERCPU_RWSEM_DEP_MAP_INIT(lockname) #endif #define __DEFINE_PERCPU_RWSEM(name, is_static) \ static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_rc_##name); \ is_static struct percpu_rw_semaphore name = { \ .rss = __RCU_SYNC_INITIALIZER(name.rss), \ .read_count = &__percpu_rwsem_rc_##name, \ .writer = __RCUWAIT_INITIALIZER(name.writer), \ .waiters = __WAIT_QUEUE_HEAD_INITIALIZER(name.waiters), \ .block = ATOMIC_INIT(0), \ __PERCPU_RWSEM_DEP_MAP_INIT(name) \ } #define DEFINE_PERCPU_RWSEM(name) \ __DEFINE_PERCPU_RWSEM(name, /* not static */) #define DEFINE_STATIC_PERCPU_RWSEM(name) \ __DEFINE_PERCPU_RWSEM(name, static) extern bool __percpu_down_read(struct percpu_rw_semaphore *, bool); static inline void percpu_down_read(struct percpu_rw_semaphore *sem) { might_sleep(); rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); preempt_disable(); /* * We are in an RCU-sched read-side critical section, so the writer * cannot both change sem->state from readers_fast and start checking * counters while we are here. So if we see !sem->state, we know that * the writer won't be checking until we're past the preempt_enable() * and that once the synchronize_rcu() is done, the writer will see * anything we did within this RCU-sched read-size critical section. */ if (likely(rcu_sync_is_idle(&sem->rss))) this_cpu_inc(*sem->read_count); else __percpu_down_read(sem, false); /* Unconditional memory barrier */ /* * The preempt_enable() prevents the compiler from * bleeding the critical section out. */ preempt_enable(); } static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem) { bool ret = true; preempt_disable(); /* * Same as in percpu_down_read(). */ if (likely(rcu_sync_is_idle(&sem->rss))) this_cpu_inc(*sem->read_count); else ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */ preempt_enable(); /* * The barrier() from preempt_enable() prevents the compiler from * bleeding the critical section out. */ if (ret) rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); return ret; } static inline void percpu_up_read(struct percpu_rw_semaphore *sem) { rwsem_release(&sem->dep_map, _RET_IP_); preempt_disable(); /* * Same as in percpu_down_read(). */ if (likely(rcu_sync_is_idle(&sem->rss))) { this_cpu_dec(*sem->read_count); } else { /* * slowpath; reader will only ever wake a single blocked * writer. */ smp_mb(); /* B matches C */ /* * In other words, if they see our decrement (presumably to * aggregate zero, as that is the only time it matters) they * will also see our critical section. */ this_cpu_dec(*sem->read_count); rcuwait_wake_up(&sem->writer); } preempt_enable(); } extern bool percpu_is_read_locked(struct percpu_rw_semaphore *); extern void percpu_down_write(struct percpu_rw_semaphore *); extern void percpu_up_write(struct percpu_rw_semaphore *); static inline bool percpu_is_write_locked(struct percpu_rw_semaphore *sem) { return atomic_read(&sem->block); } extern int __percpu_init_rwsem(struct percpu_rw_semaphore *, const char *, struct lock_class_key *); extern void percpu_free_rwsem(struct percpu_rw_semaphore *); #define percpu_init_rwsem(sem) \ ({ \ static struct lock_class_key rwsem_key; \ __percpu_init_rwsem(sem, #sem, &rwsem_key); \ }) #define percpu_rwsem_is_held(sem) lockdep_is_held(sem) #define percpu_rwsem_assert_held(sem) lockdep_assert_held(sem) static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem, bool read, unsigned long ip) { lock_release(&sem->dep_map, ip); } static inline void percpu_rwsem_acquire(struct percpu_rw_semaphore *sem, bool read, unsigned long ip) { lock_acquire(&sem->dep_map, 0, 1, read, 1, NULL, ip); } #endif
478 19 1 378 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * * This file is part of the SCTP kernel implementation * * These functions implement the SCTP primitive functions from Section 10. * * Note that the descriptions from the specification are USER level * functions--this file is the functions which populate the struct proto * for SCTP which is the BOTTOM of the sockets interface. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Narasimha Budihal <narasimha@refcode.org> * Karl Knutson <karl@athena.chicago.il.us> * Ardelle Fan <ardelle.fan@intel.com> * Kevin Gao <kevin.gao@intel.com> */ #include <linux/types.h> #include <linux/list.h> /* For struct list_head */ #include <linux/socket.h> #include <linux/ip.h> #include <linux/time.h> /* For struct timeval */ #include <linux/gfp.h> #include <net/sock.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #define DECLARE_PRIMITIVE(name) \ /* This is called in the code as sctp_primitive_ ## name. */ \ int sctp_primitive_ ## name(struct net *net, struct sctp_association *asoc, \ void *arg) { \ int error = 0; \ enum sctp_event_type event_type; union sctp_subtype subtype; \ enum sctp_state state; \ struct sctp_endpoint *ep; \ \ event_type = SCTP_EVENT_T_PRIMITIVE; \ subtype = SCTP_ST_PRIMITIVE(SCTP_PRIMITIVE_ ## name); \ state = asoc ? asoc->state : SCTP_STATE_CLOSED; \ ep = asoc ? asoc->ep : NULL; \ \ error = sctp_do_sm(net, event_type, subtype, state, ep, asoc, \ arg, GFP_KERNEL); \ return error; \ } /* 10.1 ULP-to-SCTP * B) Associate * * Format: ASSOCIATE(local SCTP instance name, destination transport addr, * outbound stream count) * -> association id [,destination transport addr list] [,outbound stream * count] * * This primitive allows the upper layer to initiate an association to a * specific peer endpoint. * * This version assumes that asoc is fully populated with the initial * parameters. We then return a traditional kernel indicator of * success or failure. */ /* This is called in the code as sctp_primitive_ASSOCIATE. */ DECLARE_PRIMITIVE(ASSOCIATE) /* 10.1 ULP-to-SCTP * C) Shutdown * * Format: SHUTDOWN(association id) * -> result * * Gracefully closes an association. Any locally queued user data * will be delivered to the peer. The association will be terminated only * after the peer acknowledges all the SCTP packets sent. A success code * will be returned on successful termination of the association. If * attempting to terminate the association results in a failure, an error * code shall be returned. */ DECLARE_PRIMITIVE(SHUTDOWN); /* 10.1 ULP-to-SCTP * C) Abort * * Format: Abort(association id [, cause code]) * -> result * * Ungracefully closes an association. Any locally queued user data * will be discarded and an ABORT chunk is sent to the peer. A success * code will be returned on successful abortion of the association. If * attempting to abort the association results in a failure, an error * code shall be returned. */ DECLARE_PRIMITIVE(ABORT); /* 10.1 ULP-to-SCTP * E) Send * * Format: SEND(association id, buffer address, byte count [,context] * [,stream id] [,life time] [,destination transport address] * [,unorder flag] [,no-bundle flag] [,payload protocol-id] ) * -> result * * This is the main method to send user data via SCTP. * * Mandatory attributes: * * o association id - local handle to the SCTP association * * o buffer address - the location where the user message to be * transmitted is stored; * * o byte count - The size of the user data in number of bytes; * * Optional attributes: * * o context - an optional 32 bit integer that will be carried in the * sending failure notification to the ULP if the transportation of * this User Message fails. * * o stream id - to indicate which stream to send the data on. If not * specified, stream 0 will be used. * * o life time - specifies the life time of the user data. The user data * will not be sent by SCTP after the life time expires. This * parameter can be used to avoid efforts to transmit stale * user messages. SCTP notifies the ULP if the data cannot be * initiated to transport (i.e. sent to the destination via SCTP's * send primitive) within the life time variable. However, the * user data will be transmitted if SCTP has attempted to transmit a * chunk before the life time expired. * * o destination transport address - specified as one of the destination * transport addresses of the peer endpoint to which this packet * should be sent. Whenever possible, SCTP should use this destination * transport address for sending the packets, instead of the current * primary path. * * o unorder flag - this flag, if present, indicates that the user * would like the data delivered in an unordered fashion to the peer * (i.e., the U flag is set to 1 on all DATA chunks carrying this * message). * * o no-bundle flag - instructs SCTP not to bundle this user data with * other outbound DATA chunks. SCTP MAY still bundle even when * this flag is present, when faced with network congestion. * * o payload protocol-id - A 32 bit unsigned integer that is to be * passed to the peer indicating the type of payload protocol data * being transmitted. This value is passed as opaque data by SCTP. */ DECLARE_PRIMITIVE(SEND); /* 10.1 ULP-to-SCTP * J) Request Heartbeat * * Format: REQUESTHEARTBEAT(association id, destination transport address) * * -> result * * Instructs the local endpoint to perform a HeartBeat on the specified * destination transport address of the given association. The returned * result should indicate whether the transmission of the HEARTBEAT * chunk to the destination address is successful. * * Mandatory attributes: * * o association id - local handle to the SCTP association * * o destination transport address - the transport address of the * association on which a heartbeat should be issued. */ DECLARE_PRIMITIVE(REQUESTHEARTBEAT); /* ADDIP * 3.1.1 Address Configuration Change Chunk (ASCONF) * * This chunk is used to communicate to the remote endpoint one of the * configuration change requests that MUST be acknowledged. The * information carried in the ASCONF Chunk uses the form of a * Type-Length-Value (TLV), as described in "3.2.1 Optional/ * Variable-length Parameter Format" in RFC2960 [5], forall variable * parameters. */ DECLARE_PRIMITIVE(ASCONF); /* RE-CONFIG 5.1 */ DECLARE_PRIMITIVE(RECONF);
11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 /* SPDX-License-Identifier: GPL-2.0 */ /* * IRQ subsystem internal functions and variables: * * Do not ever include this file from anything else than * kernel/irq/. Do not even think about using any information outside * of this file for your non core code. */ #include <linux/irqdesc.h> #include <linux/kernel_stat.h> #include <linux/pm_runtime.h> #include <linux/sched/clock.h> #ifdef CONFIG_SPARSE_IRQ # define MAX_SPARSE_IRQS INT_MAX #else # define MAX_SPARSE_IRQS NR_IRQS #endif #define istate core_internal_state__do_not_mess_with_it extern bool noirqdebug; extern struct irqaction chained_action; /* * Bits used by threaded handlers: * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed * IRQTF_AFFINITY - irq thread is requested to adjust affinity * IRQTF_FORCED_THREAD - irq action is force threaded * IRQTF_READY - signals that irq thread is ready */ enum { IRQTF_RUNTHREAD, IRQTF_WARNED, IRQTF_AFFINITY, IRQTF_FORCED_THREAD, IRQTF_READY, }; /* * Bit masks for desc->core_internal_state__do_not_mess_with_it * * IRQS_AUTODETECT - autodetection in progress * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt * detection * IRQS_POLL_INPROGRESS - polling in progress * IRQS_ONESHOT - irq is not unmasked in primary handler * IRQS_REPLAY - irq has been resent and will not be resent * again until the handler has run and cleared * this flag. * IRQS_WAITING - irq is waiting * IRQS_PENDING - irq needs to be resent and should be resent * at the next available opportunity. * IRQS_SUSPENDED - irq is suspended * IRQS_NMI - irq line is used to deliver NMIs * IRQS_SYSFS - descriptor has been added to sysfs */ enum { IRQS_AUTODETECT = 0x00000001, IRQS_SPURIOUS_DISABLED = 0x00000002, IRQS_POLL_INPROGRESS = 0x00000008, IRQS_ONESHOT = 0x00000020, IRQS_REPLAY = 0x00000040, IRQS_WAITING = 0x00000080, IRQS_PENDING = 0x00000200, IRQS_SUSPENDED = 0x00000800, IRQS_TIMINGS = 0x00001000, IRQS_NMI = 0x00002000, IRQS_SYSFS = 0x00004000, }; #include "debug.h" #include "settings.h" extern int __irq_set_trigger(struct irq_desc *desc, unsigned long flags); extern void __disable_irq(struct irq_desc *desc); extern void __enable_irq(struct irq_desc *desc); #define IRQ_RESEND true #define IRQ_NORESEND false #define IRQ_START_FORCE true #define IRQ_START_COND false extern int irq_activate(struct irq_desc *desc); extern int irq_activate_and_startup(struct irq_desc *desc, bool resend); extern int irq_startup(struct irq_desc *desc, bool resend, bool force); extern void irq_shutdown(struct irq_desc *desc); extern void irq_shutdown_and_deactivate(struct irq_desc *desc); extern void irq_enable(struct irq_desc *desc); extern void irq_disable(struct irq_desc *desc); extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu); extern void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu); extern void mask_irq(struct irq_desc *desc); extern void unmask_irq(struct irq_desc *desc); extern void unmask_threaded_irq(struct irq_desc *desc); #ifdef CONFIG_SPARSE_IRQ static inline void irq_mark_irq(unsigned int irq) { } #else extern void irq_mark_irq(unsigned int irq); #endif extern int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, bool *state); irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc); irqreturn_t handle_irq_event_percpu(struct irq_desc *desc); irqreturn_t handle_irq_event(struct irq_desc *desc); /* Resending of interrupts :*/ int check_irq_resend(struct irq_desc *desc, bool inject); void clear_irq_resend(struct irq_desc *desc); void irq_resend_init(struct irq_desc *desc); bool irq_wait_for_poll(struct irq_desc *desc); void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action); void wake_threads_waitq(struct irq_desc *desc); #ifdef CONFIG_PROC_FS extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); extern void unregister_irq_proc(unsigned int irq, struct irq_desc *desc); extern void register_handler_proc(unsigned int irq, struct irqaction *action); extern void unregister_handler_proc(unsigned int irq, struct irqaction *action); #else static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { } static inline void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) { } static inline void register_handler_proc(unsigned int irq, struct irqaction *action) { } static inline void unregister_handler_proc(unsigned int irq, struct irqaction *action) { } #endif extern bool irq_can_set_affinity_usr(unsigned int irq); extern void irq_set_thread_affinity(struct irq_desc *desc); extern int irq_do_set_affinity(struct irq_data *data, const struct cpumask *dest, bool force); #ifdef CONFIG_SMP extern int irq_setup_affinity(struct irq_desc *desc); #else static inline int irq_setup_affinity(struct irq_desc *desc) { return 0; } #endif /* Inline functions for support of irq chips on slow busses */ static inline void chip_bus_lock(struct irq_desc *desc) { if (unlikely(desc->irq_data.chip->irq_bus_lock)) desc->irq_data.chip->irq_bus_lock(&desc->irq_data); } static inline void chip_bus_sync_unlock(struct irq_desc *desc) { if (unlikely(desc->irq_data.chip->irq_bus_sync_unlock)) desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); } #define _IRQ_DESC_CHECK (1 << 0) #define _IRQ_DESC_PERCPU (1 << 1) #define IRQ_GET_DESC_CHECK_GLOBAL (_IRQ_DESC_CHECK) #define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU) #define for_each_action_of_desc(desc, act) \ for (act = desc->action; act; act = act->next) struct irq_desc * __irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, unsigned int check); void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus); static inline struct irq_desc * irq_get_desc_buslock(unsigned int irq, unsigned long *flags, unsigned int check) { return __irq_get_desc_lock(irq, flags, true, check); } static inline void irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags) { __irq_put_desc_unlock(desc, flags, true); } static inline struct irq_desc * irq_get_desc_lock(unsigned int irq, unsigned long *flags, unsigned int check) { return __irq_get_desc_lock(irq, flags, false, check); } static inline void irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags) { __irq_put_desc_unlock(desc, flags, false); } #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) static inline unsigned int irqd_get(struct irq_data *d) { return __irqd_to_state(d); } /* * Manipulation functions for irq_data.state */ static inline void irqd_set_move_pending(struct irq_data *d) { __irqd_to_state(d) |= IRQD_SETAFFINITY_PENDING; } static inline void irqd_clr_move_pending(struct irq_data *d) { __irqd_to_state(d) &= ~IRQD_SETAFFINITY_PENDING; } static inline void irqd_set_managed_shutdown(struct irq_data *d) { __irqd_to_state(d) |= IRQD_MANAGED_SHUTDOWN; } static inline void irqd_clr_managed_shutdown(struct irq_data *d) { __irqd_to_state(d) &= ~IRQD_MANAGED_SHUTDOWN; } static inline void irqd_clear(struct irq_data *d, unsigned int mask) { __irqd_to_state(d) &= ~mask; } static inline void irqd_set(struct irq_data *d, unsigned int mask) { __irqd_to_state(d) |= mask; } static inline bool irqd_has_set(struct irq_data *d, unsigned int mask) { return __irqd_to_state(d) & mask; } static inline void irq_state_set_disabled(struct irq_desc *desc) { irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); } static inline void irq_state_set_masked(struct irq_desc *desc) { irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); } #undef __irqd_to_state static inline void __kstat_incr_irqs_this_cpu(struct irq_desc *desc) { __this_cpu_inc(*desc->kstat_irqs); __this_cpu_inc(kstat.irqs_sum); } static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) { __kstat_incr_irqs_this_cpu(desc); desc->tot_count++; } static inline int irq_desc_get_node(struct irq_desc *desc) { return irq_common_data_get_node(&desc->irq_common_data); } static inline int irq_desc_is_chained(struct irq_desc *desc) { return (desc->action && desc->action == &chained_action); } #ifdef CONFIG_PM_SLEEP bool irq_pm_check_wakeup(struct irq_desc *desc); void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action); void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action); #else static inline bool irq_pm_check_wakeup(struct irq_desc *desc) { return false; } static inline void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { } static inline void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { } #endif #ifdef CONFIG_IRQ_TIMINGS #define IRQ_TIMINGS_SHIFT 5 #define IRQ_TIMINGS_SIZE (1 << IRQ_TIMINGS_SHIFT) #define IRQ_TIMINGS_MASK (IRQ_TIMINGS_SIZE - 1) /** * struct irq_timings - irq timings storing structure * @values: a circular buffer of u64 encoded <timestamp,irq> values * @count: the number of elements in the array */ struct irq_timings { u64 values[IRQ_TIMINGS_SIZE]; int count; }; DECLARE_PER_CPU(struct irq_timings, irq_timings); extern void irq_timings_free(int irq); extern int irq_timings_alloc(int irq); static inline void irq_remove_timings(struct irq_desc *desc) { desc->istate &= ~IRQS_TIMINGS; irq_timings_free(irq_desc_get_irq(desc)); } static inline void irq_setup_timings(struct irq_desc *desc, struct irqaction *act) { int irq = irq_desc_get_irq(desc); int ret; /* * We don't need the measurement because the idle code already * knows the next expiry event. */ if (act->flags & __IRQF_TIMER) return; /* * In case the timing allocation fails, we just want to warn, * not fail, so letting the system boot anyway. */ ret = irq_timings_alloc(irq); if (ret) { pr_warn("Failed to allocate irq timing stats for irq%d (%d)", irq, ret); return; } desc->istate |= IRQS_TIMINGS; } extern void irq_timings_enable(void); extern void irq_timings_disable(void); DECLARE_STATIC_KEY_FALSE(irq_timing_enabled); /* * The interrupt number and the timestamp are encoded into a single * u64 variable to optimize the size. * 48 bit time stamp and 16 bit IRQ number is way sufficient. * Who cares an IRQ after 78 hours of idle time? */ static inline u64 irq_timing_encode(u64 timestamp, int irq) { return (timestamp << 16) | irq; } static inline int irq_timing_decode(u64 value, u64 *timestamp) { *timestamp = value >> 16; return value & U16_MAX; } static __always_inline void irq_timings_push(u64 ts, int irq) { struct irq_timings *timings = this_cpu_ptr(&irq_timings); timings->values[timings->count & IRQ_TIMINGS_MASK] = irq_timing_encode(ts, irq); timings->count++; } /* * The function record_irq_time is only called in one place in the * interrupts handler. We want this function always inline so the code * inside is embedded in the function and the static key branching * code can act at the higher level. Without the explicit * __always_inline we can end up with a function call and a small * overhead in the hotpath for nothing. */ static __always_inline void record_irq_time(struct irq_desc *desc) { if (!static_branch_likely(&irq_timing_enabled)) return; if (desc->istate & IRQS_TIMINGS) irq_timings_push(local_clock(), irq_desc_get_irq(desc)); } #else static inline void irq_remove_timings(struct irq_desc *desc) {} static inline void irq_setup_timings(struct irq_desc *desc, struct irqaction *act) {}; static inline void record_irq_time(struct irq_desc *desc) {} #endif /* CONFIG_IRQ_TIMINGS */ #ifdef CONFIG_GENERIC_IRQ_CHIP void irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, int num_ct, unsigned int irq_base, void __iomem *reg_base, irq_flow_handler_t handler); #else static inline void irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, int num_ct, unsigned int irq_base, void __iomem *reg_base, irq_flow_handler_t handler) { } #endif /* CONFIG_GENERIC_IRQ_CHIP */ #ifdef CONFIG_GENERIC_PENDING_IRQ static inline bool irq_can_move_pcntxt(struct irq_data *data) { return irqd_can_move_in_process_context(data); } static inline bool irq_move_pending(struct irq_data *data) { return irqd_is_setaffinity_pending(data); } static inline void irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { cpumask_copy(desc->pending_mask, mask); } static inline void irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { cpumask_copy(mask, desc->pending_mask); } static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc) { return desc->pending_mask; } static inline bool handle_enforce_irqctx(struct irq_data *data) { return irqd_is_handle_enforce_irqctx(data); } bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear); #else /* CONFIG_GENERIC_PENDING_IRQ */ static inline bool irq_can_move_pcntxt(struct irq_data *data) { return true; } static inline bool irq_move_pending(struct irq_data *data) { return false; } static inline void irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { } static inline void irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc) { return NULL; } static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear) { return false; } static inline bool handle_enforce_irqctx(struct irq_data *data) { return false; } #endif /* !CONFIG_GENERIC_PENDING_IRQ */ #if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY) static inline int irq_domain_activate_irq(struct irq_data *data, bool reserve) { irqd_set_activated(data); return 0; } static inline void irq_domain_deactivate_irq(struct irq_data *data) { irqd_clr_activated(data); } #endif static inline struct irq_data *irqd_get_parent_data(struct irq_data *irqd) { #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY return irqd->parent_data; #else return NULL; #endif } #ifdef CONFIG_GENERIC_IRQ_DEBUGFS #include <linux/debugfs.h> void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc); static inline void irq_remove_debugfs_entry(struct irq_desc *desc) { debugfs_remove(desc->debugfs_file); kfree(desc->dev_name); } void irq_debugfs_copy_devname(int irq, struct device *dev); # ifdef CONFIG_IRQ_DOMAIN void irq_domain_debugfs_init(struct dentry *root); # else static inline void irq_domain_debugfs_init(struct dentry *root) { } # endif #else /* CONFIG_GENERIC_IRQ_DEBUGFS */ static inline void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *d) { } static inline void irq_remove_debugfs_entry(struct irq_desc *d) { } static inline void irq_debugfs_copy_devname(int irq, struct device *dev) { } #endif /* CONFIG_GENERIC_IRQ_DEBUGFS */
16945 16949 16950 11299 11306 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 // SPDX-License-Identifier: GPL-2.0 /* * trace context switch * * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com> * */ #include <linux/module.h> #include <linux/kallsyms.h> #include <linux/uaccess.h> #include <linux/ftrace.h> #include <trace/events/sched.h> #include "trace.h" #define RECORD_CMDLINE 1 #define RECORD_TGID 2 static int sched_cmdline_ref; static int sched_tgid_ref; static DEFINE_MUTEX(sched_register_mutex); static void probe_sched_switch(void *ignore, bool preempt, struct task_struct *prev, struct task_struct *next, unsigned int prev_state) { int flags; flags = (RECORD_TGID * !!sched_tgid_ref) + (RECORD_CMDLINE * !!sched_cmdline_ref); if (!flags) return; tracing_record_taskinfo_sched_switch(prev, next, flags); } static void probe_sched_wakeup(void *ignore, struct task_struct *wakee) { int flags; flags = (RECORD_TGID * !!sched_tgid_ref) + (RECORD_CMDLINE * !!sched_cmdline_ref); if (!flags) return; tracing_record_taskinfo_sched_switch(current, wakee, flags); } static int tracing_sched_register(void) { int ret; ret = register_trace_sched_wakeup(probe_sched_wakeup, NULL); if (ret) { pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup\n"); return ret; } ret = register_trace_sched_wakeup_new(probe_sched_wakeup, NULL); if (ret) { pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup_new\n"); goto fail_deprobe; } ret = register_trace_sched_switch(probe_sched_switch, NULL); if (ret) { pr_info("sched trace: Couldn't activate tracepoint" " probe to kernel_sched_switch\n"); goto fail_deprobe_wake_new; } return ret; fail_deprobe_wake_new: unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL); fail_deprobe: unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); return ret; } static void tracing_sched_unregister(void) { unregister_trace_sched_switch(probe_sched_switch, NULL); unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL); unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); } static void tracing_start_sched_switch(int ops) { bool sched_register; mutex_lock(&sched_register_mutex); sched_register = (!sched_cmdline_ref && !sched_tgid_ref); switch (ops) { case RECORD_CMDLINE: sched_cmdline_ref++; break; case RECORD_TGID: sched_tgid_ref++; break; } if (sched_register && (sched_cmdline_ref || sched_tgid_ref)) tracing_sched_register(); mutex_unlock(&sched_register_mutex); } static void tracing_stop_sched_switch(int ops) { mutex_lock(&sched_register_mutex); switch (ops) { case RECORD_CMDLINE: sched_cmdline_ref--; break; case RECORD_TGID: sched_tgid_ref--; break; } if (!sched_cmdline_ref && !sched_tgid_ref) tracing_sched_unregister(); mutex_unlock(&sched_register_mutex); } void tracing_start_cmdline_record(void) { tracing_start_sched_switch(RECORD_CMDLINE); } void tracing_stop_cmdline_record(void) { tracing_stop_sched_switch(RECORD_CMDLINE); } void tracing_start_tgid_record(void) { tracing_start_sched_switch(RECORD_TGID); } void tracing_stop_tgid_record(void) { tracing_stop_sched_switch(RECORD_TGID); }
3014 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 // SPDX-License-Identifier: GPL-2.0-only /* * scsi.c Copyright (C) 1992 Drew Eckhardt * Copyright (C) 1993, 1994, 1995, 1999 Eric Youngdale * Copyright (C) 2002, 2003 Christoph Hellwig * * generic mid-level SCSI driver * Initial versions: Drew Eckhardt * Subsequent revisions: Eric Youngdale * * <drew@colorado.edu> * * Bug correction thanks go to : * Rik Faith <faith@cs.unc.edu> * Tommy Thorn <tthorn> * Thomas Wuensche <tw@fgb1.fgb.mw.tu-muenchen.de> * * Modified by Eric Youngdale eric@andante.org or ericy@gnu.ai.mit.edu to * add scatter-gather, multiple outstanding request, and other * enhancements. * * Native multichannel, wide scsi, /proc/scsi and hot plugging * support added by Michael Neuffer <mike@i-connect.net> * * Added request_module("scsi_hostadapter") for kerneld: * (Put an "alias scsi_hostadapter your_hostadapter" in /etc/modprobe.conf) * Bjorn Ekwall <bj0rn@blox.se> * (changed to kmod) * * Major improvements to the timeout, abort, and reset processing, * as well as performance modifications for large queue depths by * Leonard N. Zubkoff <lnz@dandelion.com> * * Converted cli() code to spinlocks, Ingo Molnar * * Jiffies wrap fixes (host->resetting), 3 Dec 1998 Andrea Arcangeli * * out_of_space hacks, D. Gilbert (dpg) 990608 */ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/kernel.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/delay.h> #include <linux/init.h> #include <linux/completion.h> #include <linux/unistd.h> #include <linux/spinlock.h> #include <linux/kmod.h> #include <linux/interrupt.h> #include <linux/notifier.h> #include <linux/cpu.h> #include <linux/mutex.h> #include <asm/unaligned.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_dbg.h> #include <scsi/scsi_device.h> #include <scsi/scsi_driver.h> #include <scsi/scsi_eh.h> #include <scsi/scsi_host.h> #include <scsi/scsi_tcq.h> #include "scsi_priv.h" #include "scsi_logging.h" #define CREATE_TRACE_POINTS #include <trace/events/scsi.h> /* * Definitions and constants. */ /* * Note - the initial logging level can be set here to log events at boot time. * After the system is up, you may enable logging via the /proc interface. */ unsigned int scsi_logging_level; #if defined(CONFIG_SCSI_LOGGING) EXPORT_SYMBOL(scsi_logging_level); #endif #ifdef CONFIG_SCSI_LOGGING void scsi_log_send(struct scsi_cmnd *cmd) { unsigned int level; /* * If ML QUEUE log level is greater than or equal to: * * 1: nothing (match completion) * * 2: log opcode + command of all commands + cmd address * * 3: same as 2 * * 4: same as 3 */ if (unlikely(scsi_logging_level)) { level = SCSI_LOG_LEVEL(SCSI_LOG_MLQUEUE_SHIFT, SCSI_LOG_MLQUEUE_BITS); if (level > 1) { scmd_printk(KERN_INFO, cmd, "Send: scmd 0x%p\n", cmd); scsi_print_command(cmd); } } } void scsi_log_completion(struct scsi_cmnd *cmd, int disposition) { unsigned int level; /* * If ML COMPLETE log level is greater than or equal to: * * 1: log disposition, result, opcode + command, and conditionally * sense data for failures or non SUCCESS dispositions. * * 2: same as 1 but for all command completions. * * 3: same as 2 * * 4: same as 3 plus dump extra junk */ if (unlikely(scsi_logging_level)) { level = SCSI_LOG_LEVEL(SCSI_LOG_MLCOMPLETE_SHIFT, SCSI_LOG_MLCOMPLETE_BITS); if (((level > 0) && (cmd->result || disposition != SUCCESS)) || (level > 1)) { scsi_print_result(cmd, "Done", disposition); scsi_print_command(cmd); if (scsi_status_is_check_condition(cmd->result)) scsi_print_sense(cmd); if (level > 3) scmd_printk(KERN_INFO, cmd, "scsi host busy %d failed %d\n", scsi_host_busy(cmd->device->host), cmd->device->host->host_failed); } } } #endif /** * scsi_finish_command - cleanup and pass command back to upper layer * @cmd: the command * * Description: Pass command off to upper layer for finishing of I/O * request, waking processes that are waiting on results, * etc. */ void scsi_finish_command(struct scsi_cmnd *cmd) { struct scsi_device *sdev = cmd->device; struct scsi_target *starget = scsi_target(sdev); struct Scsi_Host *shost = sdev->host; struct scsi_driver *drv; unsigned int good_bytes; scsi_device_unbusy(sdev, cmd); /* * Clear the flags that say that the device/target/host is no longer * capable of accepting new commands. */ if (atomic_read(&shost->host_blocked)) atomic_set(&shost->host_blocked, 0); if (atomic_read(&starget->target_blocked)) atomic_set(&starget->target_blocked, 0); if (atomic_read(&sdev->device_blocked)) atomic_set(&sdev->device_blocked, 0); SCSI_LOG_MLCOMPLETE(4, sdev_printk(KERN_INFO, sdev, "Notifying upper driver of completion " "(result %x)\n", cmd->result)); good_bytes = scsi_bufflen(cmd); if (!blk_rq_is_passthrough(scsi_cmd_to_rq(cmd))) { int old_good_bytes = good_bytes; drv = scsi_cmd_to_driver(cmd); if (drv->done) good_bytes = drv->done(cmd); /* * USB may not give sense identifying bad sector and * simply return a residue instead, so subtract off the * residue if drv->done() error processing indicates no * change to the completion length. */ if (good_bytes == old_good_bytes) good_bytes -= scsi_get_resid(cmd); } scsi_io_completion(cmd, good_bytes); } /* * 4096 is big enough for saturating fast SCSI LUNs. */ int scsi_device_max_queue_depth(struct scsi_device *sdev) { return min_t(int, sdev->host->can_queue, 4096); } /** * scsi_change_queue_depth - change a device's queue depth * @sdev: SCSI Device in question * @depth: number of commands allowed to be queued to the driver * * Sets the device queue depth and returns the new value. */ int scsi_change_queue_depth(struct scsi_device *sdev, int depth) { depth = min_t(int, depth, scsi_device_max_queue_depth(sdev)); if (depth > 0) { sdev->queue_depth = depth; wmb(); } if (sdev->request_queue) blk_set_queue_depth(sdev->request_queue, depth); sbitmap_resize(&sdev->budget_map, sdev->queue_depth); return sdev->queue_depth; } EXPORT_SYMBOL(scsi_change_queue_depth); /** * scsi_track_queue_full - track QUEUE_FULL events to adjust queue depth * @sdev: SCSI Device in question * @depth: Current number of outstanding SCSI commands on this device, * not counting the one returned as QUEUE_FULL. * * Description: This function will track successive QUEUE_FULL events on a * specific SCSI device to determine if and when there is a * need to adjust the queue depth on the device. * * Returns: 0 - No change needed, >0 - Adjust queue depth to this new depth, * -1 - Drop back to untagged operation using host->cmd_per_lun * as the untagged command depth * * Lock Status: None held on entry * * Notes: Low level drivers may call this at any time and we will do * "The Right Thing." We are interrupt context safe. */ int scsi_track_queue_full(struct scsi_device *sdev, int depth) { /* * Don't let QUEUE_FULLs on the same * jiffies count, they could all be from * same event. */ if ((jiffies >> 4) == (sdev->last_queue_full_time >> 4)) return 0; sdev->last_queue_full_time = jiffies; if (sdev->last_queue_full_depth != depth) { sdev->last_queue_full_count = 1; sdev->last_queue_full_depth = depth; } else { sdev->last_queue_full_count++; } if (sdev->last_queue_full_count <= 10) return 0; return scsi_change_queue_depth(sdev, depth); } EXPORT_SYMBOL(scsi_track_queue_full); /** * scsi_vpd_inquiry - Request a device provide us with a VPD page * @sdev: The device to ask * @buffer: Where to put the result * @page: Which Vital Product Data to return * @len: The length of the buffer * * This is an internal helper function. You probably want to use * scsi_get_vpd_page instead. * * Returns size of the vpd page on success or a negative error number. */ static int scsi_vpd_inquiry(struct scsi_device *sdev, unsigned char *buffer, u8 page, unsigned len) { int result; unsigned char cmd[16]; if (len < 4) return -EINVAL; cmd[0] = INQUIRY; cmd[1] = 1; /* EVPD */ cmd[2] = page; cmd[3] = len >> 8; cmd[4] = len & 0xff; cmd[5] = 0; /* Control byte */ /* * I'm not convinced we need to try quite this hard to get VPD, but * all the existing users tried this hard. */ result = scsi_execute_cmd(sdev, cmd, REQ_OP_DRV_IN, buffer, len, 30 * HZ, 3, NULL); if (result) return -EIO; /* * Sanity check that we got the page back that we asked for and that * the page size is not 0. */ if (buffer[1] != page) return -EIO; result = get_unaligned_be16(&buffer[2]); if (!result) return -EIO; return result + 4; } static int scsi_get_vpd_size(struct scsi_device *sdev, u8 page) { unsigned char vpd_header[SCSI_VPD_HEADER_SIZE] __aligned(4); int result; if (sdev->no_vpd_size) return SCSI_DEFAULT_VPD_LEN; /* * Fetch the VPD page header to find out how big the page * is. This is done to prevent problems on legacy devices * which can not handle allocation lengths as large as * potentially requested by the caller. */ result = scsi_vpd_inquiry(sdev, vpd_header, page, sizeof(vpd_header)); if (result < 0) return 0; if (result < SCSI_VPD_HEADER_SIZE) { dev_warn_once(&sdev->sdev_gendev, "%s: short VPD page 0x%02x length: %d bytes\n", __func__, page, result); return 0; } return result; } /** * scsi_get_vpd_page - Get Vital Product Data from a SCSI device * @sdev: The device to ask * @page: Which Vital Product Data to return * @buf: where to store the VPD * @buf_len: number of bytes in the VPD buffer area * * SCSI devices may optionally supply Vital Product Data. Each 'page' * of VPD is defined in the appropriate SCSI document (eg SPC, SBC). * If the device supports this VPD page, this routine fills @buf * with the data from that page and return 0. If the VPD page is not * supported or its content cannot be retrieved, -EINVAL is returned. */ int scsi_get_vpd_page(struct scsi_device *sdev, u8 page, unsigned char *buf, int buf_len) { int result, vpd_len; if (!scsi_device_supports_vpd(sdev)) return -EINVAL; vpd_len = scsi_get_vpd_size(sdev, page); if (vpd_len <= 0) return -EINVAL; vpd_len = min(vpd_len, buf_len); /* * Fetch the actual page. Since the appropriate size was reported * by the device it is now safe to ask for something bigger. */ memset(buf, 0, buf_len); result = scsi_vpd_inquiry(sdev, buf, page, vpd_len); if (result < 0) return -EINVAL; else if (result > vpd_len) dev_warn_once(&sdev->sdev_gendev, "%s: VPD page 0x%02x result %d > %d bytes\n", __func__, page, result, vpd_len); return 0; } EXPORT_SYMBOL_GPL(scsi_get_vpd_page); /** * scsi_get_vpd_buf - Get Vital Product Data from a SCSI device * @sdev: The device to ask * @page: Which Vital Product Data to return * * Returns %NULL upon failure. */ static struct scsi_vpd *scsi_get_vpd_buf(struct scsi_device *sdev, u8 page) { struct scsi_vpd *vpd_buf; int vpd_len, result; vpd_len = scsi_get_vpd_size(sdev, page); if (vpd_len <= 0) return NULL; retry_pg: /* * Fetch the actual page. Since the appropriate size was reported * by the device it is now safe to ask for something bigger. */ vpd_buf = kmalloc(sizeof(*vpd_buf) + vpd_len, GFP_KERNEL); if (!vpd_buf) return NULL; result = scsi_vpd_inquiry(sdev, vpd_buf->data, page, vpd_len); if (result < 0) { kfree(vpd_buf); return NULL; } if (result > vpd_len) { dev_warn_once(&sdev->sdev_gendev, "%s: VPD page 0x%02x result %d > %d bytes\n", __func__, page, result, vpd_len); vpd_len = result; kfree(vpd_buf); goto retry_pg; } vpd_buf->len = result; return vpd_buf; } static void scsi_update_vpd_page(struct scsi_device *sdev, u8 page, struct scsi_vpd __rcu **sdev_vpd_buf) { struct scsi_vpd *vpd_buf; vpd_buf = scsi_get_vpd_buf(sdev, page); if (!vpd_buf) return; mutex_lock(&sdev->inquiry_mutex); vpd_buf = rcu_replace_pointer(*sdev_vpd_buf, vpd_buf, lockdep_is_held(&sdev->inquiry_mutex)); mutex_unlock(&sdev->inquiry_mutex); if (vpd_buf) kfree_rcu(vpd_buf, rcu); } /** * scsi_attach_vpd - Attach Vital Product Data to a SCSI device structure * @sdev: The device to ask * * Attach the 'Device Identification' VPD page (0x83) and the * 'Unit Serial Number' VPD page (0x80) to a SCSI device * structure. This information can be used to identify the device * uniquely. */ void scsi_attach_vpd(struct scsi_device *sdev) { int i; struct scsi_vpd *vpd_buf; if (!scsi_device_supports_vpd(sdev)) return; /* Ask for all the pages supported by this device */ vpd_buf = scsi_get_vpd_buf(sdev, 0); if (!vpd_buf) return; for (i = 4; i < vpd_buf->len; i++) { if (vpd_buf->data[i] == 0x0) scsi_update_vpd_page(sdev, 0x0, &sdev->vpd_pg0); if (vpd_buf->data[i] == 0x80) scsi_update_vpd_page(sdev, 0x80, &sdev->vpd_pg80); if (vpd_buf->data[i] == 0x83) scsi_update_vpd_page(sdev, 0x83, &sdev->vpd_pg83); if (vpd_buf->data[i] == 0x89) scsi_update_vpd_page(sdev, 0x89, &sdev->vpd_pg89); if (vpd_buf->data[i] == 0xb0) scsi_update_vpd_page(sdev, 0xb0, &sdev->vpd_pgb0); if (vpd_buf->data[i] == 0xb1) scsi_update_vpd_page(sdev, 0xb1, &sdev->vpd_pgb1); if (vpd_buf->data[i] == 0xb2) scsi_update_vpd_page(sdev, 0xb2, &sdev->vpd_pgb2); } kfree(vpd_buf); } /** * scsi_report_opcode - Find out if a given command is supported * @sdev: scsi device to query * @buffer: scratch buffer (must be at least 20 bytes long) * @len: length of buffer * @opcode: opcode for the command to look up * @sa: service action for the command to look up * * Uses the REPORT SUPPORTED OPERATION CODES to check support for the * command identified with @opcode and @sa. If the command does not * have a service action, @sa must be 0. Returns -EINVAL if RSOC fails, * 0 if the command is not supported and 1 if the device claims to * support the command. */ int scsi_report_opcode(struct scsi_device *sdev, unsigned char *buffer, unsigned int len, unsigned char opcode, unsigned short sa) { unsigned char cmd[16]; struct scsi_sense_hdr sshdr; int result, request_len; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, }; if (sdev->no_report_opcodes || sdev->scsi_level < SCSI_SPC_3) return -EINVAL; /* RSOC header + size of command we are asking about */ request_len = 4 + COMMAND_SIZE(opcode); if (request_len > len) { dev_warn_once(&sdev->sdev_gendev, "%s: len %u bytes, opcode 0x%02x needs %u\n", __func__, len, opcode, request_len); return -EINVAL; } memset(cmd, 0, 16); cmd[0] = MAINTENANCE_IN; cmd[1] = MI_REPORT_SUPPORTED_OPERATION_CODES; if (!sa) { cmd[2] = 1; /* One command format */ cmd[3] = opcode; } else { cmd[2] = 3; /* One command format with service action */ cmd[3] = opcode; put_unaligned_be16(sa, &cmd[4]); } put_unaligned_be32(request_len, &cmd[6]); memset(buffer, 0, len); result = scsi_execute_cmd(sdev, cmd, REQ_OP_DRV_IN, buffer, request_len, 30 * HZ, 3, &exec_args); if (result < 0) return result; if (result && scsi_sense_valid(&sshdr) && sshdr.sense_key == ILLEGAL_REQUEST && (sshdr.asc == 0x20 || sshdr.asc == 0x24) && sshdr.ascq == 0x00) return -EINVAL; if ((buffer[1] & 3) == 3) /* Command supported */ return 1; return 0; } EXPORT_SYMBOL(scsi_report_opcode); #define SCSI_CDL_CHECK_BUF_LEN 64 static bool scsi_cdl_check_cmd(struct scsi_device *sdev, u8 opcode, u16 sa, unsigned char *buf) { int ret; u8 cdlp; /* Check operation code */ ret = scsi_report_opcode(sdev, buf, SCSI_CDL_CHECK_BUF_LEN, opcode, sa); if (ret <= 0) return false; if ((buf[1] & 0x03) != 0x03) return false; /* * See SPC-6, One_command parameter data format for * REPORT SUPPORTED OPERATION CODES. We have the following cases * depending on rwcdlp (buf[0] & 0x01) value: * - rwcdlp == 0: then cdlp indicates support for the A mode page when * it is equal to 1 and for the B mode page when it is * equal to 2. * - rwcdlp == 1: then cdlp indicates support for the T2A mode page * when it is equal to 1 and for the T2B mode page when * it is equal to 2. * Overall, to detect support for command duration limits, we only need * to check that cdlp is 1 or 2. */ cdlp = (buf[1] & 0x18) >> 3; return cdlp == 0x01 || cdlp == 0x02; } /** * scsi_cdl_check - Check if a SCSI device supports Command Duration Limits * @sdev: The device to check */ void scsi_cdl_check(struct scsi_device *sdev) { bool cdl_supported; unsigned char *buf; /* * Support for CDL was defined in SPC-5. Ignore devices reporting an * lower SPC version. This also avoids problems with old drives choking * on MAINTENANCE_IN / MI_REPORT_SUPPORTED_OPERATION_CODES with a * service action specified, as done in scsi_cdl_check_cmd(). */ if (sdev->scsi_level < SCSI_SPC_5) { sdev->cdl_supported = 0; return; } buf = kmalloc(SCSI_CDL_CHECK_BUF_LEN, GFP_KERNEL); if (!buf) { sdev->cdl_supported = 0; return; } /* Check support for READ_16, WRITE_16, READ_32 and WRITE_32 commands */ cdl_supported = scsi_cdl_check_cmd(sdev, READ_16, 0, buf) || scsi_cdl_check_cmd(sdev, WRITE_16, 0, buf) || scsi_cdl_check_cmd(sdev, VARIABLE_LENGTH_CMD, READ_32, buf) || scsi_cdl_check_cmd(sdev, VARIABLE_LENGTH_CMD, WRITE_32, buf); if (cdl_supported) { /* * We have CDL support: force the use of READ16/WRITE16. * READ32 and WRITE32 will be used for devices that support * the T10_PI_TYPE2_PROTECTION protection type. */ sdev->use_16_for_rw = 1; sdev->use_10_for_rw = 0; sdev->cdl_supported = 1; } else { sdev->cdl_supported = 0; } kfree(buf); } /** * scsi_cdl_enable - Enable or disable a SCSI device supports for Command * Duration Limits * @sdev: The target device * @enable: the target state */ int scsi_cdl_enable(struct scsi_device *sdev, bool enable) { struct scsi_mode_data data; struct scsi_sense_hdr sshdr; struct scsi_vpd *vpd; bool is_ata = false; char buf[64]; int ret; if (!sdev->cdl_supported) return -EOPNOTSUPP; rcu_read_lock(); vpd = rcu_dereference(sdev->vpd_pg89); if (vpd) is_ata = true; rcu_read_unlock(); /* * For ATA devices, CDL needs to be enabled with a SET FEATURES command. */ if (is_ata) { char *buf_data; int len; ret = scsi_mode_sense(sdev, 0x08, 0x0a, 0xf2, buf, sizeof(buf), 5 * HZ, 3, &data, NULL); if (ret) return -EINVAL; /* Enable CDL using the ATA feature page */ len = min_t(size_t, sizeof(buf), data.length - data.header_length - data.block_descriptor_length); buf_data = buf + data.header_length + data.block_descriptor_length; if (enable) buf_data[4] = 0x02; else buf_data[4] = 0; ret = scsi_mode_select(sdev, 1, 0, buf_data, len, 5 * HZ, 3, &data, &sshdr); if (ret) { if (scsi_sense_valid(&sshdr)) scsi_print_sense_hdr(sdev, dev_name(&sdev->sdev_gendev), &sshdr); return ret; } } sdev->cdl_enable = enable; return 0; } /** * scsi_device_get - get an additional reference to a scsi_device * @sdev: device to get a reference to * * Description: Gets a reference to the scsi_device and increments the use count * of the underlying LLDD module. You must hold host_lock of the * parent Scsi_Host or already have a reference when calling this. * * This will fail if a device is deleted or cancelled, or when the LLD module * is in the process of being unloaded. */ int scsi_device_get(struct scsi_device *sdev) { if (sdev->sdev_state == SDEV_DEL || sdev->sdev_state == SDEV_CANCEL) goto fail; if (!try_module_get(sdev->host->hostt->module)) goto fail; if (!get_device(&sdev->sdev_gendev)) goto fail_put_module; return 0; fail_put_module: module_put(sdev->host->hostt->module); fail: return -ENXIO; } EXPORT_SYMBOL(scsi_device_get); /** * scsi_device_put - release a reference to a scsi_device * @sdev: device to release a reference on. * * Description: Release a reference to the scsi_device and decrements the use * count of the underlying LLDD module. The device is freed once the last * user vanishes. */ void scsi_device_put(struct scsi_device *sdev) { struct module *mod = sdev->host->hostt->module; put_device(&sdev->sdev_gendev); module_put(mod); } EXPORT_SYMBOL(scsi_device_put); /* helper for shost_for_each_device, see that for documentation */ struct scsi_device *__scsi_iterate_devices(struct Scsi_Host *shost, struct scsi_device *prev) { struct list_head *list = (prev ? &prev->siblings : &shost->__devices); struct scsi_device *next = NULL; unsigned long flags; spin_lock_irqsave(shost->host_lock, flags); while (list->next != &shost->__devices) { next = list_entry(list->next, struct scsi_device, siblings); /* skip devices that we can't get a reference to */ if (!scsi_device_get(next)) break; next = NULL; list = list->next; } spin_unlock_irqrestore(shost->host_lock, flags); if (prev) scsi_device_put(prev); return next; } EXPORT_SYMBOL(__scsi_iterate_devices); /** * starget_for_each_device - helper to walk all devices of a target * @starget: target whose devices we want to iterate over. * @data: Opaque passed to each function call. * @fn: Function to call on each device * * This traverses over each device of @starget. The devices have * a reference that must be released by scsi_host_put when breaking * out of the loop. */ void starget_for_each_device(struct scsi_target *starget, void *data, void (*fn)(struct scsi_device *, void *)) { struct Scsi_Host *shost = dev_to_shost(starget->dev.parent); struct scsi_device *sdev; shost_for_each_device(sdev, shost) { if ((sdev->channel == starget->channel) && (sdev->id == starget->id)) fn(sdev, data); } } EXPORT_SYMBOL(starget_for_each_device); /** * __starget_for_each_device - helper to walk all devices of a target (UNLOCKED) * @starget: target whose devices we want to iterate over. * @data: parameter for callback @fn() * @fn: callback function that is invoked for each device * * This traverses over each device of @starget. It does _not_ * take a reference on the scsi_device, so the whole loop must be * protected by shost->host_lock. * * Note: The only reason why drivers would want to use this is because * they need to access the device list in irq context. Otherwise you * really want to use starget_for_each_device instead. **/ void __starget_for_each_device(struct scsi_target *starget, void *data, void (*fn)(struct scsi_device *, void *)) { struct Scsi_Host *shost = dev_to_shost(starget->dev.parent); struct scsi_device *sdev; __shost_for_each_device(sdev, shost) { if ((sdev->channel == starget->channel) && (sdev->id == starget->id)) fn(sdev, data); } } EXPORT_SYMBOL(__starget_for_each_device); /** * __scsi_device_lookup_by_target - find a device given the target (UNLOCKED) * @starget: SCSI target pointer * @lun: SCSI Logical Unit Number * * Description: Looks up the scsi_device with the specified @lun for a given * @starget. The returned scsi_device does not have an additional * reference. You must hold the host's host_lock over this call and * any access to the returned scsi_device. A scsi_device in state * SDEV_DEL is skipped. * * Note: The only reason why drivers should use this is because * they need to access the device list in irq context. Otherwise you * really want to use scsi_device_lookup_by_target instead. **/ struct scsi_device *__scsi_device_lookup_by_target(struct scsi_target *starget, u64 lun) { struct scsi_device *sdev; list_for_each_entry(sdev, &starget->devices, same_target_siblings) { if (sdev->sdev_state == SDEV_DEL) continue; if (sdev->lun ==lun) return sdev; } return NULL; } EXPORT_SYMBOL(__scsi_device_lookup_by_target); /** * scsi_device_lookup_by_target - find a device given the target * @starget: SCSI target pointer * @lun: SCSI Logical Unit Number * * Description: Looks up the scsi_device with the specified @lun for a given * @starget. The returned scsi_device has an additional reference that * needs to be released with scsi_device_put once you're done with it. **/ struct scsi_device *scsi_device_lookup_by_target(struct scsi_target *starget, u64 lun) { struct scsi_device *sdev; struct Scsi_Host *shost = dev_to_shost(starget->dev.parent); unsigned long flags; spin_lock_irqsave(shost->host_lock, flags); sdev = __scsi_device_lookup_by_target(starget, lun); if (sdev && scsi_device_get(sdev)) sdev = NULL; spin_unlock_irqrestore(shost->host_lock, flags); return sdev; } EXPORT_SYMBOL(scsi_device_lookup_by_target); /** * __scsi_device_lookup - find a device given the host (UNLOCKED) * @shost: SCSI host pointer * @channel: SCSI channel (zero if only one channel) * @id: SCSI target number (physical unit number) * @lun: SCSI Logical Unit Number * * Description: Looks up the scsi_device with the specified @channel, @id, @lun * for a given host. The returned scsi_device does not have an additional * reference. You must hold the host's host_lock over this call and any access * to the returned scsi_device. * * Note: The only reason why drivers would want to use this is because * they need to access the device list in irq context. Otherwise you * really want to use scsi_device_lookup instead. **/ struct scsi_device *__scsi_device_lookup(struct Scsi_Host *shost, uint channel, uint id, u64 lun) { struct scsi_device *sdev; list_for_each_entry(sdev, &shost->__devices, siblings) { if (sdev->sdev_state == SDEV_DEL) continue; if (sdev->channel == channel && sdev->id == id && sdev->lun ==lun) return sdev; } return NULL; } EXPORT_SYMBOL(__scsi_device_lookup); /** * scsi_device_lookup - find a device given the host * @shost: SCSI host pointer * @channel: SCSI channel (zero if only one channel) * @id: SCSI target number (physical unit number) * @lun: SCSI Logical Unit Number * * Description: Looks up the scsi_device with the specified @channel, @id, @lun * for a given host. The returned scsi_device has an additional reference that * needs to be released with scsi_device_put once you're done with it. **/ struct scsi_device *scsi_device_lookup(struct Scsi_Host *shost, uint channel, uint id, u64 lun) { struct scsi_device *sdev; unsigned long flags; spin_lock_irqsave(shost->host_lock, flags); sdev = __scsi_device_lookup(shost, channel, id, lun); if (sdev && scsi_device_get(sdev)) sdev = NULL; spin_unlock_irqrestore(shost->host_lock, flags); return sdev; } EXPORT_SYMBOL(scsi_device_lookup); MODULE_DESCRIPTION("SCSI core"); MODULE_LICENSE("GPL"); module_param(scsi_logging_level, int, S_IRUGO|S_IWUSR); MODULE_PARM_DESC(scsi_logging_level, "a bit mask of logging levels"); static int __init init_scsi(void) { int error; error = scsi_init_procfs(); if (error) goto cleanup_queue; error = scsi_init_devinfo(); if (error) goto cleanup_procfs; error = scsi_init_hosts(); if (error) goto cleanup_devlist; error = scsi_init_sysctl(); if (error) goto cleanup_hosts; error = scsi_sysfs_register(); if (error) goto cleanup_sysctl; scsi_netlink_init(); printk(KERN_NOTICE "SCSI subsystem initialized\n"); return 0; cleanup_sysctl: scsi_exit_sysctl(); cleanup_hosts: scsi_exit_hosts(); cleanup_devlist: scsi_exit_devinfo(); cleanup_procfs: scsi_exit_procfs(); cleanup_queue: scsi_exit_queue(); printk(KERN_ERR "SCSI subsystem failed to initialize, error = %d\n", -error); return error; } static void __exit exit_scsi(void) { scsi_netlink_exit(); scsi_sysfs_unregister(); scsi_exit_sysctl(); scsi_exit_hosts(); scsi_exit_devinfo(); scsi_exit_procfs(); scsi_exit_queue(); } subsys_initcall(init_scsi); module_exit(exit_scsi);
105 136 103 35 136 136 104 35 93 93 91 15 64 53 123 119 18 18 1 18 32 33 4 32 32 33 93 93 12 12 11 11 12 119 100 19 19 71 101 123 32 32 32 32 122 123 93 85 19 64 64 43 11 11 6 93 93 131 131 118 131 75 75 14 14 18 79 3 2 126 126 113 72 14 18 15 14 1 15 14 317 126 72 316 46 1 46 30 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 // SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2007 Alan Stern * Copyright (C) IBM Corporation, 2009 * Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com> * * Thanks to Ingo Molnar for his many suggestions. * * Authors: Alan Stern <stern@rowland.harvard.edu> * K.Prasad <prasad@linux.vnet.ibm.com> * Frederic Weisbecker <fweisbec@gmail.com> */ /* * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility, * using the CPU's debug registers. * This file contains the arch-independent routines. */ #include <linux/hw_breakpoint.h> #include <linux/atomic.h> #include <linux/bug.h> #include <linux/cpu.h> #include <linux/export.h> #include <linux/init.h> #include <linux/irqflags.h> #include <linux/kdebug.h> #include <linux/kernel.h> #include <linux/mutex.h> #include <linux/notifier.h> #include <linux/percpu-rwsem.h> #include <linux/percpu.h> #include <linux/rhashtable.h> #include <linux/sched.h> #include <linux/slab.h> /* * Datastructure to track the total uses of N slots across tasks or CPUs; * bp_slots_histogram::count[N] is the number of assigned N+1 breakpoint slots. */ struct bp_slots_histogram { #ifdef hw_breakpoint_slots atomic_t count[hw_breakpoint_slots(0)]; #else atomic_t *count; #endif }; /* * Per-CPU constraints data. */ struct bp_cpuinfo { /* Number of pinned CPU breakpoints in a CPU. */ unsigned int cpu_pinned; /* Histogram of pinned task breakpoints in a CPU. */ struct bp_slots_histogram tsk_pinned; }; static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]); static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type) { return per_cpu_ptr(bp_cpuinfo + type, cpu); } /* Number of pinned CPU breakpoints globally. */ static struct bp_slots_histogram cpu_pinned[TYPE_MAX]; /* Number of pinned CPU-independent task breakpoints. */ static struct bp_slots_histogram tsk_pinned_all[TYPE_MAX]; /* Keep track of the breakpoints attached to tasks */ static struct rhltable task_bps_ht; static const struct rhashtable_params task_bps_ht_params = { .head_offset = offsetof(struct hw_perf_event, bp_list), .key_offset = offsetof(struct hw_perf_event, target), .key_len = sizeof_field(struct hw_perf_event, target), .automatic_shrinking = true, }; static bool constraints_initialized __ro_after_init; /* * Synchronizes accesses to the per-CPU constraints; the locking rules are: * * 1. Atomic updates to bp_cpuinfo::tsk_pinned only require a held read-lock * (due to bp_slots_histogram::count being atomic, no update are lost). * * 2. Holding a write-lock is required for computations that require a * stable snapshot of all bp_cpuinfo::tsk_pinned. * * 3. In all other cases, non-atomic accesses require the appropriately held * lock (read-lock for read-only accesses; write-lock for reads/writes). */ DEFINE_STATIC_PERCPU_RWSEM(bp_cpuinfo_sem); /* * Return mutex to serialize accesses to per-task lists in task_bps_ht. Since * rhltable synchronizes concurrent insertions/deletions, independent tasks may * insert/delete concurrently; therefore, a mutex per task is sufficient. * * Uses task_struct::perf_event_mutex, to avoid extending task_struct with a * hw_breakpoint-only mutex, which may be infrequently used. The caveat here is * that hw_breakpoint may contend with per-task perf event list management. The * assumption is that perf usecases involving hw_breakpoints are very unlikely * to result in unnecessary contention. */ static inline struct mutex *get_task_bps_mutex(struct perf_event *bp) { struct task_struct *tsk = bp->hw.target; return tsk ? &tsk->perf_event_mutex : NULL; } static struct mutex *bp_constraints_lock(struct perf_event *bp) { struct mutex *tsk_mtx = get_task_bps_mutex(bp); if (tsk_mtx) { /* * Fully analogous to the perf_try_init_event() nesting * argument in the comment near perf_event_ctx_lock_nested(); * this child->perf_event_mutex cannot ever deadlock against * the parent->perf_event_mutex usage from * perf_event_task_{en,dis}able(). * * Specifically, inherited events will never occur on * ->perf_event_list. */ mutex_lock_nested(tsk_mtx, SINGLE_DEPTH_NESTING); percpu_down_read(&bp_cpuinfo_sem); } else { percpu_down_write(&bp_cpuinfo_sem); } return tsk_mtx; } static void bp_constraints_unlock(struct mutex *tsk_mtx) { if (tsk_mtx) { percpu_up_read(&bp_cpuinfo_sem); mutex_unlock(tsk_mtx); } else { percpu_up_write(&bp_cpuinfo_sem); } } static bool bp_constraints_is_locked(struct perf_event *bp) { struct mutex *tsk_mtx = get_task_bps_mutex(bp); return percpu_is_write_locked(&bp_cpuinfo_sem) || (tsk_mtx ? mutex_is_locked(tsk_mtx) : percpu_is_read_locked(&bp_cpuinfo_sem)); } static inline void assert_bp_constraints_lock_held(struct perf_event *bp) { struct mutex *tsk_mtx = get_task_bps_mutex(bp); if (tsk_mtx) lockdep_assert_held(tsk_mtx); lockdep_assert_held(&bp_cpuinfo_sem); } #ifdef hw_breakpoint_slots /* * Number of breakpoint slots is constant, and the same for all types. */ static_assert(hw_breakpoint_slots(TYPE_INST) == hw_breakpoint_slots(TYPE_DATA)); static inline int hw_breakpoint_slots_cached(int type) { return hw_breakpoint_slots(type); } static inline int init_breakpoint_slots(void) { return 0; } #else /* * Dynamic number of breakpoint slots. */ static int __nr_bp_slots[TYPE_MAX] __ro_after_init; static inline int hw_breakpoint_slots_cached(int type) { return __nr_bp_slots[type]; } static __init bool bp_slots_histogram_alloc(struct bp_slots_histogram *hist, enum bp_type_idx type) { hist->count = kcalloc(hw_breakpoint_slots_cached(type), sizeof(*hist->count), GFP_KERNEL); return hist->count; } static __init void bp_slots_histogram_free(struct bp_slots_histogram *hist) { kfree(hist->count); } static __init int init_breakpoint_slots(void) { int i, cpu, err_cpu; for (i = 0; i < TYPE_MAX; i++) __nr_bp_slots[i] = hw_breakpoint_slots(i); for_each_possible_cpu(cpu) { for (i = 0; i < TYPE_MAX; i++) { struct bp_cpuinfo *info = get_bp_info(cpu, i); if (!bp_slots_histogram_alloc(&info->tsk_pinned, i)) goto err; } } for (i = 0; i < TYPE_MAX; i++) { if (!bp_slots_histogram_alloc(&cpu_pinned[i], i)) goto err; if (!bp_slots_histogram_alloc(&tsk_pinned_all[i], i)) goto err; } return 0; err: for_each_possible_cpu(err_cpu) { for (i = 0; i < TYPE_MAX; i++) bp_slots_histogram_free(&get_bp_info(err_cpu, i)->tsk_pinned); if (err_cpu == cpu) break; } for (i = 0; i < TYPE_MAX; i++) { bp_slots_histogram_free(&cpu_pinned[i]); bp_slots_histogram_free(&tsk_pinned_all[i]); } return -ENOMEM; } #endif static inline void bp_slots_histogram_add(struct bp_slots_histogram *hist, int old, int val) { const int old_idx = old - 1; const int new_idx = old_idx + val; if (old_idx >= 0) WARN_ON(atomic_dec_return_relaxed(&hist->count[old_idx]) < 0); if (new_idx >= 0) WARN_ON(atomic_inc_return_relaxed(&hist->count[new_idx]) < 0); } static int bp_slots_histogram_max(struct bp_slots_histogram *hist, enum bp_type_idx type) { for (int i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) { const int count = atomic_read(&hist->count[i]); /* Catch unexpected writers; we want a stable snapshot. */ ASSERT_EXCLUSIVE_WRITER(hist->count[i]); if (count > 0) return i + 1; WARN(count < 0, "inconsistent breakpoint slots histogram"); } return 0; } static int bp_slots_histogram_max_merge(struct bp_slots_histogram *hist1, struct bp_slots_histogram *hist2, enum bp_type_idx type) { for (int i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) { const int count1 = atomic_read(&hist1->count[i]); const int count2 = atomic_read(&hist2->count[i]); /* Catch unexpected writers; we want a stable snapshot. */ ASSERT_EXCLUSIVE_WRITER(hist1->count[i]); ASSERT_EXCLUSIVE_WRITER(hist2->count[i]); if (count1 + count2 > 0) return i + 1; WARN(count1 < 0, "inconsistent breakpoint slots histogram"); WARN(count2 < 0, "inconsistent breakpoint slots histogram"); } return 0; } #ifndef hw_breakpoint_weight static inline int hw_breakpoint_weight(struct perf_event *bp) { return 1; } #endif static inline enum bp_type_idx find_slot_idx(u64 bp_type) { if (bp_type & HW_BREAKPOINT_RW) return TYPE_DATA; return TYPE_INST; } /* * Return the maximum number of pinned breakpoints a task has in this CPU. */ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) { struct bp_slots_histogram *tsk_pinned = &get_bp_info(cpu, type)->tsk_pinned; /* * At this point we want to have acquired the bp_cpuinfo_sem as a * writer to ensure that there are no concurrent writers in * toggle_bp_task_slot() to tsk_pinned, and we get a stable snapshot. */ lockdep_assert_held_write(&bp_cpuinfo_sem); return bp_slots_histogram_max_merge(tsk_pinned, &tsk_pinned_all[type], type); } /* * Count the number of breakpoints of the same type and same task. * The given event must be not on the list. * * If @cpu is -1, but the result of task_bp_pinned() is not CPU-independent, * returns a negative value. */ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) { struct rhlist_head *head, *pos; struct perf_event *iter; int count = 0; /* * We need a stable snapshot of the per-task breakpoint list. */ assert_bp_constraints_lock_held(bp); rcu_read_lock(); head = rhltable_lookup(&task_bps_ht, &bp->hw.target, task_bps_ht_params); if (!head) goto out; rhl_for_each_entry_rcu(iter, pos, head, hw.bp_list) { if (find_slot_idx(iter->attr.bp_type) != type) continue; if (iter->cpu >= 0) { if (cpu == -1) { count = -1; goto out; } else if (cpu != iter->cpu) continue; } count += hw_breakpoint_weight(iter); } out: rcu_read_unlock(); return count; } static const struct cpumask *cpumask_of_bp(struct perf_event *bp) { if (bp->cpu >= 0) return cpumask_of(bp->cpu); return cpu_possible_mask; } /* * Returns the max pinned breakpoint slots in a given * CPU (cpu > -1) or across all of them (cpu = -1). */ static int max_bp_pinned_slots(struct perf_event *bp, enum bp_type_idx type) { const struct cpumask *cpumask = cpumask_of_bp(bp); int pinned_slots = 0; int cpu; if (bp->hw.target && bp->cpu < 0) { int max_pinned = task_bp_pinned(-1, bp, type); if (max_pinned >= 0) { /* * Fast path: task_bp_pinned() is CPU-independent and * returns the same value for any CPU. */ max_pinned += bp_slots_histogram_max(&cpu_pinned[type], type); return max_pinned; } } for_each_cpu(cpu, cpumask) { struct bp_cpuinfo *info = get_bp_info(cpu, type); int nr; nr = info->cpu_pinned; if (!bp->hw.target) nr += max_task_bp_pinned(cpu, type); else nr += task_bp_pinned(cpu, bp, type); pinned_slots = max(nr, pinned_slots); } return pinned_slots; } /* * Add/remove the given breakpoint in our constraint table */ static int toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, int weight) { int cpu, next_tsk_pinned; if (!enable) weight = -weight; if (!bp->hw.target) { /* * Update the pinned CPU slots, in per-CPU bp_cpuinfo and in the * global histogram. */ struct bp_cpuinfo *info = get_bp_info(bp->cpu, type); lockdep_assert_held_write(&bp_cpuinfo_sem); bp_slots_histogram_add(&cpu_pinned[type], info->cpu_pinned, weight); info->cpu_pinned += weight; return 0; } /* * If bp->hw.target, tsk_pinned is only modified, but not used * otherwise. We can permit concurrent updates as long as there are no * other uses: having acquired bp_cpuinfo_sem as a reader allows * concurrent updates here. Uses of tsk_pinned will require acquiring * bp_cpuinfo_sem as a writer to stabilize tsk_pinned's value. */ lockdep_assert_held_read(&bp_cpuinfo_sem); /* * Update the pinned task slots, in per-CPU bp_cpuinfo and in the global * histogram. We need to take care of 4 cases: * * 1. This breakpoint targets all CPUs (cpu < 0), and there may only * exist other task breakpoints targeting all CPUs. In this case we * can simply update the global slots histogram. * * 2. This breakpoint targets a specific CPU (cpu >= 0), but there may * only exist other task breakpoints targeting all CPUs. * * a. On enable: remove the existing breakpoints from the global * slots histogram and use the per-CPU histogram. * * b. On disable: re-insert the existing breakpoints into the global * slots histogram and remove from per-CPU histogram. * * 3. Some other existing task breakpoints target specific CPUs. Only * update the per-CPU slots histogram. */ if (!enable) { /* * Remove before updating histograms so we can determine if this * was the last task breakpoint for a specific CPU. */ int ret = rhltable_remove(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params); if (ret) return ret; } /* * Note: If !enable, next_tsk_pinned will not count the to-be-removed breakpoint. */ next_tsk_pinned = task_bp_pinned(-1, bp, type); if (next_tsk_pinned >= 0) { if (bp->cpu < 0) { /* Case 1: fast path */ if (!enable) next_tsk_pinned += hw_breakpoint_weight(bp); bp_slots_histogram_add(&tsk_pinned_all[type], next_tsk_pinned, weight); } else if (enable) { /* Case 2.a: slow path */ /* Add existing to per-CPU histograms. */ for_each_possible_cpu(cpu) { bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned, 0, next_tsk_pinned); } /* Add this first CPU-pinned task breakpoint. */ bp_slots_histogram_add(&get_bp_info(bp->cpu, type)->tsk_pinned, next_tsk_pinned, weight); /* Rebalance global task pinned histogram. */ bp_slots_histogram_add(&tsk_pinned_all[type], next_tsk_pinned, -next_tsk_pinned); } else { /* Case 2.b: slow path */ /* Remove this last CPU-pinned task breakpoint. */ bp_slots_histogram_add(&get_bp_info(bp->cpu, type)->tsk_pinned, next_tsk_pinned + hw_breakpoint_weight(bp), weight); /* Remove all from per-CPU histograms. */ for_each_possible_cpu(cpu) { bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned, next_tsk_pinned, -next_tsk_pinned); } /* Rebalance global task pinned histogram. */ bp_slots_histogram_add(&tsk_pinned_all[type], 0, next_tsk_pinned); } } else { /* Case 3: slow path */ const struct cpumask *cpumask = cpumask_of_bp(bp); for_each_cpu(cpu, cpumask) { next_tsk_pinned = task_bp_pinned(cpu, bp, type); if (!enable) next_tsk_pinned += hw_breakpoint_weight(bp); bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned, next_tsk_pinned, weight); } } /* * Readers want a stable snapshot of the per-task breakpoint list. */ assert_bp_constraints_lock_held(bp); if (enable) return rhltable_insert(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params); return 0; } /* * Constraints to check before allowing this new breakpoint counter. * * Note: Flexible breakpoints are currently unimplemented, but outlined in the * below algorithm for completeness. The implementation treats flexible as * pinned due to no guarantee that we currently always schedule flexible events * before a pinned event in a same CPU. * * == Non-pinned counter == (Considered as pinned for now) * * - If attached to a single cpu, check: * * (per_cpu(info->flexible, cpu) || (per_cpu(info->cpu_pinned, cpu) * + max(per_cpu(info->tsk_pinned, cpu)))) < HBP_NUM * * -> If there are already non-pinned counters in this cpu, it means * there is already a free slot for them. * Otherwise, we check that the maximum number of per task * breakpoints (for this cpu) plus the number of per cpu breakpoint * (for this cpu) doesn't cover every registers. * * - If attached to every cpus, check: * * (per_cpu(info->flexible, *) || (max(per_cpu(info->cpu_pinned, *)) * + max(per_cpu(info->tsk_pinned, *)))) < HBP_NUM * * -> This is roughly the same, except we check the number of per cpu * bp for every cpu and we keep the max one. Same for the per tasks * breakpoints. * * * == Pinned counter == * * - If attached to a single cpu, check: * * ((per_cpu(info->flexible, cpu) > 1) + per_cpu(info->cpu_pinned, cpu) * + max(per_cpu(info->tsk_pinned, cpu))) < HBP_NUM * * -> Same checks as before. But now the info->flexible, if any, must keep * one register at least (or they will never be fed). * * - If attached to every cpus, check: * * ((per_cpu(info->flexible, *) > 1) + max(per_cpu(info->cpu_pinned, *)) * + max(per_cpu(info->tsk_pinned, *))) < HBP_NUM */ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) { enum bp_type_idx type; int max_pinned_slots; int weight; /* We couldn't initialize breakpoint constraints on boot */ if (!constraints_initialized) return -ENOMEM; /* Basic checks */ if (bp_type == HW_BREAKPOINT_EMPTY || bp_type == HW_BREAKPOINT_INVALID) return -EINVAL; type = find_slot_idx(bp_type); weight = hw_breakpoint_weight(bp); /* Check if this new breakpoint can be satisfied across all CPUs. */ max_pinned_slots = max_bp_pinned_slots(bp, type) + weight; if (max_pinned_slots > hw_breakpoint_slots_cached(type)) return -ENOSPC; return toggle_bp_slot(bp, true, type, weight); } int reserve_bp_slot(struct perf_event *bp) { struct mutex *mtx = bp_constraints_lock(bp); int ret = __reserve_bp_slot(bp, bp->attr.bp_type); bp_constraints_unlock(mtx); return ret; } static void __release_bp_slot(struct perf_event *bp, u64 bp_type) { enum bp_type_idx type; int weight; type = find_slot_idx(bp_type); weight = hw_breakpoint_weight(bp); WARN_ON(toggle_bp_slot(bp, false, type, weight)); } void release_bp_slot(struct perf_event *bp) { struct mutex *mtx = bp_constraints_lock(bp); __release_bp_slot(bp, bp->attr.bp_type); bp_constraints_unlock(mtx); } static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) { int err; __release_bp_slot(bp, old_type); err = __reserve_bp_slot(bp, new_type); if (err) { /* * Reserve the old_type slot back in case * there's no space for the new type. * * This must succeed, because we just released * the old_type slot in the __release_bp_slot * call above. If not, something is broken. */ WARN_ON(__reserve_bp_slot(bp, old_type)); } return err; } static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) { struct mutex *mtx = bp_constraints_lock(bp); int ret = __modify_bp_slot(bp, old_type, new_type); bp_constraints_unlock(mtx); return ret; } /* * Allow the kernel debugger to reserve breakpoint slots without * taking a lock using the dbg_* variant of for the reserve and * release breakpoint slots. */ int dbg_reserve_bp_slot(struct perf_event *bp) { int ret; if (bp_constraints_is_locked(bp)) return -1; /* Locks aren't held; disable lockdep assert checking. */ lockdep_off(); ret = __reserve_bp_slot(bp, bp->attr.bp_type); lockdep_on(); return ret; } int dbg_release_bp_slot(struct perf_event *bp) { if (bp_constraints_is_locked(bp)) return -1; /* Locks aren't held; disable lockdep assert checking. */ lockdep_off(); __release_bp_slot(bp, bp->attr.bp_type); lockdep_on(); return 0; } static int hw_breakpoint_parse(struct perf_event *bp, const struct perf_event_attr *attr, struct arch_hw_breakpoint *hw) { int err; err = hw_breakpoint_arch_parse(bp, attr, hw); if (err) return err; if (arch_check_bp_in_kernelspace(hw)) { if (attr->exclude_kernel) return -EINVAL; /* * Don't let unprivileged users set a breakpoint in the trap * path to avoid trap recursion attacks. */ if (!capable(CAP_SYS_ADMIN)) return -EPERM; } return 0; } int register_perf_hw_breakpoint(struct perf_event *bp) { struct arch_hw_breakpoint hw = { }; int err; err = reserve_bp_slot(bp); if (err) return err; err = hw_breakpoint_parse(bp, &bp->attr, &hw); if (err) { release_bp_slot(bp); return err; } bp->hw.info = hw; return 0; } /** * register_user_hw_breakpoint - register a hardware breakpoint for user space * @attr: breakpoint attributes * @triggered: callback to trigger when we hit the breakpoint * @context: context data could be used in the triggered callback * @tsk: pointer to 'task_struct' of the process to which the address belongs */ struct perf_event * register_user_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered, void *context, struct task_struct *tsk) { return perf_event_create_kernel_counter(attr, -1, tsk, triggered, context); } EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); static void hw_breakpoint_copy_attr(struct perf_event_attr *to, struct perf_event_attr *from) { to->bp_addr = from->bp_addr; to->bp_type = from->bp_type; to->bp_len = from->bp_len; to->disabled = from->disabled; } int modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr, bool check) { struct arch_hw_breakpoint hw = { }; int err; err = hw_breakpoint_parse(bp, attr, &hw); if (err) return err; if (check) { struct perf_event_attr old_attr; old_attr = bp->attr; hw_breakpoint_copy_attr(&old_attr, attr); if (memcmp(&old_attr, attr, sizeof(*attr))) return -EINVAL; } if (bp->attr.bp_type != attr->bp_type) { err = modify_bp_slot(bp, bp->attr.bp_type, attr->bp_type); if (err) return err; } hw_breakpoint_copy_attr(&bp->attr, attr); bp->hw.info = hw; return 0; } /** * modify_user_hw_breakpoint - modify a user-space hardware breakpoint * @bp: the breakpoint structure to modify * @attr: new breakpoint attributes */ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { int err; /* * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it * will not be possible to raise IPIs that invoke __perf_event_disable. * So call the function directly after making sure we are targeting the * current task. */ if (irqs_disabled() && bp->ctx && bp->ctx->task == current) perf_event_disable_local(bp); else perf_event_disable(bp); err = modify_user_hw_breakpoint_check(bp, attr, false); if (!bp->attr.disabled) perf_event_enable(bp); return err; } EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); /** * unregister_hw_breakpoint - unregister a user-space hardware breakpoint * @bp: the breakpoint structure to unregister */ void unregister_hw_breakpoint(struct perf_event *bp) { if (!bp) return; perf_event_release_kernel(bp); } EXPORT_SYMBOL_GPL(unregister_hw_breakpoint); /** * register_wide_hw_breakpoint - register a wide breakpoint in the kernel * @attr: breakpoint attributes * @triggered: callback to trigger when we hit the breakpoint * @context: context data could be used in the triggered callback * * @return a set of per_cpu pointers to perf events */ struct perf_event * __percpu * register_wide_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered, void *context) { struct perf_event * __percpu *cpu_events, *bp; long err = 0; int cpu; cpu_events = alloc_percpu(typeof(*cpu_events)); if (!cpu_events) return (void __percpu __force *)ERR_PTR(-ENOMEM); cpus_read_lock(); for_each_online_cpu(cpu) { bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered, context); if (IS_ERR(bp)) { err = PTR_ERR(bp); break; } per_cpu(*cpu_events, cpu) = bp; } cpus_read_unlock(); if (likely(!err)) return cpu_events; unregister_wide_hw_breakpoint(cpu_events); return (void __percpu __force *)ERR_PTR(err); } EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); /** * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel * @cpu_events: the per cpu set of events to unregister */ void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) { int cpu; for_each_possible_cpu(cpu) unregister_hw_breakpoint(per_cpu(*cpu_events, cpu)); free_percpu(cpu_events); } EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint); /** * hw_breakpoint_is_used - check if breakpoints are currently used * * Returns: true if breakpoints are used, false otherwise. */ bool hw_breakpoint_is_used(void) { int cpu; if (!constraints_initialized) return false; for_each_possible_cpu(cpu) { for (int type = 0; type < TYPE_MAX; ++type) { struct bp_cpuinfo *info = get_bp_info(cpu, type); if (info->cpu_pinned) return true; for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) { if (atomic_read(&info->tsk_pinned.count[slot])) return true; } } } for (int type = 0; type < TYPE_MAX; ++type) { for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) { /* * Warn, because if there are CPU pinned counters, * should never get here; bp_cpuinfo::cpu_pinned should * be consistent with the global cpu_pinned histogram. */ if (WARN_ON(atomic_read(&cpu_pinned[type].count[slot]))) return true; if (atomic_read(&tsk_pinned_all[type].count[slot])) return true; } } return false; } static struct notifier_block hw_breakpoint_exceptions_nb = { .notifier_call = hw_breakpoint_exceptions_notify, /* we need to be notified first */ .priority = 0x7fffffff }; static void bp_perf_event_destroy(struct perf_event *event) { release_bp_slot(event); } static int hw_breakpoint_event_init(struct perf_event *bp) { int err; if (bp->attr.type != PERF_TYPE_BREAKPOINT) return -ENOENT; /* * no branch sampling for breakpoint events */ if (has_branch_stack(bp)) return -EOPNOTSUPP; err = register_perf_hw_breakpoint(bp); if (err) return err; bp->destroy = bp_perf_event_destroy; return 0; } static int hw_breakpoint_add(struct perf_event *bp, int flags) { if (!(flags & PERF_EF_START)) bp->hw.state = PERF_HES_STOPPED; if (is_sampling_event(bp)) { bp->hw.last_period = bp->hw.sample_period; perf_swevent_set_period(bp); } return arch_install_hw_breakpoint(bp); } static void hw_breakpoint_del(struct perf_event *bp, int flags) { arch_uninstall_hw_breakpoint(bp); } static void hw_breakpoint_start(struct perf_event *bp, int flags) { bp->hw.state = 0; } static void hw_breakpoint_stop(struct perf_event *bp, int flags) { bp->hw.state = PERF_HES_STOPPED; } static struct pmu perf_breakpoint = { .task_ctx_nr = perf_sw_context, /* could eventually get its own */ .event_init = hw_breakpoint_event_init, .add = hw_breakpoint_add, .del = hw_breakpoint_del, .start = hw_breakpoint_start, .stop = hw_breakpoint_stop, .read = hw_breakpoint_pmu_read, }; int __init init_hw_breakpoint(void) { int ret; ret = rhltable_init(&task_bps_ht, &task_bps_ht_params); if (ret) return ret; ret = init_breakpoint_slots(); if (ret) return ret; constraints_initialized = true; perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT); return register_die_notifier(&hw_breakpoint_exceptions_nb); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef _NET_CORE_DEV_H #define _NET_CORE_DEV_H #include <linux/types.h> struct net; struct net_device; struct netdev_bpf; struct netdev_phys_item_id; struct netlink_ext_ack; struct cpumask; /* Random bits of netdevice that don't need to be exposed */ #define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */ struct sd_flow_limit { u64 count; unsigned int num_buckets; unsigned int history_head; u16 history[FLOW_LIMIT_HISTORY]; u8 buckets[]; }; extern int netdev_flow_limit_table_len; #ifdef CONFIG_PROC_FS int __init dev_proc_init(void); #else #define dev_proc_init() 0 #endif void linkwatch_init_dev(struct net_device *dev); void linkwatch_forget_dev(struct net_device *dev); void linkwatch_run_queue(void); void dev_addr_flush(struct net_device *dev); int dev_addr_init(struct net_device *dev); void dev_addr_check(struct net_device *dev); /* sysctls not referred to from outside net/core/ */ extern int netdev_budget; extern unsigned int netdev_budget_usecs; extern unsigned int sysctl_skb_defer_max; extern int netdev_tstamp_prequeue; extern int netdev_unregister_timeout_secs; extern int weight_p; extern int dev_weight_rx_bias; extern int dev_weight_tx_bias; /* rtnl helpers */ extern struct list_head net_todo_list; void netdev_run_todo(void); /* netdev management, shared between various uAPI entry points */ struct netdev_name_node { struct hlist_node hlist; struct list_head list; struct net_device *dev; const char *name; }; int netdev_get_name(struct net *net, char *name, int ifindex); int dev_change_name(struct net_device *dev, const char *newname); int netdev_name_node_alt_create(struct net_device *dev, const char *name); int netdev_name_node_alt_destroy(struct net_device *dev, const char *name); int dev_validate_mtu(struct net_device *dev, int mtu, struct netlink_ext_ack *extack); int dev_set_mtu_ext(struct net_device *dev, int mtu, struct netlink_ext_ack *extack); int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_item_id *ppid); int dev_get_phys_port_name(struct net_device *dev, char *name, size_t len); int dev_change_proto_down(struct net_device *dev, bool proto_down); void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, u32 value); typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, int expected_fd, u32 flags); int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len); void dev_set_group(struct net_device *dev, int new_group); int dev_change_carrier(struct net_device *dev, bool new_carrier); void __dev_set_rx_mode(struct net_device *dev); void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, unsigned int gchanges, u32 portid, const struct nlmsghdr *nlh); void unregister_netdevice_many_notify(struct list_head *head, u32 portid, const struct nlmsghdr *nlh); static inline void netif_set_gso_max_size(struct net_device *dev, unsigned int size) { /* dev->gso_max_size is read locklessly from sk_setup_caps() */ WRITE_ONCE(dev->gso_max_size, size); if (size <= GSO_LEGACY_MAX_SIZE) WRITE_ONCE(dev->gso_ipv4_max_size, size); } static inline void netif_set_gso_max_segs(struct net_device *dev, unsigned int segs) { /* dev->gso_max_segs is read locklessly from sk_setup_caps() */ WRITE_ONCE(dev->gso_max_segs, segs); } static inline void netif_set_gro_max_size(struct net_device *dev, unsigned int size) { /* This pairs with the READ_ONCE() in skb_gro_receive() */ WRITE_ONCE(dev->gro_max_size, size); if (size <= GRO_LEGACY_MAX_SIZE) WRITE_ONCE(dev->gro_ipv4_max_size, size); } static inline void netif_set_gso_ipv4_max_size(struct net_device *dev, unsigned int size) { /* dev->gso_ipv4_max_size is read locklessly from sk_setup_caps() */ WRITE_ONCE(dev->gso_ipv4_max_size, size); } static inline void netif_set_gro_ipv4_max_size(struct net_device *dev, unsigned int size) { /* This pairs with the READ_ONCE() in skb_gro_receive() */ WRITE_ONCE(dev->gro_ipv4_max_size, size); } int rps_cpumask_housekeeping(struct cpumask *mask); #endif
1039 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 // SPDX-License-Identifier: GPL-2.0 /* * USB-ACPI glue code * * Copyright 2012 Red Hat <mjg@redhat.com> */ #include <linux/module.h> #include <linux/usb.h> #include <linux/device.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/acpi.h> #include <linux/pci.h> #include <linux/usb/hcd.h> #include "hub.h" /** * usb_acpi_power_manageable - check whether usb port has * acpi power resource. * @hdev: USB device belonging to the usb hub * @index: port index based zero * * Return true if the port has acpi power resource and false if no. */ bool usb_acpi_power_manageable(struct usb_device *hdev, int index) { acpi_handle port_handle; int port1 = index + 1; port_handle = usb_get_hub_port_acpi_handle(hdev, port1); if (port_handle) return acpi_bus_power_manageable(port_handle); else return false; } EXPORT_SYMBOL_GPL(usb_acpi_power_manageable); #define UUID_USB_CONTROLLER_DSM "ce2ee385-00e6-48cb-9f05-2edb927c4899" #define USB_DSM_DISABLE_U1_U2_FOR_PORT 5 /** * usb_acpi_port_lpm_incapable - check if lpm should be disabled for a port. * @hdev: USB device belonging to the usb hub * @index: zero based port index * * Some USB3 ports may not support USB3 link power management U1/U2 states * due to different retimer setup. ACPI provides _DSM method which returns 0x01 * if U1 and U2 states should be disabled. Evaluate _DSM with: * Arg0: UUID = ce2ee385-00e6-48cb-9f05-2edb927c4899 * Arg1: Revision ID = 0 * Arg2: Function Index = 5 * Arg3: (empty) * * Return 1 if USB3 port is LPM incapable, negative on error, otherwise 0 */ int usb_acpi_port_lpm_incapable(struct usb_device *hdev, int index) { union acpi_object *obj; acpi_handle port_handle; int port1 = index + 1; guid_t guid; int ret; ret = guid_parse(UUID_USB_CONTROLLER_DSM, &guid); if (ret) return ret; port_handle = usb_get_hub_port_acpi_handle(hdev, port1); if (!port_handle) { dev_dbg(&hdev->dev, "port-%d no acpi handle\n", port1); return -ENODEV; } if (!acpi_check_dsm(port_handle, &guid, 0, BIT(USB_DSM_DISABLE_U1_U2_FOR_PORT))) { dev_dbg(&hdev->dev, "port-%d no _DSM function %d\n", port1, USB_DSM_DISABLE_U1_U2_FOR_PORT); return -ENODEV; } obj = acpi_evaluate_dsm_typed(port_handle, &guid, 0, USB_DSM_DISABLE_U1_U2_FOR_PORT, NULL, ACPI_TYPE_INTEGER); if (!obj) { dev_dbg(&hdev->dev, "evaluate port-%d _DSM failed\n", port1); return -EINVAL; } if (obj->integer.value == 0x01) ret = 1; ACPI_FREE(obj); return ret; } EXPORT_SYMBOL_GPL(usb_acpi_port_lpm_incapable); /** * usb_acpi_set_power_state - control usb port's power via acpi power * resource * @hdev: USB device belonging to the usb hub * @index: port index based zero * @enable: power state expected to be set * * Notice to use usb_acpi_power_manageable() to check whether the usb port * has acpi power resource before invoking this function. * * Returns 0 on success, else negative errno. */ int usb_acpi_set_power_state(struct usb_device *hdev, int index, bool enable) { struct usb_hub *hub = usb_hub_to_struct_hub(hdev); struct usb_port *port_dev; acpi_handle port_handle; unsigned char state; int port1 = index + 1; int error = -EINVAL; if (!hub) return -ENODEV; port_dev = hub->ports[port1 - 1]; port_handle = (acpi_handle) usb_get_hub_port_acpi_handle(hdev, port1); if (!port_handle) return error; if (enable) state = ACPI_STATE_D0; else state = ACPI_STATE_D3_COLD; error = acpi_bus_set_power(port_handle, state); if (!error) dev_dbg(&port_dev->dev, "acpi: power was set to %d\n", enable); else dev_dbg(&port_dev->dev, "acpi: power failed to be set\n"); return error; } EXPORT_SYMBOL_GPL(usb_acpi_set_power_state); static enum usb_port_connect_type usb_acpi_get_connect_type(acpi_handle handle, struct acpi_pld_info *pld) { enum usb_port_connect_type connect_type = USB_PORT_CONNECT_TYPE_UNKNOWN; struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; union acpi_object *upc = NULL; acpi_status status; /* * According to 9.14 in ACPI Spec 6.2. _PLD indicates whether usb port * is user visible and _UPC indicates whether it is connectable. If * the port was visible and connectable, it could be freely connected * and disconnected with USB devices. If no visible and connectable, * a usb device is directly hard-wired to the port. If no visible and * no connectable, the port would be not used. */ status = acpi_evaluate_object(handle, "_UPC", NULL, &buffer); if (ACPI_FAILURE(status)) goto out; upc = buffer.pointer; if (!upc || (upc->type != ACPI_TYPE_PACKAGE) || upc->package.count != 4) goto out; if (upc->package.elements[0].integer.value) if (pld->user_visible) connect_type = USB_PORT_CONNECT_TYPE_HOT_PLUG; else connect_type = USB_PORT_CONNECT_TYPE_HARD_WIRED; else if (!pld->user_visible) connect_type = USB_PORT_NOT_USED; out: kfree(upc); return connect_type; } /* * Private to usb-acpi, all the core needs to know is that * port_dev->location is non-zero when it has been set by the firmware. */ #define USB_ACPI_LOCATION_VALID (1 << 31) static struct acpi_device * usb_acpi_get_companion_for_port(struct usb_port *port_dev) { struct usb_device *udev; struct acpi_device *adev; acpi_handle *parent_handle; int port1; /* Get the struct usb_device point of port's hub */ udev = to_usb_device(port_dev->dev.parent->parent); /* * The root hub ports' parent is the root hub. The non-root-hub * ports' parent is the parent hub port which the hub is * connected to. */ if (!udev->parent) { adev = ACPI_COMPANION(&udev->dev); port1 = usb_hcd_find_raw_port_number(bus_to_hcd(udev->bus), port_dev->portnum); } else { parent_handle = usb_get_hub_port_acpi_handle(udev->parent, udev->portnum); if (!parent_handle) return NULL; adev = acpi_fetch_acpi_dev(parent_handle); port1 = port_dev->portnum; } return acpi_find_child_by_adr(adev, port1); } static struct acpi_device * usb_acpi_find_companion_for_port(struct usb_port *port_dev) { struct acpi_device *adev; struct acpi_pld_info *pld; acpi_handle *handle; acpi_status status; adev = usb_acpi_get_companion_for_port(port_dev); if (!adev) return NULL; handle = adev->handle; status = acpi_get_physical_device_location(handle, &pld); if (ACPI_SUCCESS(status) && pld) { port_dev->location = USB_ACPI_LOCATION_VALID | pld->group_token << 8 | pld->group_position; port_dev->connect_type = usb_acpi_get_connect_type(handle, pld); ACPI_FREE(pld); } return adev; } static struct acpi_device * usb_acpi_find_companion_for_device(struct usb_device *udev) { struct acpi_device *adev; struct usb_port *port_dev; struct usb_hub *hub; if (!udev->parent) { /* * root hub is only child (_ADR=0) under its parent, the HC. * sysdev pointer is the HC as seen from firmware. */ adev = ACPI_COMPANION(udev->bus->sysdev); return acpi_find_child_device(adev, 0, false); } hub = usb_hub_to_struct_hub(udev->parent); if (!hub) return NULL; /* * This is an embedded USB device connected to a port and such * devices share port's ACPI companion. */ port_dev = hub->ports[udev->portnum - 1]; return usb_acpi_get_companion_for_port(port_dev); } static struct acpi_device *usb_acpi_find_companion(struct device *dev) { /* * The USB hierarchy like following: * * Device (EHC1) * Device (HUBN) * Device (PR01) * Device (PR11) * Device (PR12) * Device (FN12) * Device (FN13) * Device (PR13) * ... * where HUBN is root hub, and PRNN are USB ports and devices * connected to them, and FNNN are individualk functions for * connected composite USB devices. PRNN and FNNN may contain * _CRS and other methods describing sideband resources for * the connected device. * * On the kernel side both root hub and embedded USB devices are * represented as instances of usb_device structure, and ports * are represented as usb_port structures, so the whole process * is split into 2 parts: finding companions for devices and * finding companions for ports. * * Note that we do not handle individual functions of composite * devices yet, for that we would need to assign companions to * devices corresponding to USB interfaces. */ if (is_usb_device(dev)) return usb_acpi_find_companion_for_device(to_usb_device(dev)); else if (is_usb_port(dev)) return usb_acpi_find_companion_for_port(to_usb_port(dev)); return NULL; } static bool usb_acpi_bus_match(struct device *dev) { return is_usb_device(dev) || is_usb_port(dev); } static struct acpi_bus_type usb_acpi_bus = { .name = "USB", .match = usb_acpi_bus_match, .find_companion = usb_acpi_find_companion, }; int usb_acpi_register(void) { return register_acpi_bus_type(&usb_acpi_bus); } void usb_acpi_unregister(void) { unregister_acpi_bus_type(&usb_acpi_bus); }
4 1 9 2 9 1 1 1 9 9 2 7 7 7 6 4 2 2 1 1 1 1 112 3 3 3 112 112 112 112 112 111 111 66 110 111 66 65 54 11 13 9 9 9 9 9 9 111 6 6 113 6 118 1 4 3 3 2 2 2 4 1 1 1 5 5 9 8 3 8 7 6 5 5 2 2 2 5 11 3 3 2 2 2 2 11 21 11 1 65 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 over IPv4 tunnel device - Simple Internet Transition (SIT) * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * * Changes: * Roger Venning <r.venning@telstra.com>: 6to4 support * Nate Thompson <nate@thebog.net>: 6to4 support * Fred Templin <fred.l.templin@boeing.com>: isatap support */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/icmp.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/init.h> #include <linux/netfilter_ipv4.h> #include <linux/if_ether.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/transp_v6.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> #include <net/ndisc.h> #include <net/addrconf.h> #include <net/ip.h> #include <net/udp.h> #include <net/icmp.h> #include <net/ip_tunnels.h> #include <net/inet_ecn.h> #include <net/xfrm.h> #include <net/dsfield.h> #include <net/net_namespace.h> #include <net/netns/generic.h> /* This version of net/ipv6/sit.c is cloned of net/ipv4/ip_gre.c For comments look at net/ipv4/ip_gre.c --ANK */ #define IP6_SIT_HASH_SIZE 16 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static int ipip6_tunnel_init(struct net_device *dev); static void ipip6_tunnel_setup(struct net_device *dev); static void ipip6_dev_free(struct net_device *dev); static bool check_6rd(struct ip_tunnel *tunnel, const struct in6_addr *v6dst, __be32 *v4dst); static struct rtnl_link_ops sit_link_ops __read_mostly; static unsigned int sit_net_id __read_mostly; struct sit_net { struct ip_tunnel __rcu *tunnels_r_l[IP6_SIT_HASH_SIZE]; struct ip_tunnel __rcu *tunnels_r[IP6_SIT_HASH_SIZE]; struct ip_tunnel __rcu *tunnels_l[IP6_SIT_HASH_SIZE]; struct ip_tunnel __rcu *tunnels_wc[1]; struct ip_tunnel __rcu **tunnels[4]; struct net_device *fb_tunnel_dev; }; static inline struct sit_net *dev_to_sit_net(struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); return net_generic(t->net, sit_net_id); } /* * Must be invoked with rcu_read_lock */ static struct ip_tunnel *ipip6_tunnel_lookup(struct net *net, struct net_device *dev, __be32 remote, __be32 local, int sifindex) { unsigned int h0 = HASH(remote); unsigned int h1 = HASH(local); struct ip_tunnel *t; struct sit_net *sitn = net_generic(net, sit_net_id); int ifindex = dev ? dev->ifindex : 0; for_each_ip_tunnel_rcu(t, sitn->tunnels_r_l[h0 ^ h1]) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && (!dev || !t->parms.link || ifindex == t->parms.link || sifindex == t->parms.link) && (t->dev->flags & IFF_UP)) return t; } for_each_ip_tunnel_rcu(t, sitn->tunnels_r[h0]) { if (remote == t->parms.iph.daddr && (!dev || !t->parms.link || ifindex == t->parms.link || sifindex == t->parms.link) && (t->dev->flags & IFF_UP)) return t; } for_each_ip_tunnel_rcu(t, sitn->tunnels_l[h1]) { if (local == t->parms.iph.saddr && (!dev || !t->parms.link || ifindex == t->parms.link || sifindex == t->parms.link) && (t->dev->flags & IFF_UP)) return t; } t = rcu_dereference(sitn->tunnels_wc[0]); if (t && (t->dev->flags & IFF_UP)) return t; return NULL; } static struct ip_tunnel __rcu **__ipip6_bucket(struct sit_net *sitn, struct ip_tunnel_parm *parms) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; unsigned int h = 0; int prio = 0; if (remote) { prio |= 2; h ^= HASH(remote); } if (local) { prio |= 1; h ^= HASH(local); } return &sitn->tunnels[prio][h]; } static inline struct ip_tunnel __rcu **ipip6_bucket(struct sit_net *sitn, struct ip_tunnel *t) { return __ipip6_bucket(sitn, &t->parms); } static void ipip6_tunnel_unlink(struct sit_net *sitn, struct ip_tunnel *t) { struct ip_tunnel __rcu **tp; struct ip_tunnel *iter; for (tp = ipip6_bucket(sitn, t); (iter = rtnl_dereference(*tp)) != NULL; tp = &iter->next) { if (t == iter) { rcu_assign_pointer(*tp, t->next); break; } } } static void ipip6_tunnel_link(struct sit_net *sitn, struct ip_tunnel *t) { struct ip_tunnel __rcu **tp = ipip6_bucket(sitn, t); rcu_assign_pointer(t->next, rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); } static void ipip6_tunnel_clone_6rd(struct net_device *dev, struct sit_net *sitn) { #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel *t = netdev_priv(dev); if (dev == sitn->fb_tunnel_dev || !sitn->fb_tunnel_dev) { ipv6_addr_set(&t->ip6rd.prefix, htonl(0x20020000), 0, 0, 0); t->ip6rd.relay_prefix = 0; t->ip6rd.prefixlen = 16; t->ip6rd.relay_prefixlen = 0; } else { struct ip_tunnel *t0 = netdev_priv(sitn->fb_tunnel_dev); memcpy(&t->ip6rd, &t0->ip6rd, sizeof(t->ip6rd)); } #endif } static int ipip6_tunnel_create(struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); struct net *net = dev_net(dev); struct sit_net *sitn = net_generic(net, sit_net_id); int err; __dev_addr_set(dev, &t->parms.iph.saddr, 4); memcpy(dev->broadcast, &t->parms.iph.daddr, 4); if ((__force u16)t->parms.i_flags & SIT_ISATAP) dev->priv_flags |= IFF_ISATAP; dev->rtnl_link_ops = &sit_link_ops; err = register_netdevice(dev); if (err < 0) goto out; ipip6_tunnel_clone_6rd(dev, sitn); ipip6_tunnel_link(sitn, t); return 0; out: return err; } static struct ip_tunnel *ipip6_tunnel_locate(struct net *net, struct ip_tunnel_parm *parms, int create) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; struct ip_tunnel *t, *nt; struct ip_tunnel __rcu **tp; struct net_device *dev; char name[IFNAMSIZ]; struct sit_net *sitn = net_generic(net, sit_net_id); for (tp = __ipip6_bucket(sitn, parms); (t = rtnl_dereference(*tp)) != NULL; tp = &t->next) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && parms->link == t->parms.link) { if (create) return NULL; else return t; } } if (!create) goto failed; if (parms->name[0]) { if (!dev_valid_name(parms->name)) goto failed; strscpy(name, parms->name, IFNAMSIZ); } else { strcpy(name, "sit%d"); } dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, ipip6_tunnel_setup); if (!dev) return NULL; dev_net_set(dev, net); nt = netdev_priv(dev); nt->parms = *parms; if (ipip6_tunnel_create(dev) < 0) goto failed_free; if (!parms->name[0]) strcpy(parms->name, dev->name); return nt; failed_free: free_netdev(dev); failed: return NULL; } #define for_each_prl_rcu(start) \ for (prl = rcu_dereference(start); \ prl; \ prl = rcu_dereference(prl->next)) static struct ip_tunnel_prl_entry * __ipip6_tunnel_locate_prl(struct ip_tunnel *t, __be32 addr) { struct ip_tunnel_prl_entry *prl; for_each_prl_rcu(t->prl) if (prl->addr == addr) break; return prl; } static int ipip6_tunnel_get_prl(struct net_device *dev, struct ip_tunnel_prl __user *a) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_prl kprl, *kp; struct ip_tunnel_prl_entry *prl; unsigned int cmax, c = 0, ca, len; int ret = 0; if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) return -EINVAL; if (copy_from_user(&kprl, a, sizeof(kprl))) return -EFAULT; cmax = kprl.datalen / sizeof(kprl); if (cmax > 1 && kprl.addr != htonl(INADDR_ANY)) cmax = 1; /* For simple GET or for root users, * we try harder to allocate. */ kp = (cmax <= 1 || capable(CAP_NET_ADMIN)) ? kcalloc(cmax, sizeof(*kp), GFP_KERNEL_ACCOUNT | __GFP_NOWARN) : NULL; ca = min(t->prl_count, cmax); if (!kp) { /* We don't try hard to allocate much memory for * non-root users. * For root users, retry allocating enough memory for * the answer. */ kp = kcalloc(ca, sizeof(*kp), GFP_ATOMIC | __GFP_ACCOUNT | __GFP_NOWARN); if (!kp) { ret = -ENOMEM; goto out; } } rcu_read_lock(); for_each_prl_rcu(t->prl) { if (c >= cmax) break; if (kprl.addr != htonl(INADDR_ANY) && prl->addr != kprl.addr) continue; kp[c].addr = prl->addr; kp[c].flags = prl->flags; c++; if (kprl.addr != htonl(INADDR_ANY)) break; } rcu_read_unlock(); len = sizeof(*kp) * c; ret = 0; if ((len && copy_to_user(a + 1, kp, len)) || put_user(len, &a->datalen)) ret = -EFAULT; kfree(kp); out: return ret; } static int ipip6_tunnel_add_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a, int chg) { struct ip_tunnel_prl_entry *p; int err = 0; if (a->addr == htonl(INADDR_ANY)) return -EINVAL; ASSERT_RTNL(); for (p = rtnl_dereference(t->prl); p; p = rtnl_dereference(p->next)) { if (p->addr == a->addr) { if (chg) { p->flags = a->flags; goto out; } err = -EEXIST; goto out; } } if (chg) { err = -ENXIO; goto out; } p = kzalloc(sizeof(struct ip_tunnel_prl_entry), GFP_KERNEL); if (!p) { err = -ENOBUFS; goto out; } p->next = t->prl; p->addr = a->addr; p->flags = a->flags; t->prl_count++; rcu_assign_pointer(t->prl, p); out: return err; } static void prl_list_destroy_rcu(struct rcu_head *head) { struct ip_tunnel_prl_entry *p, *n; p = container_of(head, struct ip_tunnel_prl_entry, rcu_head); do { n = rcu_dereference_protected(p->next, 1); kfree(p); p = n; } while (p); } static int ipip6_tunnel_del_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a) { struct ip_tunnel_prl_entry *x; struct ip_tunnel_prl_entry __rcu **p; int err = 0; ASSERT_RTNL(); if (a && a->addr != htonl(INADDR_ANY)) { for (p = &t->prl; (x = rtnl_dereference(*p)) != NULL; p = &x->next) { if (x->addr == a->addr) { *p = x->next; kfree_rcu(x, rcu_head); t->prl_count--; goto out; } } err = -ENXIO; } else { x = rtnl_dereference(t->prl); if (x) { t->prl_count = 0; call_rcu(&x->rcu_head, prl_list_destroy_rcu); t->prl = NULL; } } out: return err; } static int ipip6_tunnel_prl_ctl(struct net_device *dev, struct ip_tunnel_prl __user *data, int cmd) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_prl prl; int err; if (!ns_capable(t->net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) return -EINVAL; if (copy_from_user(&prl, data, sizeof(prl))) return -EFAULT; switch (cmd) { case SIOCDELPRL: err = ipip6_tunnel_del_prl(t, &prl); break; case SIOCADDPRL: case SIOCCHGPRL: err = ipip6_tunnel_add_prl(t, &prl, cmd == SIOCCHGPRL); break; } dst_cache_reset(&t->dst_cache); netdev_state_change(dev); return err; } static int isatap_chksrc(struct sk_buff *skb, const struct iphdr *iph, struct ip_tunnel *t) { struct ip_tunnel_prl_entry *p; int ok = 1; rcu_read_lock(); p = __ipip6_tunnel_locate_prl(t, iph->saddr); if (p) { if (p->flags & PRL_DEFAULT) skb->ndisc_nodetype = NDISC_NODETYPE_DEFAULT; else skb->ndisc_nodetype = NDISC_NODETYPE_NODEFAULT; } else { const struct in6_addr *addr6 = &ipv6_hdr(skb)->saddr; if (ipv6_addr_is_isatap(addr6) && (addr6->s6_addr32[3] == iph->saddr) && ipv6_chk_prefix(addr6, t->dev)) skb->ndisc_nodetype = NDISC_NODETYPE_HOST; else ok = 0; } rcu_read_unlock(); return ok; } static void ipip6_tunnel_uninit(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct sit_net *sitn = net_generic(tunnel->net, sit_net_id); if (dev == sitn->fb_tunnel_dev) { RCU_INIT_POINTER(sitn->tunnels_wc[0], NULL); } else { ipip6_tunnel_unlink(sitn, tunnel); ipip6_tunnel_del_prl(tunnel, NULL); } dst_cache_reset(&tunnel->dst_cache); netdev_put(dev, &tunnel->dev_tracker); } static int ipip6_err(struct sk_buff *skb, u32 info) { const struct iphdr *iph = (const struct iphdr *)skb->data; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; unsigned int data_len = 0; struct ip_tunnel *t; int sifindex; int err; switch (type) { default: case ICMP_PARAMETERPROB: return 0; case ICMP_DEST_UNREACH: switch (code) { case ICMP_SR_FAILED: /* Impossible event. */ return 0; default: /* All others are translated to HOST_UNREACH. rfc2003 contains "deep thoughts" about NET_UNREACH, I believe they are just ether pollution. --ANK */ break; } break; case ICMP_TIME_EXCEEDED: if (code != ICMP_EXC_TTL) return 0; data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */ break; case ICMP_REDIRECT: break; } err = -ENOENT; sifindex = netif_is_l3_master(skb->dev) ? IPCB(skb)->iif : 0; t = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, iph->daddr, iph->saddr, sifindex); if (!t) goto out; if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { ipv4_update_pmtu(skb, dev_net(skb->dev), info, t->parms.link, iph->protocol); err = 0; goto out; } if (type == ICMP_REDIRECT) { ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, iph->protocol); err = 0; goto out; } err = 0; if (__in6_dev_get(skb->dev) && !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4, type, data_len)) goto out; if (t->parms.iph.daddr == 0) goto out; if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) goto out; if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) t->err_count++; else t->err_count = 1; t->err_time = jiffies; out: return err; } static inline bool is_spoofed_6rd(struct ip_tunnel *tunnel, const __be32 v4addr, const struct in6_addr *v6addr) { __be32 v4embed = 0; if (check_6rd(tunnel, v6addr, &v4embed) && v4addr != v4embed) return true; return false; } /* Checks if an address matches an address on the tunnel interface. * Used to detect the NAT of proto 41 packets and let them pass spoofing test. * Long story: * This function is called after we considered the packet as spoofed * in is_spoofed_6rd. * We may have a router that is doing NAT for proto 41 packets * for an internal station. Destination a.a.a.a/PREFIX:bbbb:bbbb * will be translated to n.n.n.n/PREFIX:bbbb:bbbb. And is_spoofed_6rd * function will return true, dropping the packet. * But, we can still check if is spoofed against the IP * addresses associated with the interface. */ static bool only_dnatted(const struct ip_tunnel *tunnel, const struct in6_addr *v6dst) { int prefix_len; #ifdef CONFIG_IPV6_SIT_6RD prefix_len = tunnel->ip6rd.prefixlen + 32 - tunnel->ip6rd.relay_prefixlen; #else prefix_len = 48; #endif return ipv6_chk_custom_prefix(v6dst, prefix_len, tunnel->dev); } /* Returns true if a packet is spoofed */ static bool packet_is_spoofed(struct sk_buff *skb, const struct iphdr *iph, struct ip_tunnel *tunnel) { const struct ipv6hdr *ipv6h; if (tunnel->dev->priv_flags & IFF_ISATAP) { if (!isatap_chksrc(skb, iph, tunnel)) return true; return false; } if (tunnel->dev->flags & IFF_POINTOPOINT) return false; ipv6h = ipv6_hdr(skb); if (unlikely(is_spoofed_6rd(tunnel, iph->saddr, &ipv6h->saddr))) { net_warn_ratelimited("Src spoofed %pI4/%pI6c -> %pI4/%pI6c\n", &iph->saddr, &ipv6h->saddr, &iph->daddr, &ipv6h->daddr); return true; } if (likely(!is_spoofed_6rd(tunnel, iph->daddr, &ipv6h->daddr))) return false; if (only_dnatted(tunnel, &ipv6h->daddr)) return false; net_warn_ratelimited("Dst spoofed %pI4/%pI6c -> %pI4/%pI6c\n", &iph->saddr, &ipv6h->saddr, &iph->daddr, &ipv6h->daddr); return true; } static int ipip6_rcv(struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); struct ip_tunnel *tunnel; int sifindex; int err; sifindex = netif_is_l3_master(skb->dev) ? IPCB(skb)->iif : 0; tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, iph->saddr, iph->daddr, sifindex); if (tunnel) { if (tunnel->parms.iph.protocol != IPPROTO_IPV6 && tunnel->parms.iph.protocol != 0) goto out; skb->mac_header = skb->network_header; skb_reset_network_header(skb); IPCB(skb)->flags = 0; skb->dev = tunnel->dev; if (packet_is_spoofed(skb, iph, tunnel)) { DEV_STATS_INC(tunnel->dev, rx_errors); goto out; } if (iptunnel_pull_header(skb, 0, htons(ETH_P_IPV6), !net_eq(tunnel->net, dev_net(tunnel->dev)))) goto out; /* skb can be uncloned in iptunnel_pull_header, so * old iph is no longer valid */ iph = (const struct iphdr *)skb_mac_header(skb); skb_reset_mac_header(skb); err = IP_ECN_decapsulate(iph, skb); if (unlikely(err)) { if (log_ecn_error) net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", &iph->saddr, iph->tos); if (err > 1) { DEV_STATS_INC(tunnel->dev, rx_frame_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto out; } } dev_sw_netstats_rx_add(tunnel->dev, skb->len); netif_rx(skb); return 0; } /* no tunnel matched, let upstream know, ipsec may handle it */ return 1; out: kfree_skb(skb); return 0; } static const struct tnl_ptk_info ipip_tpi = { /* no tunnel info required for ipip. */ .proto = htons(ETH_P_IP), }; #if IS_ENABLED(CONFIG_MPLS) static const struct tnl_ptk_info mplsip_tpi = { /* no tunnel info required for mplsip. */ .proto = htons(ETH_P_MPLS_UC), }; #endif static int sit_tunnel_rcv(struct sk_buff *skb, u8 ipproto) { const struct iphdr *iph; struct ip_tunnel *tunnel; int sifindex; sifindex = netif_is_l3_master(skb->dev) ? IPCB(skb)->iif : 0; iph = ip_hdr(skb); tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, iph->saddr, iph->daddr, sifindex); if (tunnel) { const struct tnl_ptk_info *tpi; if (tunnel->parms.iph.protocol != ipproto && tunnel->parms.iph.protocol != 0) goto drop; if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; #if IS_ENABLED(CONFIG_MPLS) if (ipproto == IPPROTO_MPLS) tpi = &mplsip_tpi; else #endif tpi = &ipip_tpi; if (iptunnel_pull_header(skb, 0, tpi->proto, false)) goto drop; skb_reset_mac_header(skb); return ip_tunnel_rcv(tunnel, skb, tpi, NULL, log_ecn_error); } return 1; drop: kfree_skb(skb); return 0; } static int ipip_rcv(struct sk_buff *skb) { return sit_tunnel_rcv(skb, IPPROTO_IPIP); } #if IS_ENABLED(CONFIG_MPLS) static int mplsip_rcv(struct sk_buff *skb) { return sit_tunnel_rcv(skb, IPPROTO_MPLS); } #endif /* * If the IPv6 address comes from 6rd / 6to4 (RFC 3056) addr space this function * stores the embedded IPv4 address in v4dst and returns true. */ static bool check_6rd(struct ip_tunnel *tunnel, const struct in6_addr *v6dst, __be32 *v4dst) { #ifdef CONFIG_IPV6_SIT_6RD if (ipv6_prefix_equal(v6dst, &tunnel->ip6rd.prefix, tunnel->ip6rd.prefixlen)) { unsigned int pbw0, pbi0; int pbi1; u32 d; pbw0 = tunnel->ip6rd.prefixlen >> 5; pbi0 = tunnel->ip6rd.prefixlen & 0x1f; d = tunnel->ip6rd.relay_prefixlen < 32 ? (ntohl(v6dst->s6_addr32[pbw0]) << pbi0) >> tunnel->ip6rd.relay_prefixlen : 0; pbi1 = pbi0 - tunnel->ip6rd.relay_prefixlen; if (pbi1 > 0) d |= ntohl(v6dst->s6_addr32[pbw0 + 1]) >> (32 - pbi1); *v4dst = tunnel->ip6rd.relay_prefix | htonl(d); return true; } #else if (v6dst->s6_addr16[0] == htons(0x2002)) { /* 6to4 v6 addr has 16 bits prefix, 32 v4addr, 16 SLA, ... */ memcpy(v4dst, &v6dst->s6_addr16[1], 4); return true; } #endif return false; } static inline __be32 try_6rd(struct ip_tunnel *tunnel, const struct in6_addr *v6dst) { __be32 dst = 0; check_6rd(tunnel, v6dst, &dst); return dst; } /* * This function assumes it is being called from dev_queue_xmit() * and that skb is filled properly by that function. */ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tiph = &tunnel->parms.iph; const struct ipv6hdr *iph6 = ipv6_hdr(skb); u8 tos = tunnel->parms.iph.tos; __be16 df = tiph->frag_off; struct rtable *rt; /* Route to the other host */ struct net_device *tdev; /* Device to other host */ unsigned int max_headroom; /* The extra header space needed */ __be32 dst = tiph->daddr; struct flowi4 fl4; int mtu; const struct in6_addr *addr6; int addr_type; u8 ttl; u8 protocol = IPPROTO_IPV6; int t_hlen = tunnel->hlen + sizeof(struct iphdr); if (tos == 1) tos = ipv6_get_dsfield(iph6); /* ISATAP (RFC4214) - must come before 6to4 */ if (dev->priv_flags & IFF_ISATAP) { struct neighbour *neigh = NULL; bool do_tx_error = false; if (skb_dst(skb)) neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr); if (!neigh) { net_dbg_ratelimited("nexthop == NULL\n"); goto tx_error; } addr6 = (const struct in6_addr *)&neigh->primary_key; addr_type = ipv6_addr_type(addr6); if ((addr_type & IPV6_ADDR_UNICAST) && ipv6_addr_is_isatap(addr6)) dst = addr6->s6_addr32[3]; else do_tx_error = true; neigh_release(neigh); if (do_tx_error) goto tx_error; } if (!dst) dst = try_6rd(tunnel, &iph6->daddr); if (!dst) { struct neighbour *neigh = NULL; bool do_tx_error = false; if (skb_dst(skb)) neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr); if (!neigh) { net_dbg_ratelimited("nexthop == NULL\n"); goto tx_error; } addr6 = (const struct in6_addr *)&neigh->primary_key; addr_type = ipv6_addr_type(addr6); if (addr_type == IPV6_ADDR_ANY) { addr6 = &ipv6_hdr(skb)->daddr; addr_type = ipv6_addr_type(addr6); } if ((addr_type & IPV6_ADDR_COMPATv4) != 0) dst = addr6->s6_addr32[3]; else do_tx_error = true; neigh_release(neigh); if (do_tx_error) goto tx_error; } flowi4_init_output(&fl4, tunnel->parms.link, tunnel->fwmark, RT_TOS(tos), RT_SCOPE_UNIVERSE, IPPROTO_IPV6, 0, dst, tiph->saddr, 0, 0, sock_net_uid(tunnel->net, NULL)); rt = dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr); if (!rt) { rt = ip_route_output_flow(tunnel->net, &fl4, NULL); if (IS_ERR(rt)) { DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error_icmp; } dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst, fl4.saddr); } if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { ip_rt_put(rt); DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error_icmp; } tdev = rt->dst.dev; if (tdev == dev) { ip_rt_put(rt); DEV_STATS_INC(dev, collisions); goto tx_error; } if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4)) { ip_rt_put(rt); goto tx_error; } if (df) { mtu = dst_mtu(&rt->dst) - t_hlen; if (mtu < IPV4_MIN_MTU) { DEV_STATS_INC(dev, collisions); ip_rt_put(rt); goto tx_error; } if (mtu < IPV6_MIN_MTU) { mtu = IPV6_MIN_MTU; df = 0; } if (tunnel->parms.iph.daddr) skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->len > mtu && !skb_is_gso(skb)) { icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); ip_rt_put(rt); goto tx_error; } } if (tunnel->err_count > 0) { if (time_before(jiffies, tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { tunnel->err_count--; dst_link_failure(skb); } else tunnel->err_count = 0; } /* * Okay, now see if we can stuff it in the buffer as-is. */ max_headroom = LL_RESERVED_SPACE(tdev) + t_hlen; if (skb_headroom(skb) < max_headroom || skb_shared(skb) || (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) { ip_rt_put(rt); DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return NETDEV_TX_OK; } if (skb->sk) skb_set_owner_w(new_skb, skb->sk); dev_kfree_skb(skb); skb = new_skb; iph6 = ipv6_hdr(skb); } ttl = tiph->ttl; if (ttl == 0) ttl = iph6->hop_limit; tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6)); if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0) { ip_rt_put(rt); goto tx_error; } skb_set_inner_ipproto(skb, IPPROTO_IPV6); iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, df, !net_eq(tunnel->net, dev_net(dev))); return NETDEV_TX_OK; tx_error_icmp: dst_link_failure(skb); tx_error: kfree_skb(skb); DEV_STATS_INC(dev, tx_errors); return NETDEV_TX_OK; } static netdev_tx_t sit_tunnel_xmit__(struct sk_buff *skb, struct net_device *dev, u8 ipproto) { struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tiph = &tunnel->parms.iph; if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4)) goto tx_error; skb_set_inner_ipproto(skb, ipproto); ip_tunnel_xmit(skb, dev, tiph, ipproto); return NETDEV_TX_OK; tx_error: kfree_skb(skb); DEV_STATS_INC(dev, tx_errors); return NETDEV_TX_OK; } static netdev_tx_t sit_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { if (!pskb_inet_may_pull(skb)) goto tx_err; switch (skb->protocol) { case htons(ETH_P_IP): sit_tunnel_xmit__(skb, dev, IPPROTO_IPIP); break; case htons(ETH_P_IPV6): ipip6_tunnel_xmit(skb, dev); break; #if IS_ENABLED(CONFIG_MPLS) case htons(ETH_P_MPLS_UC): sit_tunnel_xmit__(skb, dev, IPPROTO_MPLS); break; #endif default: goto tx_err; } return NETDEV_TX_OK; tx_err: DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); return NETDEV_TX_OK; } static void ipip6_tunnel_bind_dev(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); int t_hlen = tunnel->hlen + sizeof(struct iphdr); struct net_device *tdev = NULL; int hlen = LL_MAX_HEADER; const struct iphdr *iph; struct flowi4 fl4; iph = &tunnel->parms.iph; if (iph->daddr) { struct rtable *rt = ip_route_output_ports(tunnel->net, &fl4, NULL, iph->daddr, iph->saddr, 0, 0, IPPROTO_IPV6, RT_TOS(iph->tos), tunnel->parms.link); if (!IS_ERR(rt)) { tdev = rt->dst.dev; ip_rt_put(rt); } dev->flags |= IFF_POINTOPOINT; } if (!tdev && tunnel->parms.link) tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link); if (tdev && !netif_is_l3_master(tdev)) { int mtu; mtu = tdev->mtu - t_hlen; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; WRITE_ONCE(dev->mtu, mtu); hlen = tdev->hard_header_len + tdev->needed_headroom; } dev->needed_headroom = t_hlen + hlen; } static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p, __u32 fwmark) { struct net *net = t->net; struct sit_net *sitn = net_generic(net, sit_net_id); ipip6_tunnel_unlink(sitn, t); synchronize_net(); t->parms.iph.saddr = p->iph.saddr; t->parms.iph.daddr = p->iph.daddr; __dev_addr_set(t->dev, &p->iph.saddr, 4); memcpy(t->dev->broadcast, &p->iph.daddr, 4); ipip6_tunnel_link(sitn, t); t->parms.iph.ttl = p->iph.ttl; t->parms.iph.tos = p->iph.tos; t->parms.iph.frag_off = p->iph.frag_off; if (t->parms.link != p->link || t->fwmark != fwmark) { t->parms.link = p->link; t->fwmark = fwmark; ipip6_tunnel_bind_dev(t->dev); } dst_cache_reset(&t->dst_cache); netdev_state_change(t->dev); } #ifdef CONFIG_IPV6_SIT_6RD static int ipip6_tunnel_update_6rd(struct ip_tunnel *t, struct ip_tunnel_6rd *ip6rd) { struct in6_addr prefix; __be32 relay_prefix; if (ip6rd->relay_prefixlen > 32 || ip6rd->prefixlen + (32 - ip6rd->relay_prefixlen) > 64) return -EINVAL; ipv6_addr_prefix(&prefix, &ip6rd->prefix, ip6rd->prefixlen); if (!ipv6_addr_equal(&prefix, &ip6rd->prefix)) return -EINVAL; if (ip6rd->relay_prefixlen) relay_prefix = ip6rd->relay_prefix & htonl(0xffffffffUL << (32 - ip6rd->relay_prefixlen)); else relay_prefix = 0; if (relay_prefix != ip6rd->relay_prefix) return -EINVAL; t->ip6rd.prefix = prefix; t->ip6rd.relay_prefix = relay_prefix; t->ip6rd.prefixlen = ip6rd->prefixlen; t->ip6rd.relay_prefixlen = ip6rd->relay_prefixlen; dst_cache_reset(&t->dst_cache); netdev_state_change(t->dev); return 0; } static int ipip6_tunnel_get6rd(struct net_device *dev, struct ip_tunnel_parm __user *data) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_6rd ip6rd; struct ip_tunnel_parm p; if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) { if (copy_from_user(&p, data, sizeof(p))) return -EFAULT; t = ipip6_tunnel_locate(t->net, &p, 0); } if (!t) t = netdev_priv(dev); ip6rd.prefix = t->ip6rd.prefix; ip6rd.relay_prefix = t->ip6rd.relay_prefix; ip6rd.prefixlen = t->ip6rd.prefixlen; ip6rd.relay_prefixlen = t->ip6rd.relay_prefixlen; if (copy_to_user(data, &ip6rd, sizeof(ip6rd))) return -EFAULT; return 0; } static int ipip6_tunnel_6rdctl(struct net_device *dev, struct ip_tunnel_6rd __user *data, int cmd) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_6rd ip6rd; int err; if (!ns_capable(t->net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (copy_from_user(&ip6rd, data, sizeof(ip6rd))) return -EFAULT; if (cmd != SIOCDEL6RD) { err = ipip6_tunnel_update_6rd(t, &ip6rd); if (err < 0) return err; } else ipip6_tunnel_clone_6rd(dev, dev_to_sit_net(dev)); return 0; } #endif /* CONFIG_IPV6_SIT_6RD */ static bool ipip6_valid_ip_proto(u8 ipproto) { return ipproto == IPPROTO_IPV6 || ipproto == IPPROTO_IPIP || #if IS_ENABLED(CONFIG_MPLS) ipproto == IPPROTO_MPLS || #endif ipproto == 0; } static int __ipip6_tunnel_ioctl_validate(struct net *net, struct ip_tunnel_parm *p) { if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (!ipip6_valid_ip_proto(p->iph.protocol)) return -EINVAL; if (p->iph.version != 4 || p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF))) return -EINVAL; if (p->iph.ttl) p->iph.frag_off |= htons(IP_DF); return 0; } static int ipip6_tunnel_get(struct net_device *dev, struct ip_tunnel_parm *p) { struct ip_tunnel *t = netdev_priv(dev); if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) t = ipip6_tunnel_locate(t->net, p, 0); if (!t) t = netdev_priv(dev); memcpy(p, &t->parms, sizeof(*p)); return 0; } static int ipip6_tunnel_add(struct net_device *dev, struct ip_tunnel_parm *p) { struct ip_tunnel *t = netdev_priv(dev); int err; err = __ipip6_tunnel_ioctl_validate(t->net, p); if (err) return err; t = ipip6_tunnel_locate(t->net, p, 1); if (!t) return -ENOBUFS; return 0; } static int ipip6_tunnel_change(struct net_device *dev, struct ip_tunnel_parm *p) { struct ip_tunnel *t = netdev_priv(dev); int err; err = __ipip6_tunnel_ioctl_validate(t->net, p); if (err) return err; t = ipip6_tunnel_locate(t->net, p, 0); if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) { if (!t) return -ENOENT; } else { if (t) { if (t->dev != dev) return -EEXIST; } else { if (((dev->flags & IFF_POINTOPOINT) && !p->iph.daddr) || (!(dev->flags & IFF_POINTOPOINT) && p->iph.daddr)) return -EINVAL; t = netdev_priv(dev); } ipip6_tunnel_update(t, p, t->fwmark); } return 0; } static int ipip6_tunnel_del(struct net_device *dev, struct ip_tunnel_parm *p) { struct ip_tunnel *t = netdev_priv(dev); if (!ns_capable(t->net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) { t = ipip6_tunnel_locate(t->net, p, 0); if (!t) return -ENOENT; if (t == netdev_priv(dev_to_sit_net(dev)->fb_tunnel_dev)) return -EPERM; dev = t->dev; } unregister_netdevice(dev); return 0; } static int ipip6_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) { switch (cmd) { case SIOCGETTUNNEL: return ipip6_tunnel_get(dev, p); case SIOCADDTUNNEL: return ipip6_tunnel_add(dev, p); case SIOCCHGTUNNEL: return ipip6_tunnel_change(dev, p); case SIOCDELTUNNEL: return ipip6_tunnel_del(dev, p); default: return -EINVAL; } } static int ipip6_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd) { switch (cmd) { case SIOCGETTUNNEL: case SIOCADDTUNNEL: case SIOCCHGTUNNEL: case SIOCDELTUNNEL: return ip_tunnel_siocdevprivate(dev, ifr, data, cmd); case SIOCGETPRL: return ipip6_tunnel_get_prl(dev, data); case SIOCADDPRL: case SIOCDELPRL: case SIOCCHGPRL: return ipip6_tunnel_prl_ctl(dev, data, cmd); #ifdef CONFIG_IPV6_SIT_6RD case SIOCGET6RD: return ipip6_tunnel_get6rd(dev, data); case SIOCADD6RD: case SIOCCHG6RD: case SIOCDEL6RD: return ipip6_tunnel_6rdctl(dev, data, cmd); #endif default: return -EINVAL; } } static const struct net_device_ops ipip6_netdev_ops = { .ndo_init = ipip6_tunnel_init, .ndo_uninit = ipip6_tunnel_uninit, .ndo_start_xmit = sit_tunnel_xmit, .ndo_siocdevprivate = ipip6_tunnel_siocdevprivate, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_tunnel_ctl = ipip6_tunnel_ctl, }; static void ipip6_dev_free(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); dst_cache_destroy(&tunnel->dst_cache); free_percpu(dev->tstats); } #define SIT_FEATURES (NETIF_F_SG | \ NETIF_F_FRAGLIST | \ NETIF_F_HIGHDMA | \ NETIF_F_GSO_SOFTWARE | \ NETIF_F_HW_CSUM) static void ipip6_tunnel_setup(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); int t_hlen = tunnel->hlen + sizeof(struct iphdr); dev->netdev_ops = &ipip6_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->needs_free_netdev = true; dev->priv_destructor = ipip6_dev_free; dev->type = ARPHRD_SIT; dev->mtu = ETH_DATA_LEN - t_hlen; dev->min_mtu = IPV6_MIN_MTU; dev->max_mtu = IP6_MAX_MTU - t_hlen; dev->flags = IFF_NOARP; netif_keep_dst(dev); dev->addr_len = 4; dev->features |= NETIF_F_LLTX; dev->features |= SIT_FEATURES; dev->hw_features |= SIT_FEATURES; } static int ipip6_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); int err; tunnel->dev = dev; tunnel->net = dev_net(dev); strcpy(tunnel->parms.name, dev->name); ipip6_tunnel_bind_dev(dev); dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); if (err) { free_percpu(dev->tstats); dev->tstats = NULL; return err; } netdev_hold(dev, &tunnel->dev_tracker, GFP_KERNEL); return 0; } static void __net_init ipip6_fb_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; struct net *net = dev_net(dev); struct sit_net *sitn = net_generic(net, sit_net_id); iph->version = 4; iph->protocol = IPPROTO_IPV6; iph->ihl = 5; iph->ttl = 64; rcu_assign_pointer(sitn->tunnels_wc[0], tunnel); } static int ipip6_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { u8 proto; if (!data || !data[IFLA_IPTUN_PROTO]) return 0; proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); if (!ipip6_valid_ip_proto(proto)) return -EINVAL; return 0; } static void ipip6_netlink_parms(struct nlattr *data[], struct ip_tunnel_parm *parms, __u32 *fwmark) { memset(parms, 0, sizeof(*parms)); parms->iph.version = 4; parms->iph.protocol = IPPROTO_IPV6; parms->iph.ihl = 5; parms->iph.ttl = 64; if (!data) return; ip_tunnel_netlink_parms(data, parms); if (data[IFLA_IPTUN_FWMARK]) *fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); } #ifdef CONFIG_IPV6_SIT_6RD /* This function returns true when 6RD attributes are present in the nl msg */ static bool ipip6_netlink_6rd_parms(struct nlattr *data[], struct ip_tunnel_6rd *ip6rd) { bool ret = false; memset(ip6rd, 0, sizeof(*ip6rd)); if (!data) return ret; if (data[IFLA_IPTUN_6RD_PREFIX]) { ret = true; ip6rd->prefix = nla_get_in6_addr(data[IFLA_IPTUN_6RD_PREFIX]); } if (data[IFLA_IPTUN_6RD_RELAY_PREFIX]) { ret = true; ip6rd->relay_prefix = nla_get_be32(data[IFLA_IPTUN_6RD_RELAY_PREFIX]); } if (data[IFLA_IPTUN_6RD_PREFIXLEN]) { ret = true; ip6rd->prefixlen = nla_get_u16(data[IFLA_IPTUN_6RD_PREFIXLEN]); } if (data[IFLA_IPTUN_6RD_RELAY_PREFIXLEN]) { ret = true; ip6rd->relay_prefixlen = nla_get_u16(data[IFLA_IPTUN_6RD_RELAY_PREFIXLEN]); } return ret; } #endif static int ipip6_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct net *net = dev_net(dev); struct ip_tunnel *nt; struct ip_tunnel_encap ipencap; #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd ip6rd; #endif int err; nt = netdev_priv(dev); if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { err = ip_tunnel_encap_setup(nt, &ipencap); if (err < 0) return err; } ipip6_netlink_parms(data, &nt->parms, &nt->fwmark); if (ipip6_tunnel_locate(net, &nt->parms, 0)) return -EEXIST; err = ipip6_tunnel_create(dev); if (err < 0) return err; if (tb[IFLA_MTU]) { u32 mtu = nla_get_u32(tb[IFLA_MTU]); if (mtu >= IPV6_MIN_MTU && mtu <= IP6_MAX_MTU - dev->hard_header_len) dev->mtu = mtu; } #ifdef CONFIG_IPV6_SIT_6RD if (ipip6_netlink_6rd_parms(data, &ip6rd)) { err = ipip6_tunnel_update_6rd(nt, &ip6rd); if (err < 0) unregister_netdevice_queue(dev, NULL); } #endif return err; } static int ipip6_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; struct net *net = t->net; struct sit_net *sitn = net_generic(net, sit_net_id); #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd ip6rd; #endif __u32 fwmark = t->fwmark; int err; if (dev == sitn->fb_tunnel_dev) return -EINVAL; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; } ipip6_netlink_parms(data, &p, &fwmark); if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) || (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr)) return -EINVAL; t = ipip6_tunnel_locate(net, &p, 0); if (t) { if (t->dev != dev) return -EEXIST; } else t = netdev_priv(dev); ipip6_tunnel_update(t, &p, fwmark); #ifdef CONFIG_IPV6_SIT_6RD if (ipip6_netlink_6rd_parms(data, &ip6rd)) return ipip6_tunnel_update_6rd(t, &ip6rd); #endif return 0; } static size_t ipip6_get_size(const struct net_device *dev) { return /* IFLA_IPTUN_LINK */ nla_total_size(4) + /* IFLA_IPTUN_LOCAL */ nla_total_size(4) + /* IFLA_IPTUN_REMOTE */ nla_total_size(4) + /* IFLA_IPTUN_TTL */ nla_total_size(1) + /* IFLA_IPTUN_TOS */ nla_total_size(1) + /* IFLA_IPTUN_PMTUDISC */ nla_total_size(1) + /* IFLA_IPTUN_FLAGS */ nla_total_size(2) + /* IFLA_IPTUN_PROTO */ nla_total_size(1) + #ifdef CONFIG_IPV6_SIT_6RD /* IFLA_IPTUN_6RD_PREFIX */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_IPTUN_6RD_RELAY_PREFIX */ nla_total_size(4) + /* IFLA_IPTUN_6RD_PREFIXLEN */ nla_total_size(2) + /* IFLA_IPTUN_6RD_RELAY_PREFIXLEN */ nla_total_size(2) + #endif /* IFLA_IPTUN_ENCAP_TYPE */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_FLAGS */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_SPORT */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_DPORT */ nla_total_size(2) + /* IFLA_IPTUN_FWMARK */ nla_total_size(4) + 0; } static int ipip6_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct ip_tunnel_parm *parm = &tunnel->parms; if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || nla_put_in_addr(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) || nla_put_in_addr(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) || nla_put_u8(skb, IFLA_IPTUN_TTL, parm->iph.ttl) || nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) || nla_put_u8(skb, IFLA_IPTUN_PMTUDISC, !!(parm->iph.frag_off & htons(IP_DF))) || nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->iph.protocol) || nla_put_be16(skb, IFLA_IPTUN_FLAGS, parm->i_flags) || nla_put_u32(skb, IFLA_IPTUN_FWMARK, tunnel->fwmark)) goto nla_put_failure; #ifdef CONFIG_IPV6_SIT_6RD if (nla_put_in6_addr(skb, IFLA_IPTUN_6RD_PREFIX, &tunnel->ip6rd.prefix) || nla_put_in_addr(skb, IFLA_IPTUN_6RD_RELAY_PREFIX, tunnel->ip6rd.relay_prefix) || nla_put_u16(skb, IFLA_IPTUN_6RD_PREFIXLEN, tunnel->ip6rd.prefixlen) || nla_put_u16(skb, IFLA_IPTUN_6RD_RELAY_PREFIXLEN, tunnel->ip6rd.relay_prefixlen)) goto nla_put_failure; #endif if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, tunnel->encap.type) || nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, tunnel->encap.sport) || nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) || nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, tunnel->encap.flags)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static const struct nla_policy ipip6_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_LINK] = { .type = NLA_U32 }, [IFLA_IPTUN_LOCAL] = { .type = NLA_U32 }, [IFLA_IPTUN_REMOTE] = { .type = NLA_U32 }, [IFLA_IPTUN_TTL] = { .type = NLA_U8 }, [IFLA_IPTUN_TOS] = { .type = NLA_U8 }, [IFLA_IPTUN_PMTUDISC] = { .type = NLA_U8 }, [IFLA_IPTUN_FLAGS] = { .type = NLA_U16 }, [IFLA_IPTUN_PROTO] = { .type = NLA_U8 }, #ifdef CONFIG_IPV6_SIT_6RD [IFLA_IPTUN_6RD_PREFIX] = { .len = sizeof(struct in6_addr) }, [IFLA_IPTUN_6RD_RELAY_PREFIX] = { .type = NLA_U32 }, [IFLA_IPTUN_6RD_PREFIXLEN] = { .type = NLA_U16 }, [IFLA_IPTUN_6RD_RELAY_PREFIXLEN] = { .type = NLA_U16 }, #endif [IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_FWMARK] = { .type = NLA_U32 }, }; static void ipip6_dellink(struct net_device *dev, struct list_head *head) { struct net *net = dev_net(dev); struct sit_net *sitn = net_generic(net, sit_net_id); if (dev != sitn->fb_tunnel_dev) unregister_netdevice_queue(dev, head); } static struct rtnl_link_ops sit_link_ops __read_mostly = { .kind = "sit", .maxtype = IFLA_IPTUN_MAX, .policy = ipip6_policy, .priv_size = sizeof(struct ip_tunnel), .setup = ipip6_tunnel_setup, .validate = ipip6_validate, .newlink = ipip6_newlink, .changelink = ipip6_changelink, .get_size = ipip6_get_size, .fill_info = ipip6_fill_info, .dellink = ipip6_dellink, .get_link_net = ip_tunnel_get_link_net, }; static struct xfrm_tunnel sit_handler __read_mostly = { .handler = ipip6_rcv, .err_handler = ipip6_err, .priority = 1, }; static struct xfrm_tunnel ipip_handler __read_mostly = { .handler = ipip_rcv, .err_handler = ipip6_err, .priority = 2, }; #if IS_ENABLED(CONFIG_MPLS) static struct xfrm_tunnel mplsip_handler __read_mostly = { .handler = mplsip_rcv, .err_handler = ipip6_err, .priority = 2, }; #endif static void __net_exit sit_destroy_tunnels(struct net *net, struct list_head *head) { struct sit_net *sitn = net_generic(net, sit_net_id); struct net_device *dev, *aux; int prio; for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == &sit_link_ops) unregister_netdevice_queue(dev, head); for (prio = 0; prio < 4; prio++) { int h; for (h = 0; h < (prio ? IP6_SIT_HASH_SIZE : 1); h++) { struct ip_tunnel *t; t = rtnl_dereference(sitn->tunnels[prio][h]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, head); t = rtnl_dereference(t->next); } } } } static int __net_init sit_init_net(struct net *net) { struct sit_net *sitn = net_generic(net, sit_net_id); struct ip_tunnel *t; int err; sitn->tunnels[0] = sitn->tunnels_wc; sitn->tunnels[1] = sitn->tunnels_l; sitn->tunnels[2] = sitn->tunnels_r; sitn->tunnels[3] = sitn->tunnels_r_l; if (!net_has_fallback_tunnels(net)) return 0; sitn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "sit0", NET_NAME_UNKNOWN, ipip6_tunnel_setup); if (!sitn->fb_tunnel_dev) { err = -ENOMEM; goto err_alloc_dev; } dev_net_set(sitn->fb_tunnel_dev, net); sitn->fb_tunnel_dev->rtnl_link_ops = &sit_link_ops; /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. */ sitn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; err = register_netdev(sitn->fb_tunnel_dev); if (err) goto err_reg_dev; ipip6_tunnel_clone_6rd(sitn->fb_tunnel_dev, sitn); ipip6_fb_tunnel_init(sitn->fb_tunnel_dev); t = netdev_priv(sitn->fb_tunnel_dev); strcpy(t->parms.name, sitn->fb_tunnel_dev->name); return 0; err_reg_dev: free_netdev(sitn->fb_tunnel_dev); err_alloc_dev: return err; } static void __net_exit sit_exit_batch_net(struct list_head *net_list) { LIST_HEAD(list); struct net *net; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) sit_destroy_tunnels(net, &list); unregister_netdevice_many(&list); rtnl_unlock(); } static struct pernet_operations sit_net_ops = { .init = sit_init_net, .exit_batch = sit_exit_batch_net, .id = &sit_net_id, .size = sizeof(struct sit_net), }; static void __exit sit_cleanup(void) { rtnl_link_unregister(&sit_link_ops); xfrm4_tunnel_deregister(&sit_handler, AF_INET6); xfrm4_tunnel_deregister(&ipip_handler, AF_INET); #if IS_ENABLED(CONFIG_MPLS) xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS); #endif unregister_pernet_device(&sit_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ } static int __init sit_init(void) { int err; pr_info("IPv6, IPv4 and MPLS over IPv4 tunneling driver\n"); err = register_pernet_device(&sit_net_ops); if (err < 0) return err; err = xfrm4_tunnel_register(&sit_handler, AF_INET6); if (err < 0) { pr_info("%s: can't register ip6ip4\n", __func__); goto xfrm_tunnel_failed; } err = xfrm4_tunnel_register(&ipip_handler, AF_INET); if (err < 0) { pr_info("%s: can't register ip4ip4\n", __func__); goto xfrm_tunnel4_failed; } #if IS_ENABLED(CONFIG_MPLS) err = xfrm4_tunnel_register(&mplsip_handler, AF_MPLS); if (err < 0) { pr_info("%s: can't register mplsip\n", __func__); goto xfrm_tunnel_mpls_failed; } #endif err = rtnl_link_register(&sit_link_ops); if (err < 0) goto rtnl_link_failed; out: return err; rtnl_link_failed: #if IS_ENABLED(CONFIG_MPLS) xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS); xfrm_tunnel_mpls_failed: #endif xfrm4_tunnel_deregister(&ipip_handler, AF_INET); xfrm_tunnel4_failed: xfrm4_tunnel_deregister(&sit_handler, AF_INET6); xfrm_tunnel_failed: unregister_pernet_device(&sit_net_ops); goto out; } module_init(sit_init); module_exit(sit_cleanup); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("sit"); MODULE_ALIAS_NETDEV("sit0");
224 6464 1184 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM maple_tree #if !defined(_TRACE_MM_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_MM_H #include <linux/tracepoint.h> struct ma_state; TRACE_EVENT(ma_op, TP_PROTO(const char *fn, struct ma_state *mas), TP_ARGS(fn, mas), TP_STRUCT__entry( __field(const char *, fn) __field(unsigned long, min) __field(unsigned long, max) __field(unsigned long, index) __field(unsigned long, last) __field(void *, node) ), TP_fast_assign( __entry->fn = fn; __entry->min = mas->min; __entry->max = mas->max; __entry->index = mas->index; __entry->last = mas->last; __entry->node = mas->node; ), TP_printk("%s\tNode: %p (%lu %lu) range: %lu-%lu", __entry->fn, (void *) __entry->node, (unsigned long) __entry->min, (unsigned long) __entry->max, (unsigned long) __entry->index, (unsigned long) __entry->last ) ) TRACE_EVENT(ma_read, TP_PROTO(const char *fn, struct ma_state *mas), TP_ARGS(fn, mas), TP_STRUCT__entry( __field(const char *, fn) __field(unsigned long, min) __field(unsigned long, max) __field(unsigned long, index) __field(unsigned long, last) __field(void *, node) ), TP_fast_assign( __entry->fn = fn; __entry->min = mas->min; __entry->max = mas->max; __entry->index = mas->index; __entry->last = mas->last; __entry->node = mas->node; ), TP_printk("%s\tNode: %p (%lu %lu) range: %lu-%lu", __entry->fn, (void *) __entry->node, (unsigned long) __entry->min, (unsigned long) __entry->max, (unsigned long) __entry->index, (unsigned long) __entry->last ) ) TRACE_EVENT(ma_write, TP_PROTO(const char *fn, struct ma_state *mas, unsigned long piv, void *val), TP_ARGS(fn, mas, piv, val), TP_STRUCT__entry( __field(const char *, fn) __field(unsigned long, min) __field(unsigned long, max) __field(unsigned long, index) __field(unsigned long, last) __field(unsigned long, piv) __field(void *, val) __field(void *, node) ), TP_fast_assign( __entry->fn = fn; __entry->min = mas->min; __entry->max = mas->max; __entry->index = mas->index; __entry->last = mas->last; __entry->piv = piv; __entry->val = val; __entry->node = mas->node; ), TP_printk("%s\tNode %p (%lu %lu) range:%lu-%lu piv (%lu) val %p", __entry->fn, (void *) __entry->node, (unsigned long) __entry->min, (unsigned long) __entry->max, (unsigned long) __entry->index, (unsigned long) __entry->last, (unsigned long) __entry->piv, (void *) __entry->val ) ) #endif /* _TRACE_MM_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
10 10 10 7 3 10 10 6 4 4 16 2 1 1 7 7 7 7 7 6 1 11 2 2 1 606 168 20 360 356 358 3 602 356 2 2 17 16 1 601 2 10 1 8 7 8 66 66 26 26 53 268 188 188 2 65 1 64 358 357 578 17 17 35 1099 1100 1037 515 670 200 4 606 568 1053 742 742 741 1099 97 1045 39 10 10 10 1 9 10 10 20 889 20 720 742 2072 2072 2073 858 858 858 20 20 20 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 // SPDX-License-Identifier: GPL-2.0 /* * NETLINK Netlink attributes * * Authors: Thomas Graf <tgraf@suug.ch> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> */ #include <linux/export.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/jiffies.h> #include <linux/nospec.h> #include <linux/skbuff.h> #include <linux/string.h> #include <linux/types.h> #include <net/netlink.h> /* For these data types, attribute length should be exactly the given * size. However, to maintain compatibility with broken commands, if the * attribute length does not match the expected size a warning is emitted * to the user that the command is sending invalid data and needs to be fixed. */ static const u8 nla_attr_len[NLA_TYPE_MAX+1] = { [NLA_U8] = sizeof(u8), [NLA_U16] = sizeof(u16), [NLA_U32] = sizeof(u32), [NLA_U64] = sizeof(u64), [NLA_S8] = sizeof(s8), [NLA_S16] = sizeof(s16), [NLA_S32] = sizeof(s32), [NLA_S64] = sizeof(s64), }; static const u8 nla_attr_minlen[NLA_TYPE_MAX+1] = { [NLA_U8] = sizeof(u8), [NLA_U16] = sizeof(u16), [NLA_U32] = sizeof(u32), [NLA_U64] = sizeof(u64), [NLA_MSECS] = sizeof(u64), [NLA_NESTED] = NLA_HDRLEN, [NLA_S8] = sizeof(s8), [NLA_S16] = sizeof(s16), [NLA_S32] = sizeof(s32), [NLA_S64] = sizeof(s64), }; /* * Nested policies might refer back to the original * policy in some cases, and userspace could try to * abuse that and recurse by nesting in the right * ways. Limit recursion to avoid this problem. */ #define MAX_POLICY_RECURSION_DEPTH 10 static int __nla_validate_parse(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack, struct nlattr **tb, unsigned int depth); static int validate_nla_bitfield32(const struct nlattr *nla, const u32 valid_flags_mask) { const struct nla_bitfield32 *bf = nla_data(nla); if (!valid_flags_mask) return -EINVAL; /*disallow invalid bit selector */ if (bf->selector & ~valid_flags_mask) return -EINVAL; /*disallow invalid bit values */ if (bf->value & ~valid_flags_mask) return -EINVAL; /*disallow valid bit values that are not selected*/ if (bf->value & ~bf->selector) return -EINVAL; return 0; } static int nla_validate_array(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack, unsigned int validate, unsigned int depth) { const struct nlattr *entry; int rem; nla_for_each_attr(entry, head, len, rem) { int ret; if (nla_len(entry) == 0) continue; if (nla_len(entry) < NLA_HDRLEN) { NL_SET_ERR_MSG_ATTR_POL(extack, entry, policy, "Array element too short"); return -ERANGE; } ret = __nla_validate_parse(nla_data(entry), nla_len(entry), maxtype, policy, validate, extack, NULL, depth + 1); if (ret < 0) return ret; } return 0; } void nla_get_range_unsigned(const struct nla_policy *pt, struct netlink_range_validation *range) { WARN_ON_ONCE(pt->validation_type != NLA_VALIDATE_RANGE_PTR && (pt->min < 0 || pt->max < 0)); range->min = 0; switch (pt->type) { case NLA_U8: range->max = U8_MAX; break; case NLA_U16: case NLA_BE16: case NLA_BINARY: range->max = U16_MAX; break; case NLA_U32: case NLA_BE32: range->max = U32_MAX; break; case NLA_U64: case NLA_MSECS: range->max = U64_MAX; break; default: WARN_ON_ONCE(1); return; } switch (pt->validation_type) { case NLA_VALIDATE_RANGE: case NLA_VALIDATE_RANGE_WARN_TOO_LONG: range->min = pt->min; range->max = pt->max; break; case NLA_VALIDATE_RANGE_PTR: *range = *pt->range; break; case NLA_VALIDATE_MIN: range->min = pt->min; break; case NLA_VALIDATE_MAX: range->max = pt->max; break; default: break; } } static int nla_validate_range_unsigned(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack, unsigned int validate) { struct netlink_range_validation range; u64 value; switch (pt->type) { case NLA_U8: value = nla_get_u8(nla); break; case NLA_U16: value = nla_get_u16(nla); break; case NLA_U32: value = nla_get_u32(nla); break; case NLA_U64: value = nla_get_u64(nla); break; case NLA_MSECS: value = nla_get_u64(nla); break; case NLA_BINARY: value = nla_len(nla); break; case NLA_BE16: value = ntohs(nla_get_be16(nla)); break; case NLA_BE32: value = ntohl(nla_get_be32(nla)); break; default: return -EINVAL; } nla_get_range_unsigned(pt, &range); if (pt->validation_type == NLA_VALIDATE_RANGE_WARN_TOO_LONG && pt->type == NLA_BINARY && value > range.max) { pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n", current->comm, pt->type); if (validate & NL_VALIDATE_STRICT_ATTRS) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "invalid attribute length"); return -EINVAL; } /* this assumes min <= max (don't validate against min) */ return 0; } if (value < range.min || value > range.max) { bool binary = pt->type == NLA_BINARY; if (binary) NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "binary attribute size out of range"); else NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "integer out of range"); return -ERANGE; } return 0; } void nla_get_range_signed(const struct nla_policy *pt, struct netlink_range_validation_signed *range) { switch (pt->type) { case NLA_S8: range->min = S8_MIN; range->max = S8_MAX; break; case NLA_S16: range->min = S16_MIN; range->max = S16_MAX; break; case NLA_S32: range->min = S32_MIN; range->max = S32_MAX; break; case NLA_S64: range->min = S64_MIN; range->max = S64_MAX; break; default: WARN_ON_ONCE(1); return; } switch (pt->validation_type) { case NLA_VALIDATE_RANGE: range->min = pt->min; range->max = pt->max; break; case NLA_VALIDATE_RANGE_PTR: *range = *pt->range_signed; break; case NLA_VALIDATE_MIN: range->min = pt->min; break; case NLA_VALIDATE_MAX: range->max = pt->max; break; default: break; } } static int nla_validate_int_range_signed(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack) { struct netlink_range_validation_signed range; s64 value; switch (pt->type) { case NLA_S8: value = nla_get_s8(nla); break; case NLA_S16: value = nla_get_s16(nla); break; case NLA_S32: value = nla_get_s32(nla); break; case NLA_S64: value = nla_get_s64(nla); break; default: return -EINVAL; } nla_get_range_signed(pt, &range); if (value < range.min || value > range.max) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "integer out of range"); return -ERANGE; } return 0; } static int nla_validate_int_range(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack, unsigned int validate) { switch (pt->type) { case NLA_U8: case NLA_U16: case NLA_U32: case NLA_U64: case NLA_MSECS: case NLA_BINARY: case NLA_BE16: case NLA_BE32: return nla_validate_range_unsigned(pt, nla, extack, validate); case NLA_S8: case NLA_S16: case NLA_S32: case NLA_S64: return nla_validate_int_range_signed(pt, nla, extack); default: WARN_ON(1); return -EINVAL; } } static int nla_validate_mask(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack) { u64 value; switch (pt->type) { case NLA_U8: value = nla_get_u8(nla); break; case NLA_U16: value = nla_get_u16(nla); break; case NLA_U32: value = nla_get_u32(nla); break; case NLA_U64: value = nla_get_u64(nla); break; case NLA_BE16: value = ntohs(nla_get_be16(nla)); break; case NLA_BE32: value = ntohl(nla_get_be32(nla)); break; default: return -EINVAL; } if (value & ~(u64)pt->mask) { NL_SET_ERR_MSG_ATTR(extack, nla, "reserved bit set"); return -EINVAL; } return 0; } static int validate_nla(const struct nlattr *nla, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack, unsigned int depth) { u16 strict_start_type = policy[0].strict_start_type; const struct nla_policy *pt; int minlen = 0, attrlen = nla_len(nla), type = nla_type(nla); int err = -ERANGE; if (strict_start_type && type >= strict_start_type) validate |= NL_VALIDATE_STRICT; if (type <= 0 || type > maxtype) return 0; type = array_index_nospec(type, maxtype + 1); pt = &policy[type]; BUG_ON(pt->type > NLA_TYPE_MAX); if (nla_attr_len[pt->type] && attrlen != nla_attr_len[pt->type]) { pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n", current->comm, type); if (validate & NL_VALIDATE_STRICT_ATTRS) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "invalid attribute length"); return -EINVAL; } } if (validate & NL_VALIDATE_NESTED) { if ((pt->type == NLA_NESTED || pt->type == NLA_NESTED_ARRAY) && !(nla->nla_type & NLA_F_NESTED)) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "NLA_F_NESTED is missing"); return -EINVAL; } if (pt->type != NLA_NESTED && pt->type != NLA_NESTED_ARRAY && pt->type != NLA_UNSPEC && (nla->nla_type & NLA_F_NESTED)) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "NLA_F_NESTED not expected"); return -EINVAL; } } switch (pt->type) { case NLA_REJECT: if (extack && pt->reject_message) { NL_SET_BAD_ATTR(extack, nla); extack->_msg = pt->reject_message; return -EINVAL; } err = -EINVAL; goto out_err; case NLA_FLAG: if (attrlen > 0) goto out_err; break; case NLA_BITFIELD32: if (attrlen != sizeof(struct nla_bitfield32)) goto out_err; err = validate_nla_bitfield32(nla, pt->bitfield32_valid); if (err) goto out_err; break; case NLA_NUL_STRING: if (pt->len) minlen = min_t(int, attrlen, pt->len + 1); else minlen = attrlen; if (!minlen || memchr(nla_data(nla), '\0', minlen) == NULL) { err = -EINVAL; goto out_err; } fallthrough; case NLA_STRING: if (attrlen < 1) goto out_err; if (pt->len) { char *buf = nla_data(nla); if (buf[attrlen - 1] == '\0') attrlen--; if (attrlen > pt->len) goto out_err; } break; case NLA_BINARY: if (pt->len && attrlen > pt->len) goto out_err; break; case NLA_NESTED: /* a nested attributes is allowed to be empty; if its not, * it must have a size of at least NLA_HDRLEN. */ if (attrlen == 0) break; if (attrlen < NLA_HDRLEN) goto out_err; if (pt->nested_policy) { err = __nla_validate_parse(nla_data(nla), nla_len(nla), pt->len, pt->nested_policy, validate, extack, NULL, depth + 1); if (err < 0) { /* * return directly to preserve the inner * error message/attribute pointer */ return err; } } break; case NLA_NESTED_ARRAY: /* a nested array attribute is allowed to be empty; if its not, * it must have a size of at least NLA_HDRLEN. */ if (attrlen == 0) break; if (attrlen < NLA_HDRLEN) goto out_err; if (pt->nested_policy) { int err; err = nla_validate_array(nla_data(nla), nla_len(nla), pt->len, pt->nested_policy, extack, validate, depth); if (err < 0) { /* * return directly to preserve the inner * error message/attribute pointer */ return err; } } break; case NLA_UNSPEC: if (validate & NL_VALIDATE_UNSPEC) { NL_SET_ERR_MSG_ATTR(extack, nla, "Unsupported attribute"); return -EINVAL; } if (attrlen < pt->len) goto out_err; break; default: if (pt->len) minlen = pt->len; else minlen = nla_attr_minlen[pt->type]; if (attrlen < minlen) goto out_err; } /* further validation */ switch (pt->validation_type) { case NLA_VALIDATE_NONE: /* nothing to do */ break; case NLA_VALIDATE_RANGE_PTR: case NLA_VALIDATE_RANGE: case NLA_VALIDATE_RANGE_WARN_TOO_LONG: case NLA_VALIDATE_MIN: case NLA_VALIDATE_MAX: err = nla_validate_int_range(pt, nla, extack, validate); if (err) return err; break; case NLA_VALIDATE_MASK: err = nla_validate_mask(pt, nla, extack); if (err) return err; break; case NLA_VALIDATE_FUNCTION: if (pt->validate) { err = pt->validate(nla, extack); if (err) return err; } break; } return 0; out_err: NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "Attribute failed policy validation"); return err; } static int __nla_validate_parse(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack, struct nlattr **tb, unsigned int depth) { const struct nlattr *nla; int rem; if (depth >= MAX_POLICY_RECURSION_DEPTH) { NL_SET_ERR_MSG(extack, "allowed policy recursion depth exceeded"); return -EINVAL; } if (tb) memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); nla_for_each_attr(nla, head, len, rem) { u16 type = nla_type(nla); if (type == 0 || type > maxtype) { if (validate & NL_VALIDATE_MAXTYPE) { NL_SET_ERR_MSG_ATTR(extack, nla, "Unknown attribute type"); return -EINVAL; } continue; } type = array_index_nospec(type, maxtype + 1); if (policy) { int err = validate_nla(nla, maxtype, policy, validate, extack, depth); if (err < 0) return err; } if (tb) tb[type] = (struct nlattr *)nla; } if (unlikely(rem > 0)) { pr_warn_ratelimited("netlink: %d bytes leftover after parsing attributes in process `%s'.\n", rem, current->comm); NL_SET_ERR_MSG(extack, "bytes leftover after parsing attributes"); if (validate & NL_VALIDATE_TRAILING) return -EINVAL; } return 0; } /** * __nla_validate - Validate a stream of attributes * @head: head of attribute stream * @len: length of attribute stream * @maxtype: maximum attribute type to be expected * @policy: validation policy * @validate: validation strictness * @extack: extended ACK report struct * * Validates all attributes in the specified attribute stream against the * specified policy. Validation depends on the validate flags passed, see * &enum netlink_validation for more details on that. * See documentation of struct nla_policy for more details. * * Returns 0 on success or a negative error code. */ int __nla_validate(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack) { return __nla_validate_parse(head, len, maxtype, policy, validate, extack, NULL, 0); } EXPORT_SYMBOL(__nla_validate); /** * nla_policy_len - Determine the max. length of a policy * @p: policy to use * @n: number of policies * * Determines the max. length of the policy. It is currently used * to allocated Netlink buffers roughly the size of the actual * message. * * Returns 0 on success or a negative error code. */ int nla_policy_len(const struct nla_policy *p, int n) { int i, len = 0; for (i = 0; i < n; i++, p++) { if (p->len) len += nla_total_size(p->len); else if (nla_attr_len[p->type]) len += nla_total_size(nla_attr_len[p->type]); else if (nla_attr_minlen[p->type]) len += nla_total_size(nla_attr_minlen[p->type]); } return len; } EXPORT_SYMBOL(nla_policy_len); /** * __nla_parse - Parse a stream of attributes into a tb buffer * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @head: head of attribute stream * @len: length of attribute stream * @policy: validation policy * @validate: validation strictness * @extack: extended ACK pointer * * Parses a stream of attributes and stores a pointer to each attribute in * the tb array accessible via the attribute type. * Validation is controlled by the @validate parameter. * * Returns 0 on success or a negative error code. */ int __nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack) { return __nla_validate_parse(head, len, maxtype, policy, validate, extack, tb, 0); } EXPORT_SYMBOL(__nla_parse); /** * nla_find - Find a specific attribute in a stream of attributes * @head: head of attribute stream * @len: length of attribute stream * @attrtype: type of attribute to look for * * Returns the first attribute in the stream matching the specified type. */ struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype) { const struct nlattr *nla; int rem; nla_for_each_attr(nla, head, len, rem) if (nla_type(nla) == attrtype) return (struct nlattr *)nla; return NULL; } EXPORT_SYMBOL(nla_find); /** * nla_strscpy - Copy string attribute payload into a sized buffer * @dst: Where to copy the string to. * @nla: Attribute to copy the string from. * @dstsize: Size of destination buffer. * * Copies at most dstsize - 1 bytes into the destination buffer. * Unlike strlcpy the destination buffer is always padded out. * * Return: * * srclen - Returns @nla length (not including the trailing %NUL). * * -E2BIG - If @dstsize is 0 or greater than U16_MAX or @nla length greater * than @dstsize. */ ssize_t nla_strscpy(char *dst, const struct nlattr *nla, size_t dstsize) { size_t srclen = nla_len(nla); char *src = nla_data(nla); ssize_t ret; size_t len; if (dstsize == 0 || WARN_ON_ONCE(dstsize > U16_MAX)) return -E2BIG; if (srclen > 0 && src[srclen - 1] == '\0') srclen--; if (srclen >= dstsize) { len = dstsize - 1; ret = -E2BIG; } else { len = srclen; ret = len; } memcpy(dst, src, len); /* Zero pad end of dst. */ memset(dst + len, 0, dstsize - len); return ret; } EXPORT_SYMBOL(nla_strscpy); /** * nla_strdup - Copy string attribute payload into a newly allocated buffer * @nla: attribute to copy the string from * @flags: the type of memory to allocate (see kmalloc). * * Returns a pointer to the allocated buffer or NULL on error. */ char *nla_strdup(const struct nlattr *nla, gfp_t flags) { size_t srclen = nla_len(nla); char *src = nla_data(nla), *dst; if (srclen > 0 && src[srclen - 1] == '\0') srclen--; dst = kmalloc(srclen + 1, flags); if (dst != NULL) { memcpy(dst, src, srclen); dst[srclen] = '\0'; } return dst; } EXPORT_SYMBOL(nla_strdup); /** * nla_memcpy - Copy a netlink attribute into another memory area * @dest: where to copy to memcpy * @src: netlink attribute to copy from * @count: size of the destination area * * Note: The number of bytes copied is limited by the length of * attribute's payload. memcpy * * Returns the number of bytes copied. */ int nla_memcpy(void *dest, const struct nlattr *src, int count) { int minlen = min_t(int, count, nla_len(src)); memcpy(dest, nla_data(src), minlen); if (count > minlen) memset(dest + minlen, 0, count - minlen); return minlen; } EXPORT_SYMBOL(nla_memcpy); /** * nla_memcmp - Compare an attribute with sized memory area * @nla: netlink attribute * @data: memory area * @size: size of memory area */ int nla_memcmp(const struct nlattr *nla, const void *data, size_t size) { int d = nla_len(nla) - size; if (d == 0) d = memcmp(nla_data(nla), data, size); return d; } EXPORT_SYMBOL(nla_memcmp); /** * nla_strcmp - Compare a string attribute against a string * @nla: netlink string attribute * @str: another string */ int nla_strcmp(const struct nlattr *nla, const char *str) { int len = strlen(str); char *buf = nla_data(nla); int attrlen = nla_len(nla); int d; while (attrlen > 0 && buf[attrlen - 1] == '\0') attrlen--; d = attrlen - len; if (d == 0) d = memcmp(nla_data(nla), str, len); return d; } EXPORT_SYMBOL(nla_strcmp); #ifdef CONFIG_NET /** * __nla_reserve - reserve room for attribute on the skb * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen) { struct nlattr *nla; nla = skb_put(skb, nla_total_size(attrlen)); nla->nla_type = attrtype; nla->nla_len = nla_attr_size(attrlen); memset((unsigned char *) nla + nla->nla_len, 0, nla_padlen(attrlen)); return nla; } EXPORT_SYMBOL(__nla_reserve); /** * __nla_reserve_64bit - reserve room for attribute on the skb and align it * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * @padattr: attribute type for the padding * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. It also ensure that this * attribute will have a 64-bit aligned nla_data() area. * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ struct nlattr *__nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen, int padattr) { nla_align_64bit(skb, padattr); return __nla_reserve(skb, attrtype, attrlen); } EXPORT_SYMBOL(__nla_reserve_64bit); /** * __nla_reserve_nohdr - reserve room for attribute without header * @skb: socket buffer to reserve room on * @attrlen: length of attribute payload * * Reserves room for attribute payload without a header. * * The caller is responsible to ensure that the skb provides enough * tailroom for the payload. */ void *__nla_reserve_nohdr(struct sk_buff *skb, int attrlen) { return skb_put_zero(skb, NLA_ALIGN(attrlen)); } EXPORT_SYMBOL(__nla_reserve_nohdr); /** * nla_reserve - reserve room for attribute on the skb * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. * * Returns NULL if the tailroom of the skb is insufficient to store * the attribute header and payload. */ struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen) { if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen))) return NULL; return __nla_reserve(skb, attrtype, attrlen); } EXPORT_SYMBOL(nla_reserve); /** * nla_reserve_64bit - reserve room for attribute on the skb and align it * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * @padattr: attribute type for the padding * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. It also ensure that this * attribute will have a 64-bit aligned nla_data() area. * * Returns NULL if the tailroom of the skb is insufficient to store * the attribute header and payload. */ struct nlattr *nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen, int padattr) { size_t len; if (nla_need_padding_for_64bit(skb)) len = nla_total_size_64bit(attrlen); else len = nla_total_size(attrlen); if (unlikely(skb_tailroom(skb) < len)) return NULL; return __nla_reserve_64bit(skb, attrtype, attrlen, padattr); } EXPORT_SYMBOL(nla_reserve_64bit); /** * nla_reserve_nohdr - reserve room for attribute without header * @skb: socket buffer to reserve room on * @attrlen: length of attribute payload * * Reserves room for attribute payload without a header. * * Returns NULL if the tailroom of the skb is insufficient to store * the attribute payload. */ void *nla_reserve_nohdr(struct sk_buff *skb, int attrlen) { if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) return NULL; return __nla_reserve_nohdr(skb, attrlen); } EXPORT_SYMBOL(nla_reserve_nohdr); /** * __nla_put - Add a netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ void __nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data) { struct nlattr *nla; nla = __nla_reserve(skb, attrtype, attrlen); memcpy(nla_data(nla), data, attrlen); } EXPORT_SYMBOL(__nla_put); /** * __nla_put_64bit - Add a netlink attribute to a socket buffer and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * @padattr: attribute type for the padding * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ void __nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, const void *data, int padattr) { struct nlattr *nla; nla = __nla_reserve_64bit(skb, attrtype, attrlen, padattr); memcpy(nla_data(nla), data, attrlen); } EXPORT_SYMBOL(__nla_put_64bit); /** * __nla_put_nohdr - Add a netlink attribute without header * @skb: socket buffer to add attribute to * @attrlen: length of attribute payload * @data: head of attribute payload * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute payload. */ void __nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data) { void *start; start = __nla_reserve_nohdr(skb, attrlen); memcpy(start, data, attrlen); } EXPORT_SYMBOL(__nla_put_nohdr); /** * nla_put - Add a netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute header and payload. */ int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data) { if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen))) return -EMSGSIZE; __nla_put(skb, attrtype, attrlen, data); return 0; } EXPORT_SYMBOL(nla_put); /** * nla_put_64bit - Add a netlink attribute to a socket buffer and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * @padattr: attribute type for the padding * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute header and payload. */ int nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, const void *data, int padattr) { size_t len; if (nla_need_padding_for_64bit(skb)) len = nla_total_size_64bit(attrlen); else len = nla_total_size(attrlen); if (unlikely(skb_tailroom(skb) < len)) return -EMSGSIZE; __nla_put_64bit(skb, attrtype, attrlen, data, padattr); return 0; } EXPORT_SYMBOL(nla_put_64bit); /** * nla_put_nohdr - Add a netlink attribute without header * @skb: socket buffer to add attribute to * @attrlen: length of attribute payload * @data: head of attribute payload * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute payload. */ int nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data) { if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) return -EMSGSIZE; __nla_put_nohdr(skb, attrlen, data); return 0; } EXPORT_SYMBOL(nla_put_nohdr); /** * nla_append - Add a netlink attribute without header or padding * @skb: socket buffer to add attribute to * @attrlen: length of attribute payload * @data: head of attribute payload * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute payload. */ int nla_append(struct sk_buff *skb, int attrlen, const void *data) { if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) return -EMSGSIZE; skb_put_data(skb, data, attrlen); return 0; } EXPORT_SYMBOL(nla_append); #endif
3 3 3 12 11 12 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ #include "bat_iv_ogm.h" #include "main.h" #include <linux/atomic.h> #include <linux/bitmap.h> #include <linux/bitops.h> #include <linux/bug.h> #include <linux/byteorder/generic.h> #include <linux/cache.h> #include <linux/container_of.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/init.h> #include <linux/jiffies.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/pkt_sched.h> #include <linux/printk.h> #include <linux/random.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> #include <linux/workqueue.h> #include <net/genetlink.h> #include <net/netlink.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "bat_algo.h" #include "bitarray.h" #include "gateway_client.h" #include "hard-interface.h" #include "hash.h" #include "log.h" #include "netlink.h" #include "network-coding.h" #include "originator.h" #include "routing.h" #include "send.h" #include "translation-table.h" #include "tvlv.h" static void batadv_iv_send_outstanding_bat_ogm_packet(struct work_struct *work); /** * enum batadv_dup_status - duplicate status */ enum batadv_dup_status { /** @BATADV_NO_DUP: the packet is no duplicate */ BATADV_NO_DUP = 0, /** * @BATADV_ORIG_DUP: OGM is a duplicate in the originator (but not for * the neighbor) */ BATADV_ORIG_DUP, /** @BATADV_NEIGH_DUP: OGM is a duplicate for the neighbor */ BATADV_NEIGH_DUP, /** * @BATADV_PROTECTED: originator is currently protected (after reboot) */ BATADV_PROTECTED, }; /** * batadv_ring_buffer_set() - update the ring buffer with the given value * @lq_recv: pointer to the ring buffer * @lq_index: index to store the value at * @value: value to store in the ring buffer */ static void batadv_ring_buffer_set(u8 lq_recv[], u8 *lq_index, u8 value) { lq_recv[*lq_index] = value; *lq_index = (*lq_index + 1) % BATADV_TQ_GLOBAL_WINDOW_SIZE; } /** * batadv_ring_buffer_avg() - compute the average of all non-zero values stored * in the given ring buffer * @lq_recv: pointer to the ring buffer * * Return: computed average value. */ static u8 batadv_ring_buffer_avg(const u8 lq_recv[]) { const u8 *ptr; u16 count = 0; u16 i = 0; u16 sum = 0; ptr = lq_recv; while (i < BATADV_TQ_GLOBAL_WINDOW_SIZE) { if (*ptr != 0) { count++; sum += *ptr; } i++; ptr++; } if (count == 0) return 0; return (u8)(sum / count); } /** * batadv_iv_ogm_orig_get() - retrieve or create (if does not exist) an * originator * @bat_priv: the bat priv with all the soft interface information * @addr: mac address of the originator * * Return: the originator object corresponding to the passed mac address or NULL * on failure. * If the object does not exist, it is created and initialised. */ static struct batadv_orig_node * batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr) { struct batadv_orig_node *orig_node; int hash_added; orig_node = batadv_orig_hash_find(bat_priv, addr); if (orig_node) return orig_node; orig_node = batadv_orig_node_new(bat_priv, addr); if (!orig_node) return NULL; spin_lock_init(&orig_node->bat_iv.ogm_cnt_lock); kref_get(&orig_node->refcount); hash_added = batadv_hash_add(bat_priv->orig_hash, batadv_compare_orig, batadv_choose_orig, orig_node, &orig_node->hash_entry); if (hash_added != 0) goto free_orig_node_hash; return orig_node; free_orig_node_hash: /* reference for batadv_hash_add */ batadv_orig_node_put(orig_node); /* reference from batadv_orig_node_new */ batadv_orig_node_put(orig_node); return NULL; } static struct batadv_neigh_node * batadv_iv_ogm_neigh_new(struct batadv_hard_iface *hard_iface, const u8 *neigh_addr, struct batadv_orig_node *orig_node, struct batadv_orig_node *orig_neigh) { struct batadv_neigh_node *neigh_node; neigh_node = batadv_neigh_node_get_or_create(orig_node, hard_iface, neigh_addr); if (!neigh_node) goto out; neigh_node->orig_node = orig_neigh; out: return neigh_node; } static int batadv_iv_ogm_iface_enable(struct batadv_hard_iface *hard_iface) { struct batadv_ogm_packet *batadv_ogm_packet; unsigned char *ogm_buff; u32 random_seqno; mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex); /* randomize initial seqno to avoid collision */ get_random_bytes(&random_seqno, sizeof(random_seqno)); atomic_set(&hard_iface->bat_iv.ogm_seqno, random_seqno); hard_iface->bat_iv.ogm_buff_len = BATADV_OGM_HLEN; ogm_buff = kmalloc(hard_iface->bat_iv.ogm_buff_len, GFP_ATOMIC); if (!ogm_buff) { mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); return -ENOMEM; } hard_iface->bat_iv.ogm_buff = ogm_buff; batadv_ogm_packet = (struct batadv_ogm_packet *)ogm_buff; batadv_ogm_packet->packet_type = BATADV_IV_OGM; batadv_ogm_packet->version = BATADV_COMPAT_VERSION; batadv_ogm_packet->ttl = 2; batadv_ogm_packet->flags = BATADV_NO_FLAGS; batadv_ogm_packet->reserved = 0; batadv_ogm_packet->tq = BATADV_TQ_MAX_VALUE; mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); return 0; } static void batadv_iv_ogm_iface_disable(struct batadv_hard_iface *hard_iface) { mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex); kfree(hard_iface->bat_iv.ogm_buff); hard_iface->bat_iv.ogm_buff = NULL; mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); } static void batadv_iv_ogm_iface_update_mac(struct batadv_hard_iface *hard_iface) { struct batadv_ogm_packet *batadv_ogm_packet; void *ogm_buff; mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex); ogm_buff = hard_iface->bat_iv.ogm_buff; if (!ogm_buff) goto unlock; batadv_ogm_packet = ogm_buff; ether_addr_copy(batadv_ogm_packet->orig, hard_iface->net_dev->dev_addr); ether_addr_copy(batadv_ogm_packet->prev_sender, hard_iface->net_dev->dev_addr); unlock: mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); } static void batadv_iv_ogm_primary_iface_set(struct batadv_hard_iface *hard_iface) { struct batadv_ogm_packet *batadv_ogm_packet; void *ogm_buff; mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex); ogm_buff = hard_iface->bat_iv.ogm_buff; if (!ogm_buff) goto unlock; batadv_ogm_packet = ogm_buff; batadv_ogm_packet->ttl = BATADV_TTL; unlock: mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); } /* when do we schedule our own ogm to be sent */ static unsigned long batadv_iv_ogm_emit_send_time(const struct batadv_priv *bat_priv) { unsigned int msecs; msecs = atomic_read(&bat_priv->orig_interval) - BATADV_JITTER; msecs += get_random_u32_below(2 * BATADV_JITTER); return jiffies + msecs_to_jiffies(msecs); } /* when do we schedule a ogm packet to be sent */ static unsigned long batadv_iv_ogm_fwd_send_time(void) { return jiffies + msecs_to_jiffies(get_random_u32_below(BATADV_JITTER / 2)); } /* apply hop penalty for a normal link */ static u8 batadv_hop_penalty(u8 tq, const struct batadv_priv *bat_priv) { int hop_penalty = atomic_read(&bat_priv->hop_penalty); int new_tq; new_tq = tq * (BATADV_TQ_MAX_VALUE - hop_penalty); new_tq /= BATADV_TQ_MAX_VALUE; return new_tq; } /** * batadv_iv_ogm_aggr_packet() - checks if there is another OGM attached * @buff_pos: current position in the skb * @packet_len: total length of the skb * @ogm_packet: potential OGM in buffer * * Return: true if there is enough space for another OGM, false otherwise. */ static bool batadv_iv_ogm_aggr_packet(int buff_pos, int packet_len, const struct batadv_ogm_packet *ogm_packet) { int next_buff_pos = 0; /* check if there is enough space for the header */ next_buff_pos += buff_pos + sizeof(*ogm_packet); if (next_buff_pos > packet_len) return false; /* check if there is enough space for the optional TVLV */ next_buff_pos += ntohs(ogm_packet->tvlv_len); return (next_buff_pos <= packet_len) && (next_buff_pos <= BATADV_MAX_AGGREGATION_BYTES); } /* send a batman ogm to a given interface */ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet, struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); const char *fwd_str; u8 packet_num; s16 buff_pos; struct batadv_ogm_packet *batadv_ogm_packet; struct sk_buff *skb; u8 *packet_pos; if (hard_iface->if_status != BATADV_IF_ACTIVE) return; packet_num = 0; buff_pos = 0; packet_pos = forw_packet->skb->data; batadv_ogm_packet = (struct batadv_ogm_packet *)packet_pos; /* adjust all flags and log packets */ while (batadv_iv_ogm_aggr_packet(buff_pos, forw_packet->packet_len, batadv_ogm_packet)) { /* we might have aggregated direct link packets with an * ordinary base packet */ if (forw_packet->direct_link_flags & BIT(packet_num) && forw_packet->if_incoming == hard_iface) batadv_ogm_packet->flags |= BATADV_DIRECTLINK; else batadv_ogm_packet->flags &= ~BATADV_DIRECTLINK; if (packet_num > 0 || !forw_packet->own) fwd_str = "Forwarding"; else fwd_str = "Sending own"; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "%s %spacket (originator %pM, seqno %u, TQ %d, TTL %d, IDF %s) on interface %s [%pM]\n", fwd_str, (packet_num > 0 ? "aggregated " : ""), batadv_ogm_packet->orig, ntohl(batadv_ogm_packet->seqno), batadv_ogm_packet->tq, batadv_ogm_packet->ttl, ((batadv_ogm_packet->flags & BATADV_DIRECTLINK) ? "on" : "off"), hard_iface->net_dev->name, hard_iface->net_dev->dev_addr); buff_pos += BATADV_OGM_HLEN; buff_pos += ntohs(batadv_ogm_packet->tvlv_len); packet_num++; packet_pos = forw_packet->skb->data + buff_pos; batadv_ogm_packet = (struct batadv_ogm_packet *)packet_pos; } /* create clone because function is called more than once */ skb = skb_clone(forw_packet->skb, GFP_ATOMIC); if (skb) { batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_TX); batadv_add_counter(bat_priv, BATADV_CNT_MGMT_TX_BYTES, skb->len + ETH_HLEN); batadv_send_broadcast_skb(skb, hard_iface); } } /* send a batman ogm packet */ static void batadv_iv_ogm_emit(struct batadv_forw_packet *forw_packet) { struct net_device *soft_iface; if (!forw_packet->if_incoming) { pr_err("Error - can't forward packet: incoming iface not specified\n"); return; } soft_iface = forw_packet->if_incoming->soft_iface; if (WARN_ON(!forw_packet->if_outgoing)) return; if (forw_packet->if_outgoing->soft_iface != soft_iface) { pr_warn("%s: soft interface switch for queued OGM\n", __func__); return; } if (forw_packet->if_incoming->if_status != BATADV_IF_ACTIVE) return; /* only for one specific outgoing interface */ batadv_iv_ogm_send_to_if(forw_packet, forw_packet->if_outgoing); } /** * batadv_iv_ogm_can_aggregate() - find out if an OGM can be aggregated on an * existing forward packet * @new_bat_ogm_packet: OGM packet to be aggregated * @bat_priv: the bat priv with all the soft interface information * @packet_len: (total) length of the OGM * @send_time: timestamp (jiffies) when the packet is to be sent * @directlink: true if this is a direct link packet * @if_incoming: interface where the packet was received * @if_outgoing: interface for which the retransmission should be considered * @forw_packet: the forwarded packet which should be checked * * Return: true if new_packet can be aggregated with forw_packet */ static bool batadv_iv_ogm_can_aggregate(const struct batadv_ogm_packet *new_bat_ogm_packet, struct batadv_priv *bat_priv, int packet_len, unsigned long send_time, bool directlink, const struct batadv_hard_iface *if_incoming, const struct batadv_hard_iface *if_outgoing, const struct batadv_forw_packet *forw_packet) { struct batadv_ogm_packet *batadv_ogm_packet; int aggregated_bytes = forw_packet->packet_len + packet_len; struct batadv_hard_iface *primary_if = NULL; bool res = false; unsigned long aggregation_end_time; batadv_ogm_packet = (struct batadv_ogm_packet *)forw_packet->skb->data; aggregation_end_time = send_time; aggregation_end_time += msecs_to_jiffies(BATADV_MAX_AGGREGATION_MS); /* we can aggregate the current packet to this aggregated packet * if: * * - the send time is within our MAX_AGGREGATION_MS time * - the resulting packet won't be bigger than * MAX_AGGREGATION_BYTES * otherwise aggregation is not possible */ if (!time_before(send_time, forw_packet->send_time) || !time_after_eq(aggregation_end_time, forw_packet->send_time)) return false; if (aggregated_bytes > BATADV_MAX_AGGREGATION_BYTES) return false; /* packet is not leaving on the same interface. */ if (forw_packet->if_outgoing != if_outgoing) return false; /* check aggregation compatibility * -> direct link packets are broadcasted on * their interface only * -> aggregate packet if the current packet is * a "global" packet as well as the base * packet */ primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) return false; /* packets without direct link flag and high TTL * are flooded through the net */ if (!directlink && !(batadv_ogm_packet->flags & BATADV_DIRECTLINK) && batadv_ogm_packet->ttl != 1 && /* own packets originating non-primary * interfaces leave only that interface */ (!forw_packet->own || forw_packet->if_incoming == primary_if)) { res = true; goto out; } /* if the incoming packet is sent via this one * interface only - we still can aggregate */ if (directlink && new_bat_ogm_packet->ttl == 1 && forw_packet->if_incoming == if_incoming && /* packets from direct neighbors or * own secondary interface packets * (= secondary interface packets in general) */ (batadv_ogm_packet->flags & BATADV_DIRECTLINK || (forw_packet->own && forw_packet->if_incoming != primary_if))) { res = true; goto out; } out: batadv_hardif_put(primary_if); return res; } /** * batadv_iv_ogm_aggregate_new() - create a new aggregated packet and add this * packet to it. * @packet_buff: pointer to the OGM * @packet_len: (total) length of the OGM * @send_time: timestamp (jiffies) when the packet is to be sent * @direct_link: whether this OGM has direct link status * @if_incoming: interface where the packet was received * @if_outgoing: interface for which the retransmission should be considered * @own_packet: true if it is a self-generated ogm */ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, int packet_len, unsigned long send_time, bool direct_link, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing, int own_packet) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); struct batadv_forw_packet *forw_packet_aggr; struct sk_buff *skb; unsigned char *skb_buff; unsigned int skb_size; atomic_t *queue_left = own_packet ? NULL : &bat_priv->batman_queue_left; if (atomic_read(&bat_priv->aggregated_ogms) && packet_len < BATADV_MAX_AGGREGATION_BYTES) skb_size = BATADV_MAX_AGGREGATION_BYTES; else skb_size = packet_len; skb_size += ETH_HLEN; skb = netdev_alloc_skb_ip_align(NULL, skb_size); if (!skb) return; forw_packet_aggr = batadv_forw_packet_alloc(if_incoming, if_outgoing, queue_left, bat_priv, skb); if (!forw_packet_aggr) { kfree_skb(skb); return; } forw_packet_aggr->skb->priority = TC_PRIO_CONTROL; skb_reserve(forw_packet_aggr->skb, ETH_HLEN); skb_buff = skb_put(forw_packet_aggr->skb, packet_len); forw_packet_aggr->packet_len = packet_len; memcpy(skb_buff, packet_buff, packet_len); forw_packet_aggr->own = own_packet; forw_packet_aggr->direct_link_flags = BATADV_NO_FLAGS; forw_packet_aggr->send_time = send_time; /* save packet direct link flag status */ if (direct_link) forw_packet_aggr->direct_link_flags |= 1; INIT_DELAYED_WORK(&forw_packet_aggr->delayed_work, batadv_iv_send_outstanding_bat_ogm_packet); batadv_forw_packet_ogmv1_queue(bat_priv, forw_packet_aggr, send_time); } /* aggregate a new packet into the existing ogm packet */ static void batadv_iv_ogm_aggregate(struct batadv_forw_packet *forw_packet_aggr, const unsigned char *packet_buff, int packet_len, bool direct_link) { unsigned long new_direct_link_flag; skb_put_data(forw_packet_aggr->skb, packet_buff, packet_len); forw_packet_aggr->packet_len += packet_len; forw_packet_aggr->num_packets++; /* save packet direct link flag status */ if (direct_link) { new_direct_link_flag = BIT(forw_packet_aggr->num_packets); forw_packet_aggr->direct_link_flags |= new_direct_link_flag; } } /** * batadv_iv_ogm_queue_add() - queue up an OGM for transmission * @bat_priv: the bat priv with all the soft interface information * @packet_buff: pointer to the OGM * @packet_len: (total) length of the OGM * @if_incoming: interface where the packet was received * @if_outgoing: interface for which the retransmission should be considered * @own_packet: true if it is a self-generated ogm * @send_time: timestamp (jiffies) when the packet is to be sent */ static void batadv_iv_ogm_queue_add(struct batadv_priv *bat_priv, unsigned char *packet_buff, int packet_len, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing, int own_packet, unsigned long send_time) { /* _aggr -> pointer to the packet we want to aggregate with * _pos -> pointer to the position in the queue */ struct batadv_forw_packet *forw_packet_aggr = NULL; struct batadv_forw_packet *forw_packet_pos = NULL; struct batadv_ogm_packet *batadv_ogm_packet; bool direct_link; unsigned long max_aggregation_jiffies; batadv_ogm_packet = (struct batadv_ogm_packet *)packet_buff; direct_link = !!(batadv_ogm_packet->flags & BATADV_DIRECTLINK); max_aggregation_jiffies = msecs_to_jiffies(BATADV_MAX_AGGREGATION_MS); /* find position for the packet in the forward queue */ spin_lock_bh(&bat_priv->forw_bat_list_lock); /* own packets are not to be aggregated */ if (atomic_read(&bat_priv->aggregated_ogms) && !own_packet) { hlist_for_each_entry(forw_packet_pos, &bat_priv->forw_bat_list, list) { if (batadv_iv_ogm_can_aggregate(batadv_ogm_packet, bat_priv, packet_len, send_time, direct_link, if_incoming, if_outgoing, forw_packet_pos)) { forw_packet_aggr = forw_packet_pos; break; } } } /* nothing to aggregate with - either aggregation disabled or no * suitable aggregation packet found */ if (!forw_packet_aggr) { /* the following section can run without the lock */ spin_unlock_bh(&bat_priv->forw_bat_list_lock); /* if we could not aggregate this packet with one of the others * we hold it back for a while, so that it might be aggregated * later on */ if (!own_packet && atomic_read(&bat_priv->aggregated_ogms)) send_time += max_aggregation_jiffies; batadv_iv_ogm_aggregate_new(packet_buff, packet_len, send_time, direct_link, if_incoming, if_outgoing, own_packet); } else { batadv_iv_ogm_aggregate(forw_packet_aggr, packet_buff, packet_len, direct_link); spin_unlock_bh(&bat_priv->forw_bat_list_lock); } } static void batadv_iv_ogm_forward(struct batadv_orig_node *orig_node, const struct ethhdr *ethhdr, struct batadv_ogm_packet *batadv_ogm_packet, bool is_single_hop_neigh, bool is_from_best_next_hop, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); u16 tvlv_len; if (batadv_ogm_packet->ttl <= 1) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "ttl exceeded\n"); return; } if (!is_from_best_next_hop) { /* Mark the forwarded packet when it is not coming from our * best next hop. We still need to forward the packet for our * neighbor link quality detection to work in case the packet * originated from a single hop neighbor. Otherwise we can * simply drop the ogm. */ if (is_single_hop_neigh) batadv_ogm_packet->flags |= BATADV_NOT_BEST_NEXT_HOP; else return; } tvlv_len = ntohs(batadv_ogm_packet->tvlv_len); batadv_ogm_packet->ttl--; ether_addr_copy(batadv_ogm_packet->prev_sender, ethhdr->h_source); /* apply hop penalty */ batadv_ogm_packet->tq = batadv_hop_penalty(batadv_ogm_packet->tq, bat_priv); batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Forwarding packet: tq: %i, ttl: %i\n", batadv_ogm_packet->tq, batadv_ogm_packet->ttl); if (is_single_hop_neigh) batadv_ogm_packet->flags |= BATADV_DIRECTLINK; else batadv_ogm_packet->flags &= ~BATADV_DIRECTLINK; batadv_iv_ogm_queue_add(bat_priv, (unsigned char *)batadv_ogm_packet, BATADV_OGM_HLEN + tvlv_len, if_incoming, if_outgoing, 0, batadv_iv_ogm_fwd_send_time()); } /** * batadv_iv_ogm_slide_own_bcast_window() - bitshift own OGM broadcast windows * for the given interface * @hard_iface: the interface for which the windows have to be shifted */ static void batadv_iv_ogm_slide_own_bcast_window(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); struct batadv_hashtable *hash = bat_priv->orig_hash; struct hlist_head *head; struct batadv_orig_node *orig_node; struct batadv_orig_ifinfo *orig_ifinfo; unsigned long *word; u32 i; u8 *w; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, head, hash_entry) { hlist_for_each_entry_rcu(orig_ifinfo, &orig_node->ifinfo_list, list) { if (orig_ifinfo->if_outgoing != hard_iface) continue; spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); word = orig_ifinfo->bat_iv.bcast_own; batadv_bit_get_packet(bat_priv, word, 1, 0); w = &orig_ifinfo->bat_iv.bcast_own_sum; *w = bitmap_weight(word, BATADV_TQ_LOCAL_WINDOW_SIZE); spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); } } rcu_read_unlock(); } } /** * batadv_iv_ogm_schedule_buff() - schedule submission of hardif ogm buffer * @hard_iface: interface whose ogm buffer should be transmitted */ static void batadv_iv_ogm_schedule_buff(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); unsigned char **ogm_buff = &hard_iface->bat_iv.ogm_buff; struct batadv_ogm_packet *batadv_ogm_packet; struct batadv_hard_iface *primary_if, *tmp_hard_iface; int *ogm_buff_len = &hard_iface->bat_iv.ogm_buff_len; u32 seqno; u16 tvlv_len = 0; unsigned long send_time; lockdep_assert_held(&hard_iface->bat_iv.ogm_buff_mutex); /* interface already disabled by batadv_iv_ogm_iface_disable */ if (!*ogm_buff) return; /* the interface gets activated here to avoid race conditions between * the moment of activating the interface in * hardif_activate_interface() where the originator mac is set and * outdated packets (especially uninitialized mac addresses) in the * packet queue */ if (hard_iface->if_status == BATADV_IF_TO_BE_ACTIVATED) hard_iface->if_status = BATADV_IF_ACTIVE; primary_if = batadv_primary_if_get_selected(bat_priv); if (hard_iface == primary_if) { /* tt changes have to be committed before the tvlv data is * appended as it may alter the tt tvlv container */ batadv_tt_local_commit_changes(bat_priv); tvlv_len = batadv_tvlv_container_ogm_append(bat_priv, ogm_buff, ogm_buff_len, BATADV_OGM_HLEN); } batadv_ogm_packet = (struct batadv_ogm_packet *)(*ogm_buff); batadv_ogm_packet->tvlv_len = htons(tvlv_len); /* change sequence number to network order */ seqno = (u32)atomic_read(&hard_iface->bat_iv.ogm_seqno); batadv_ogm_packet->seqno = htonl(seqno); atomic_inc(&hard_iface->bat_iv.ogm_seqno); batadv_iv_ogm_slide_own_bcast_window(hard_iface); send_time = batadv_iv_ogm_emit_send_time(bat_priv); if (hard_iface != primary_if) { /* OGMs from secondary interfaces are only scheduled on their * respective interfaces. */ batadv_iv_ogm_queue_add(bat_priv, *ogm_buff, *ogm_buff_len, hard_iface, hard_iface, 1, send_time); goto out; } /* OGMs from primary interfaces are scheduled on all * interfaces. */ rcu_read_lock(); list_for_each_entry_rcu(tmp_hard_iface, &batadv_hardif_list, list) { if (tmp_hard_iface->soft_iface != hard_iface->soft_iface) continue; if (!kref_get_unless_zero(&tmp_hard_iface->refcount)) continue; batadv_iv_ogm_queue_add(bat_priv, *ogm_buff, *ogm_buff_len, hard_iface, tmp_hard_iface, 1, send_time); batadv_hardif_put(tmp_hard_iface); } rcu_read_unlock(); out: batadv_hardif_put(primary_if); } static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) { if (hard_iface->if_status == BATADV_IF_NOT_IN_USE || hard_iface->if_status == BATADV_IF_TO_BE_REMOVED) return; mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex); batadv_iv_ogm_schedule_buff(hard_iface); mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); } /** * batadv_iv_orig_ifinfo_sum() - Get bcast_own sum for originator over interface * @orig_node: originator which reproadcasted the OGMs directly * @if_outgoing: interface which transmitted the original OGM and received the * direct rebroadcast * * Return: Number of replied (rebroadcasted) OGMs which were transmitted by * an originator and directly (without intermediate hop) received by a specific * interface */ static u8 batadv_iv_orig_ifinfo_sum(struct batadv_orig_node *orig_node, struct batadv_hard_iface *if_outgoing) { struct batadv_orig_ifinfo *orig_ifinfo; u8 sum; orig_ifinfo = batadv_orig_ifinfo_get(orig_node, if_outgoing); if (!orig_ifinfo) return 0; spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); sum = orig_ifinfo->bat_iv.bcast_own_sum; spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); batadv_orig_ifinfo_put(orig_ifinfo); return sum; } /** * batadv_iv_ogm_orig_update() - use OGM to update corresponding data in an * originator * @bat_priv: the bat priv with all the soft interface information * @orig_node: the orig node who originally emitted the ogm packet * @orig_ifinfo: ifinfo for the outgoing interface of the orig_node * @ethhdr: Ethernet header of the OGM * @batadv_ogm_packet: the ogm packet * @if_incoming: interface where the packet was received * @if_outgoing: interface for which the retransmission should be considered * @dup_status: the duplicate status of this ogm packet. */ static void batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, struct batadv_orig_ifinfo *orig_ifinfo, const struct ethhdr *ethhdr, const struct batadv_ogm_packet *batadv_ogm_packet, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing, enum batadv_dup_status dup_status) { struct batadv_neigh_ifinfo *neigh_ifinfo = NULL; struct batadv_neigh_ifinfo *router_ifinfo = NULL; struct batadv_neigh_node *neigh_node = NULL; struct batadv_neigh_node *tmp_neigh_node = NULL; struct batadv_neigh_node *router = NULL; u8 sum_orig, sum_neigh; u8 *neigh_addr; u8 tq_avg; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "%s(): Searching and updating originator entry of received packet\n", __func__); rcu_read_lock(); hlist_for_each_entry_rcu(tmp_neigh_node, &orig_node->neigh_list, list) { neigh_addr = tmp_neigh_node->addr; if (batadv_compare_eth(neigh_addr, ethhdr->h_source) && tmp_neigh_node->if_incoming == if_incoming && kref_get_unless_zero(&tmp_neigh_node->refcount)) { if (WARN(neigh_node, "too many matching neigh_nodes")) batadv_neigh_node_put(neigh_node); neigh_node = tmp_neigh_node; continue; } if (dup_status != BATADV_NO_DUP) continue; /* only update the entry for this outgoing interface */ neigh_ifinfo = batadv_neigh_ifinfo_get(tmp_neigh_node, if_outgoing); if (!neigh_ifinfo) continue; spin_lock_bh(&tmp_neigh_node->ifinfo_lock); batadv_ring_buffer_set(neigh_ifinfo->bat_iv.tq_recv, &neigh_ifinfo->bat_iv.tq_index, 0); tq_avg = batadv_ring_buffer_avg(neigh_ifinfo->bat_iv.tq_recv); neigh_ifinfo->bat_iv.tq_avg = tq_avg; spin_unlock_bh(&tmp_neigh_node->ifinfo_lock); batadv_neigh_ifinfo_put(neigh_ifinfo); neigh_ifinfo = NULL; } if (!neigh_node) { struct batadv_orig_node *orig_tmp; orig_tmp = batadv_iv_ogm_orig_get(bat_priv, ethhdr->h_source); if (!orig_tmp) goto unlock; neigh_node = batadv_iv_ogm_neigh_new(if_incoming, ethhdr->h_source, orig_node, orig_tmp); batadv_orig_node_put(orig_tmp); if (!neigh_node) goto unlock; } else { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Updating existing last-hop neighbor of originator\n"); } rcu_read_unlock(); neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing); if (!neigh_ifinfo) goto out; neigh_node->last_seen = jiffies; spin_lock_bh(&neigh_node->ifinfo_lock); batadv_ring_buffer_set(neigh_ifinfo->bat_iv.tq_recv, &neigh_ifinfo->bat_iv.tq_index, batadv_ogm_packet->tq); tq_avg = batadv_ring_buffer_avg(neigh_ifinfo->bat_iv.tq_recv); neigh_ifinfo->bat_iv.tq_avg = tq_avg; spin_unlock_bh(&neigh_node->ifinfo_lock); if (dup_status == BATADV_NO_DUP) { orig_ifinfo->last_ttl = batadv_ogm_packet->ttl; neigh_ifinfo->last_ttl = batadv_ogm_packet->ttl; } /* if this neighbor already is our next hop there is nothing * to change */ router = batadv_orig_router_get(orig_node, if_outgoing); if (router == neigh_node) goto out; if (router) { router_ifinfo = batadv_neigh_ifinfo_get(router, if_outgoing); if (!router_ifinfo) goto out; /* if this neighbor does not offer a better TQ we won't * consider it */ if (router_ifinfo->bat_iv.tq_avg > neigh_ifinfo->bat_iv.tq_avg) goto out; } /* if the TQ is the same and the link not more symmetric we * won't consider it either */ if (router_ifinfo && neigh_ifinfo->bat_iv.tq_avg == router_ifinfo->bat_iv.tq_avg) { sum_orig = batadv_iv_orig_ifinfo_sum(router->orig_node, router->if_incoming); sum_neigh = batadv_iv_orig_ifinfo_sum(neigh_node->orig_node, neigh_node->if_incoming); if (sum_orig >= sum_neigh) goto out; } batadv_update_route(bat_priv, orig_node, if_outgoing, neigh_node); goto out; unlock: rcu_read_unlock(); out: batadv_neigh_node_put(neigh_node); batadv_neigh_node_put(router); batadv_neigh_ifinfo_put(neigh_ifinfo); batadv_neigh_ifinfo_put(router_ifinfo); } /** * batadv_iv_ogm_calc_tq() - calculate tq for current received ogm packet * @orig_node: the orig node who originally emitted the ogm packet * @orig_neigh_node: the orig node struct of the neighbor who sent the packet * @batadv_ogm_packet: the ogm packet * @if_incoming: interface where the packet was received * @if_outgoing: interface for which the retransmission should be considered * * Return: true if the link can be considered bidirectional, false otherwise */ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, struct batadv_orig_node *orig_neigh_node, struct batadv_ogm_packet *batadv_ogm_packet, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); struct batadv_neigh_node *neigh_node = NULL, *tmp_neigh_node; struct batadv_neigh_ifinfo *neigh_ifinfo; u8 total_count; u8 orig_eq_count, neigh_rq_count, neigh_rq_inv, tq_own; unsigned int tq_iface_hop_penalty = BATADV_TQ_MAX_VALUE; unsigned int neigh_rq_inv_cube, neigh_rq_max_cube; unsigned int tq_asym_penalty, inv_asym_penalty; unsigned int combined_tq; bool ret = false; /* find corresponding one hop neighbor */ rcu_read_lock(); hlist_for_each_entry_rcu(tmp_neigh_node, &orig_neigh_node->neigh_list, list) { if (!batadv_compare_eth(tmp_neigh_node->addr, orig_neigh_node->orig)) continue; if (tmp_neigh_node->if_incoming != if_incoming) continue; if (!kref_get_unless_zero(&tmp_neigh_node->refcount)) continue; neigh_node = tmp_neigh_node; break; } rcu_read_unlock(); if (!neigh_node) neigh_node = batadv_iv_ogm_neigh_new(if_incoming, orig_neigh_node->orig, orig_neigh_node, orig_neigh_node); if (!neigh_node) goto out; /* if orig_node is direct neighbor update neigh_node last_seen */ if (orig_node == orig_neigh_node) neigh_node->last_seen = jiffies; orig_node->last_seen = jiffies; /* find packet count of corresponding one hop neighbor */ orig_eq_count = batadv_iv_orig_ifinfo_sum(orig_neigh_node, if_incoming); neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing); if (neigh_ifinfo) { neigh_rq_count = neigh_ifinfo->bat_iv.real_packet_count; batadv_neigh_ifinfo_put(neigh_ifinfo); } else { neigh_rq_count = 0; } /* pay attention to not get a value bigger than 100 % */ if (orig_eq_count > neigh_rq_count) total_count = neigh_rq_count; else total_count = orig_eq_count; /* if we have too few packets (too less data) we set tq_own to zero * if we receive too few packets it is not considered bidirectional */ if (total_count < BATADV_TQ_LOCAL_BIDRECT_SEND_MINIMUM || neigh_rq_count < BATADV_TQ_LOCAL_BIDRECT_RECV_MINIMUM) tq_own = 0; else /* neigh_node->real_packet_count is never zero as we * only purge old information when getting new * information */ tq_own = (BATADV_TQ_MAX_VALUE * total_count) / neigh_rq_count; /* 1 - ((1-x) ** 3), normalized to TQ_MAX_VALUE this does * affect the nearly-symmetric links only a little, but * punishes asymmetric links more. This will give a value * between 0 and TQ_MAX_VALUE */ neigh_rq_inv = BATADV_TQ_LOCAL_WINDOW_SIZE - neigh_rq_count; neigh_rq_inv_cube = neigh_rq_inv * neigh_rq_inv * neigh_rq_inv; neigh_rq_max_cube = BATADV_TQ_LOCAL_WINDOW_SIZE * BATADV_TQ_LOCAL_WINDOW_SIZE * BATADV_TQ_LOCAL_WINDOW_SIZE; inv_asym_penalty = BATADV_TQ_MAX_VALUE * neigh_rq_inv_cube; inv_asym_penalty /= neigh_rq_max_cube; tq_asym_penalty = BATADV_TQ_MAX_VALUE - inv_asym_penalty; tq_iface_hop_penalty -= atomic_read(&if_incoming->hop_penalty); /* penalize if the OGM is forwarded on the same interface. WiFi * interfaces and other half duplex devices suffer from throughput * drops as they can't send and receive at the same time. */ if (if_outgoing && if_incoming == if_outgoing && batadv_is_wifi_hardif(if_outgoing)) tq_iface_hop_penalty = batadv_hop_penalty(tq_iface_hop_penalty, bat_priv); combined_tq = batadv_ogm_packet->tq * tq_own * tq_asym_penalty * tq_iface_hop_penalty; combined_tq /= BATADV_TQ_MAX_VALUE * BATADV_TQ_MAX_VALUE * BATADV_TQ_MAX_VALUE; batadv_ogm_packet->tq = combined_tq; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "bidirectional: orig = %pM neigh = %pM => own_bcast = %2i, real recv = %2i, local tq: %3i, asym_penalty: %3i, iface_hop_penalty: %3i, total tq: %3i, if_incoming = %s, if_outgoing = %s\n", orig_node->orig, orig_neigh_node->orig, total_count, neigh_rq_count, tq_own, tq_asym_penalty, tq_iface_hop_penalty, batadv_ogm_packet->tq, if_incoming->net_dev->name, if_outgoing ? if_outgoing->net_dev->name : "DEFAULT"); /* if link has the minimum required transmission quality * consider it bidirectional */ if (batadv_ogm_packet->tq >= BATADV_TQ_TOTAL_BIDRECT_LIMIT) ret = true; out: batadv_neigh_node_put(neigh_node); return ret; } /** * batadv_iv_ogm_update_seqnos() - process a batman packet for all interfaces, * adjust the sequence number and find out whether it is a duplicate * @ethhdr: ethernet header of the packet * @batadv_ogm_packet: OGM packet to be considered * @if_incoming: interface on which the OGM packet was received * @if_outgoing: interface for which the retransmission should be considered * * Return: duplicate status as enum batadv_dup_status */ static enum batadv_dup_status batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr, const struct batadv_ogm_packet *batadv_ogm_packet, const struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); struct batadv_orig_node *orig_node; struct batadv_orig_ifinfo *orig_ifinfo = NULL; struct batadv_neigh_node *neigh_node; struct batadv_neigh_ifinfo *neigh_ifinfo; bool is_dup; s32 seq_diff; bool need_update = false; int set_mark; enum batadv_dup_status ret = BATADV_NO_DUP; u32 seqno = ntohl(batadv_ogm_packet->seqno); u8 *neigh_addr; u8 packet_count; unsigned long *bitmap; orig_node = batadv_iv_ogm_orig_get(bat_priv, batadv_ogm_packet->orig); if (!orig_node) return BATADV_NO_DUP; orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing); if (WARN_ON(!orig_ifinfo)) { batadv_orig_node_put(orig_node); return 0; } spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); seq_diff = seqno - orig_ifinfo->last_real_seqno; /* signalize caller that the packet is to be dropped. */ if (!hlist_empty(&orig_node->neigh_list) && batadv_window_protected(bat_priv, seq_diff, BATADV_TQ_LOCAL_WINDOW_SIZE, &orig_ifinfo->batman_seqno_reset, NULL)) { ret = BATADV_PROTECTED; goto out; } rcu_read_lock(); hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) { neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing); if (!neigh_ifinfo) continue; neigh_addr = neigh_node->addr; is_dup = batadv_test_bit(neigh_ifinfo->bat_iv.real_bits, orig_ifinfo->last_real_seqno, seqno); if (batadv_compare_eth(neigh_addr, ethhdr->h_source) && neigh_node->if_incoming == if_incoming) { set_mark = 1; if (is_dup) ret = BATADV_NEIGH_DUP; } else { set_mark = 0; if (is_dup && ret != BATADV_NEIGH_DUP) ret = BATADV_ORIG_DUP; } /* if the window moved, set the update flag. */ bitmap = neigh_ifinfo->bat_iv.real_bits; need_update |= batadv_bit_get_packet(bat_priv, bitmap, seq_diff, set_mark); packet_count = bitmap_weight(bitmap, BATADV_TQ_LOCAL_WINDOW_SIZE); neigh_ifinfo->bat_iv.real_packet_count = packet_count; batadv_neigh_ifinfo_put(neigh_ifinfo); } rcu_read_unlock(); if (need_update) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "%s updating last_seqno: old %u, new %u\n", if_outgoing ? if_outgoing->net_dev->name : "DEFAULT", orig_ifinfo->last_real_seqno, seqno); orig_ifinfo->last_real_seqno = seqno; } out: spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); batadv_orig_node_put(orig_node); batadv_orig_ifinfo_put(orig_ifinfo); return ret; } /** * batadv_iv_ogm_process_per_outif() - process a batman iv OGM for an outgoing * interface * @skb: the skb containing the OGM * @ogm_offset: offset from skb->data to start of ogm header * @orig_node: the (cached) orig node for the originator of this OGM * @if_incoming: the interface where this packet was received * @if_outgoing: the interface for which the packet should be considered */ static void batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, struct batadv_orig_node *orig_node, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); struct batadv_hardif_neigh_node *hardif_neigh = NULL; struct batadv_neigh_node *router = NULL; struct batadv_neigh_node *router_router = NULL; struct batadv_orig_node *orig_neigh_node; struct batadv_orig_ifinfo *orig_ifinfo; struct batadv_neigh_node *orig_neigh_router = NULL; struct batadv_neigh_ifinfo *router_ifinfo = NULL; struct batadv_ogm_packet *ogm_packet; enum batadv_dup_status dup_status; bool is_from_best_next_hop = false; bool is_single_hop_neigh = false; bool sameseq, similar_ttl; struct sk_buff *skb_priv; struct ethhdr *ethhdr; u8 *prev_sender; bool is_bidirect; /* create a private copy of the skb, as some functions change tq value * and/or flags. */ skb_priv = skb_copy(skb, GFP_ATOMIC); if (!skb_priv) return; ethhdr = eth_hdr(skb_priv); ogm_packet = (struct batadv_ogm_packet *)(skb_priv->data + ogm_offset); dup_status = batadv_iv_ogm_update_seqnos(ethhdr, ogm_packet, if_incoming, if_outgoing); if (batadv_compare_eth(ethhdr->h_source, ogm_packet->orig)) is_single_hop_neigh = true; if (dup_status == BATADV_PROTECTED) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: packet within seqno protection time (sender: %pM)\n", ethhdr->h_source); goto out; } if (ogm_packet->tq == 0) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: originator packet with tq equal 0\n"); goto out; } if (is_single_hop_neigh) { hardif_neigh = batadv_hardif_neigh_get(if_incoming, ethhdr->h_source); if (hardif_neigh) hardif_neigh->last_seen = jiffies; } router = batadv_orig_router_get(orig_node, if_outgoing); if (router) { router_router = batadv_orig_router_get(router->orig_node, if_outgoing); router_ifinfo = batadv_neigh_ifinfo_get(router, if_outgoing); } if ((router_ifinfo && router_ifinfo->bat_iv.tq_avg != 0) && (batadv_compare_eth(router->addr, ethhdr->h_source))) is_from_best_next_hop = true; prev_sender = ogm_packet->prev_sender; /* avoid temporary routing loops */ if (router && router_router && (batadv_compare_eth(router->addr, prev_sender)) && !(batadv_compare_eth(ogm_packet->orig, prev_sender)) && (batadv_compare_eth(router->addr, router_router->addr))) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: ignoring all rebroadcast packets that may make me loop (sender: %pM)\n", ethhdr->h_source); goto out; } if (if_outgoing == BATADV_IF_DEFAULT) batadv_tvlv_ogm_receive(bat_priv, ogm_packet, orig_node); /* if sender is a direct neighbor the sender mac equals * originator mac */ if (is_single_hop_neigh) orig_neigh_node = orig_node; else orig_neigh_node = batadv_iv_ogm_orig_get(bat_priv, ethhdr->h_source); if (!orig_neigh_node) goto out; /* Update nc_nodes of the originator */ batadv_nc_update_nc_node(bat_priv, orig_node, orig_neigh_node, ogm_packet, is_single_hop_neigh); orig_neigh_router = batadv_orig_router_get(orig_neigh_node, if_outgoing); /* drop packet if sender is not a direct neighbor and if we * don't route towards it */ if (!is_single_hop_neigh && !orig_neigh_router) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: OGM via unknown neighbor!\n"); goto out_neigh; } is_bidirect = batadv_iv_ogm_calc_tq(orig_node, orig_neigh_node, ogm_packet, if_incoming, if_outgoing); /* update ranking if it is not a duplicate or has the same * seqno and similar ttl as the non-duplicate */ orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing); if (!orig_ifinfo) goto out_neigh; sameseq = orig_ifinfo->last_real_seqno == ntohl(ogm_packet->seqno); similar_ttl = (orig_ifinfo->last_ttl - 3) <= ogm_packet->ttl; if (is_bidirect && (dup_status == BATADV_NO_DUP || (sameseq && similar_ttl))) { batadv_iv_ogm_orig_update(bat_priv, orig_node, orig_ifinfo, ethhdr, ogm_packet, if_incoming, if_outgoing, dup_status); } batadv_orig_ifinfo_put(orig_ifinfo); /* only forward for specific interface, not for the default one. */ if (if_outgoing == BATADV_IF_DEFAULT) goto out_neigh; /* is single hop (direct) neighbor */ if (is_single_hop_neigh) { /* OGMs from secondary interfaces should only scheduled once * per interface where it has been received, not multiple times */ if (ogm_packet->ttl <= 2 && if_incoming != if_outgoing) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: OGM from secondary interface and wrong outgoing interface\n"); goto out_neigh; } /* mark direct link on incoming interface */ batadv_iv_ogm_forward(orig_node, ethhdr, ogm_packet, is_single_hop_neigh, is_from_best_next_hop, if_incoming, if_outgoing); batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Forwarding packet: rebroadcast neighbor packet with direct link flag\n"); goto out_neigh; } /* multihop originator */ if (!is_bidirect) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: not received via bidirectional link\n"); goto out_neigh; } if (dup_status == BATADV_NEIGH_DUP) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: duplicate packet received\n"); goto out_neigh; } batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Forwarding packet: rebroadcast originator packet\n"); batadv_iv_ogm_forward(orig_node, ethhdr, ogm_packet, is_single_hop_neigh, is_from_best_next_hop, if_incoming, if_outgoing); out_neigh: if (orig_neigh_node && !is_single_hop_neigh) batadv_orig_node_put(orig_neigh_node); out: batadv_neigh_ifinfo_put(router_ifinfo); batadv_neigh_node_put(router); batadv_neigh_node_put(router_router); batadv_neigh_node_put(orig_neigh_router); batadv_hardif_neigh_put(hardif_neigh); consume_skb(skb_priv); } /** * batadv_iv_ogm_process_reply() - Check OGM for direct reply and process it * @ogm_packet: rebroadcast OGM packet to process * @if_incoming: the interface where this packet was received * @orig_node: originator which reproadcasted the OGMs * @if_incoming_seqno: OGM sequence number when rebroadcast was received */ static void batadv_iv_ogm_process_reply(struct batadv_ogm_packet *ogm_packet, struct batadv_hard_iface *if_incoming, struct batadv_orig_node *orig_node, u32 if_incoming_seqno) { struct batadv_orig_ifinfo *orig_ifinfo; s32 bit_pos; u8 *weight; /* neighbor has to indicate direct link and it has to * come via the corresponding interface */ if (!(ogm_packet->flags & BATADV_DIRECTLINK)) return; if (!batadv_compare_eth(if_incoming->net_dev->dev_addr, ogm_packet->orig)) return; orig_ifinfo = batadv_orig_ifinfo_get(orig_node, if_incoming); if (!orig_ifinfo) return; /* save packet seqno for bidirectional check */ spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); bit_pos = if_incoming_seqno - 2; bit_pos -= ntohl(ogm_packet->seqno); batadv_set_bit(orig_ifinfo->bat_iv.bcast_own, bit_pos); weight = &orig_ifinfo->bat_iv.bcast_own_sum; *weight = bitmap_weight(orig_ifinfo->bat_iv.bcast_own, BATADV_TQ_LOCAL_WINDOW_SIZE); spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); batadv_orig_ifinfo_put(orig_ifinfo); } /** * batadv_iv_ogm_process() - process an incoming batman iv OGM * @skb: the skb containing the OGM * @ogm_offset: offset to the OGM which should be processed (for aggregates) * @if_incoming: the interface where this packet was received */ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset, struct batadv_hard_iface *if_incoming) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); struct batadv_orig_node *orig_neigh_node, *orig_node; struct batadv_hard_iface *hard_iface; struct batadv_ogm_packet *ogm_packet; u32 if_incoming_seqno; bool has_directlink_flag; struct ethhdr *ethhdr; bool is_my_oldorig = false; bool is_my_addr = false; bool is_my_orig = false; ogm_packet = (struct batadv_ogm_packet *)(skb->data + ogm_offset); ethhdr = eth_hdr(skb); /* Silently drop when the batman packet is actually not a * correct packet. * * This might happen if a packet is padded (e.g. Ethernet has a * minimum frame length of 64 byte) and the aggregation interprets * it as an additional length. * * TODO: A more sane solution would be to have a bit in the * batadv_ogm_packet to detect whether the packet is the last * packet in an aggregation. Here we expect that the padding * is always zero (or not 0x01) */ if (ogm_packet->packet_type != BATADV_IV_OGM) return; /* could be changed by schedule_own_packet() */ if_incoming_seqno = atomic_read(&if_incoming->bat_iv.ogm_seqno); if (ogm_packet->flags & BATADV_DIRECTLINK) has_directlink_flag = true; else has_directlink_flag = false; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Received BATMAN packet via NB: %pM, IF: %s [%pM] (from OG: %pM, via prev OG: %pM, seqno %u, tq %d, TTL %d, V %d, IDF %d)\n", ethhdr->h_source, if_incoming->net_dev->name, if_incoming->net_dev->dev_addr, ogm_packet->orig, ogm_packet->prev_sender, ntohl(ogm_packet->seqno), ogm_packet->tq, ogm_packet->ttl, ogm_packet->version, has_directlink_flag); rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->if_status != BATADV_IF_ACTIVE) continue; if (hard_iface->soft_iface != if_incoming->soft_iface) continue; if (batadv_compare_eth(ethhdr->h_source, hard_iface->net_dev->dev_addr)) is_my_addr = true; if (batadv_compare_eth(ogm_packet->orig, hard_iface->net_dev->dev_addr)) is_my_orig = true; if (batadv_compare_eth(ogm_packet->prev_sender, hard_iface->net_dev->dev_addr)) is_my_oldorig = true; } rcu_read_unlock(); if (is_my_addr) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: received my own broadcast (sender: %pM)\n", ethhdr->h_source); return; } if (is_my_orig) { orig_neigh_node = batadv_iv_ogm_orig_get(bat_priv, ethhdr->h_source); if (!orig_neigh_node) return; batadv_iv_ogm_process_reply(ogm_packet, if_incoming, orig_neigh_node, if_incoming_seqno); batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: originator packet from myself (via neighbor)\n"); batadv_orig_node_put(orig_neigh_node); return; } if (is_my_oldorig) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: ignoring all rebroadcast echos (sender: %pM)\n", ethhdr->h_source); return; } if (ogm_packet->flags & BATADV_NOT_BEST_NEXT_HOP) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: ignoring all packets not forwarded from the best next hop (sender: %pM)\n", ethhdr->h_source); return; } orig_node = batadv_iv_ogm_orig_get(bat_priv, ogm_packet->orig); if (!orig_node) return; batadv_iv_ogm_process_per_outif(skb, ogm_offset, orig_node, if_incoming, BATADV_IF_DEFAULT); rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->if_status != BATADV_IF_ACTIVE) continue; if (hard_iface->soft_iface != bat_priv->soft_iface) continue; if (!kref_get_unless_zero(&hard_iface->refcount)) continue; batadv_iv_ogm_process_per_outif(skb, ogm_offset, orig_node, if_incoming, hard_iface); batadv_hardif_put(hard_iface); } rcu_read_unlock(); batadv_orig_node_put(orig_node); } static void batadv_iv_send_outstanding_bat_ogm_packet(struct work_struct *work) { struct delayed_work *delayed_work; struct batadv_forw_packet *forw_packet; struct batadv_priv *bat_priv; bool dropped = false; delayed_work = to_delayed_work(work); forw_packet = container_of(delayed_work, struct batadv_forw_packet, delayed_work); bat_priv = netdev_priv(forw_packet->if_incoming->soft_iface); if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) { dropped = true; goto out; } batadv_iv_ogm_emit(forw_packet); /* we have to have at least one packet in the queue to determine the * queues wake up time unless we are shutting down. * * only re-schedule if this is the "original" copy, e.g. the OGM of the * primary interface should only be rescheduled once per period, but * this function will be called for the forw_packet instances of the * other secondary interfaces as well. */ if (forw_packet->own && forw_packet->if_incoming == forw_packet->if_outgoing) batadv_iv_ogm_schedule(forw_packet->if_incoming); out: /* do we get something for free()? */ if (batadv_forw_packet_steal(forw_packet, &bat_priv->forw_bat_list_lock)) batadv_forw_packet_free(forw_packet, dropped); } static int batadv_iv_ogm_receive(struct sk_buff *skb, struct batadv_hard_iface *if_incoming) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); struct batadv_ogm_packet *ogm_packet; u8 *packet_pos; int ogm_offset; bool res; int ret = NET_RX_DROP; res = batadv_check_management_packet(skb, if_incoming, BATADV_OGM_HLEN); if (!res) goto free_skb; /* did we receive a B.A.T.M.A.N. IV OGM packet on an interface * that does not have B.A.T.M.A.N. IV enabled ? */ if (bat_priv->algo_ops->iface.enable != batadv_iv_ogm_iface_enable) goto free_skb; batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_RX); batadv_add_counter(bat_priv, BATADV_CNT_MGMT_RX_BYTES, skb->len + ETH_HLEN); ogm_offset = 0; ogm_packet = (struct batadv_ogm_packet *)skb->data; /* unpack the aggregated packets and process them one by one */ while (batadv_iv_ogm_aggr_packet(ogm_offset, skb_headlen(skb), ogm_packet)) { batadv_iv_ogm_process(skb, ogm_offset, if_incoming); ogm_offset += BATADV_OGM_HLEN; ogm_offset += ntohs(ogm_packet->tvlv_len); packet_pos = skb->data + ogm_offset; ogm_packet = (struct batadv_ogm_packet *)packet_pos; } ret = NET_RX_SUCCESS; free_skb: if (ret == NET_RX_SUCCESS) consume_skb(skb); else kfree_skb(skb); return ret; } /** * batadv_iv_ogm_neigh_get_tq_avg() - Get the TQ average for a neighbour on a * given outgoing interface. * @neigh_node: Neighbour of interest * @if_outgoing: Outgoing interface of interest * @tq_avg: Pointer of where to store the TQ average * * Return: False if no average TQ available, otherwise true. */ static bool batadv_iv_ogm_neigh_get_tq_avg(struct batadv_neigh_node *neigh_node, struct batadv_hard_iface *if_outgoing, u8 *tq_avg) { struct batadv_neigh_ifinfo *n_ifinfo; n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); if (!n_ifinfo) return false; *tq_avg = n_ifinfo->bat_iv.tq_avg; batadv_neigh_ifinfo_put(n_ifinfo); return true; } /** * batadv_iv_ogm_orig_dump_subentry() - Dump an originator subentry into a * message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the soft interface information * @if_outgoing: Limit dump to entries with this outgoing interface * @orig_node: Originator to dump * @neigh_node: Single hops neighbour * @best: Is the best originator * * Return: Error code, or 0 on success */ static int batadv_iv_ogm_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing, struct batadv_orig_node *orig_node, struct batadv_neigh_node *neigh_node, bool best) { void *hdr; u8 tq_avg; unsigned int last_seen_msecs; last_seen_msecs = jiffies_to_msecs(jiffies - orig_node->last_seen); if (!batadv_iv_ogm_neigh_get_tq_avg(neigh_node, if_outgoing, &tq_avg)) return 0; if (if_outgoing != BATADV_IF_DEFAULT && if_outgoing != neigh_node->if_incoming) return 0; hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_ORIGINATORS); if (!hdr) return -ENOBUFS; if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, orig_node->orig) || nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, neigh_node->addr) || nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, neigh_node->if_incoming->net_dev->name) || nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, neigh_node->if_incoming->net_dev->ifindex) || nla_put_u8(msg, BATADV_ATTR_TQ, tq_avg) || nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, last_seen_msecs)) goto nla_put_failure; if (best && nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /** * batadv_iv_ogm_orig_dump_entry() - Dump an originator entry into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the soft interface information * @if_outgoing: Limit dump to entries with this outgoing interface * @orig_node: Originator to dump * @sub_s: Number of sub entries to skip * * This function assumes the caller holds rcu_read_lock(). * * Return: Error code, or 0 on success */ static int batadv_iv_ogm_orig_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing, struct batadv_orig_node *orig_node, int *sub_s) { struct batadv_neigh_node *neigh_node_best; struct batadv_neigh_node *neigh_node; int sub = 0; bool best; u8 tq_avg_best; neigh_node_best = batadv_orig_router_get(orig_node, if_outgoing); if (!neigh_node_best) goto out; if (!batadv_iv_ogm_neigh_get_tq_avg(neigh_node_best, if_outgoing, &tq_avg_best)) goto out; if (tq_avg_best == 0) goto out; hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) { if (sub++ < *sub_s) continue; best = (neigh_node == neigh_node_best); if (batadv_iv_ogm_orig_dump_subentry(msg, portid, seq, bat_priv, if_outgoing, orig_node, neigh_node, best)) { batadv_neigh_node_put(neigh_node_best); *sub_s = sub - 1; return -EMSGSIZE; } } out: batadv_neigh_node_put(neigh_node_best); *sub_s = 0; return 0; } /** * batadv_iv_ogm_orig_dump_bucket() - Dump an originator bucket into a * message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the soft interface information * @if_outgoing: Limit dump to entries with this outgoing interface * @head: Bucket to be dumped * @idx_s: Number of entries to be skipped * @sub: Number of sub entries to be skipped * * Return: Error code, or 0 on success */ static int batadv_iv_ogm_orig_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing, struct hlist_head *head, int *idx_s, int *sub) { struct batadv_orig_node *orig_node; int idx = 0; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, head, hash_entry) { if (idx++ < *idx_s) continue; if (batadv_iv_ogm_orig_dump_entry(msg, portid, seq, bat_priv, if_outgoing, orig_node, sub)) { rcu_read_unlock(); *idx_s = idx - 1; return -EMSGSIZE; } } rcu_read_unlock(); *idx_s = 0; *sub = 0; return 0; } /** * batadv_iv_ogm_orig_dump() - Dump the originators into a message * @msg: Netlink message to dump into * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information * @if_outgoing: Limit dump to entries with this outgoing interface */ static void batadv_iv_ogm_orig_dump(struct sk_buff *msg, struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing) { struct batadv_hashtable *hash = bat_priv->orig_hash; struct hlist_head *head; int bucket = cb->args[0]; int idx = cb->args[1]; int sub = cb->args[2]; int portid = NETLINK_CB(cb->skb).portid; while (bucket < hash->size) { head = &hash->table[bucket]; if (batadv_iv_ogm_orig_dump_bucket(msg, portid, cb->nlh->nlmsg_seq, bat_priv, if_outgoing, head, &idx, &sub)) break; bucket++; } cb->args[0] = bucket; cb->args[1] = idx; cb->args[2] = sub; } /** * batadv_iv_ogm_neigh_diff() - calculate tq difference of two neighbors * @neigh1: the first neighbor object of the comparison * @if_outgoing1: outgoing interface for the first neighbor * @neigh2: the second neighbor object of the comparison * @if_outgoing2: outgoing interface for the second neighbor * @diff: pointer to integer receiving the calculated difference * * The content of *@diff is only valid when this function returns true. * It is less, equal to or greater than 0 if the metric via neigh1 is lower, * the same as or higher than the metric via neigh2 * * Return: true when the difference could be calculated, false otherwise */ static bool batadv_iv_ogm_neigh_diff(struct batadv_neigh_node *neigh1, struct batadv_hard_iface *if_outgoing1, struct batadv_neigh_node *neigh2, struct batadv_hard_iface *if_outgoing2, int *diff) { struct batadv_neigh_ifinfo *neigh1_ifinfo, *neigh2_ifinfo; u8 tq1, tq2; bool ret = true; neigh1_ifinfo = batadv_neigh_ifinfo_get(neigh1, if_outgoing1); neigh2_ifinfo = batadv_neigh_ifinfo_get(neigh2, if_outgoing2); if (!neigh1_ifinfo || !neigh2_ifinfo) { ret = false; goto out; } tq1 = neigh1_ifinfo->bat_iv.tq_avg; tq2 = neigh2_ifinfo->bat_iv.tq_avg; *diff = (int)tq1 - (int)tq2; out: batadv_neigh_ifinfo_put(neigh1_ifinfo); batadv_neigh_ifinfo_put(neigh2_ifinfo); return ret; } /** * batadv_iv_ogm_neigh_dump_neigh() - Dump a neighbour into a netlink message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @hardif_neigh: Neighbour to be dumped * * Return: Error code, or 0 on success */ static int batadv_iv_ogm_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_hardif_neigh_node *hardif_neigh) { void *hdr; unsigned int last_seen_msecs; last_seen_msecs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen); hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_NEIGHBORS); if (!hdr) return -ENOBUFS; if (nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, hardif_neigh->addr) || nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, hardif_neigh->if_incoming->net_dev->name) || nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, hardif_neigh->if_incoming->net_dev->ifindex) || nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, last_seen_msecs)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /** * batadv_iv_ogm_neigh_dump_hardif() - Dump the neighbours of a hard interface * into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the soft interface information * @hard_iface: Hard interface to dump the neighbours for * @idx_s: Number of entries to skip * * This function assumes the caller holds rcu_read_lock(). * * Return: Error code, or 0 on success */ static int batadv_iv_ogm_neigh_dump_hardif(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct batadv_hard_iface *hard_iface, int *idx_s) { struct batadv_hardif_neigh_node *hardif_neigh; int idx = 0; hlist_for_each_entry_rcu(hardif_neigh, &hard_iface->neigh_list, list) { if (idx++ < *idx_s) continue; if (batadv_iv_ogm_neigh_dump_neigh(msg, portid, seq, hardif_neigh)) { *idx_s = idx - 1; return -EMSGSIZE; } } *idx_s = 0; return 0; } /** * batadv_iv_ogm_neigh_dump() - Dump the neighbours into a message * @msg: Netlink message to dump into * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information * @single_hardif: Limit dump to this hard interface */ static void batadv_iv_ogm_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_hard_iface *single_hardif) { struct batadv_hard_iface *hard_iface; int i_hardif = 0; int i_hardif_s = cb->args[0]; int idx = cb->args[1]; int portid = NETLINK_CB(cb->skb).portid; rcu_read_lock(); if (single_hardif) { if (i_hardif_s == 0) { if (batadv_iv_ogm_neigh_dump_hardif(msg, portid, cb->nlh->nlmsg_seq, bat_priv, single_hardif, &idx) == 0) i_hardif++; } } else { list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->soft_iface != bat_priv->soft_iface) continue; if (i_hardif++ < i_hardif_s) continue; if (batadv_iv_ogm_neigh_dump_hardif(msg, portid, cb->nlh->nlmsg_seq, bat_priv, hard_iface, &idx)) { i_hardif--; break; } } } rcu_read_unlock(); cb->args[0] = i_hardif; cb->args[1] = idx; } /** * batadv_iv_ogm_neigh_cmp() - compare the metrics of two neighbors * @neigh1: the first neighbor object of the comparison * @if_outgoing1: outgoing interface for the first neighbor * @neigh2: the second neighbor object of the comparison * @if_outgoing2: outgoing interface for the second neighbor * * Return: a value less, equal to or greater than 0 if the metric via neigh1 is * lower, the same as or higher than the metric via neigh2 */ static int batadv_iv_ogm_neigh_cmp(struct batadv_neigh_node *neigh1, struct batadv_hard_iface *if_outgoing1, struct batadv_neigh_node *neigh2, struct batadv_hard_iface *if_outgoing2) { bool ret; int diff; ret = batadv_iv_ogm_neigh_diff(neigh1, if_outgoing1, neigh2, if_outgoing2, &diff); if (!ret) return 0; return diff; } /** * batadv_iv_ogm_neigh_is_sob() - check if neigh1 is similarly good or better * than neigh2 from the metric prospective * @neigh1: the first neighbor object of the comparison * @if_outgoing1: outgoing interface for the first neighbor * @neigh2: the second neighbor object of the comparison * @if_outgoing2: outgoing interface for the second neighbor * * Return: true if the metric via neigh1 is equally good or better than * the metric via neigh2, false otherwise. */ static bool batadv_iv_ogm_neigh_is_sob(struct batadv_neigh_node *neigh1, struct batadv_hard_iface *if_outgoing1, struct batadv_neigh_node *neigh2, struct batadv_hard_iface *if_outgoing2) { bool ret; int diff; ret = batadv_iv_ogm_neigh_diff(neigh1, if_outgoing1, neigh2, if_outgoing2, &diff); if (!ret) return false; ret = diff > -BATADV_TQ_SIMILARITY_THRESHOLD; return ret; } static void batadv_iv_iface_enabled(struct batadv_hard_iface *hard_iface) { /* begin scheduling originator messages on that interface */ batadv_iv_ogm_schedule(hard_iface); } /** * batadv_iv_init_sel_class() - initialize GW selection class * @bat_priv: the bat priv with all the soft interface information */ static void batadv_iv_init_sel_class(struct batadv_priv *bat_priv) { /* set default TQ difference threshold to 20 */ atomic_set(&bat_priv->gw.sel_class, 20); } static struct batadv_gw_node * batadv_iv_gw_get_best_gw_node(struct batadv_priv *bat_priv) { struct batadv_neigh_node *router; struct batadv_neigh_ifinfo *router_ifinfo; struct batadv_gw_node *gw_node, *curr_gw = NULL; u64 max_gw_factor = 0; u64 tmp_gw_factor = 0; u8 max_tq = 0; u8 tq_avg; struct batadv_orig_node *orig_node; rcu_read_lock(); hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) { orig_node = gw_node->orig_node; router = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT); if (!router) continue; router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT); if (!router_ifinfo) goto next; if (!kref_get_unless_zero(&gw_node->refcount)) goto next; tq_avg = router_ifinfo->bat_iv.tq_avg; switch (atomic_read(&bat_priv->gw.sel_class)) { case 1: /* fast connection */ tmp_gw_factor = tq_avg * tq_avg; tmp_gw_factor *= gw_node->bandwidth_down; tmp_gw_factor *= 100 * 100; tmp_gw_factor >>= 18; if (tmp_gw_factor > max_gw_factor || (tmp_gw_factor == max_gw_factor && tq_avg > max_tq)) { batadv_gw_node_put(curr_gw); curr_gw = gw_node; kref_get(&curr_gw->refcount); } break; default: /* 2: stable connection (use best statistic) * 3: fast-switch (use best statistic but change as * soon as a better gateway appears) * XX: late-switch (use best statistic but change as * soon as a better gateway appears which has * $routing_class more tq points) */ if (tq_avg > max_tq) { batadv_gw_node_put(curr_gw); curr_gw = gw_node; kref_get(&curr_gw->refcount); } break; } if (tq_avg > max_tq) max_tq = tq_avg; if (tmp_gw_factor > max_gw_factor) max_gw_factor = tmp_gw_factor; batadv_gw_node_put(gw_node); next: batadv_neigh_node_put(router); batadv_neigh_ifinfo_put(router_ifinfo); } rcu_read_unlock(); return curr_gw; } static bool batadv_iv_gw_is_eligible(struct batadv_priv *bat_priv, struct batadv_orig_node *curr_gw_orig, struct batadv_orig_node *orig_node) { struct batadv_neigh_ifinfo *router_orig_ifinfo = NULL; struct batadv_neigh_ifinfo *router_gw_ifinfo = NULL; struct batadv_neigh_node *router_gw = NULL; struct batadv_neigh_node *router_orig = NULL; u8 gw_tq_avg, orig_tq_avg; bool ret = false; /* dynamic re-election is performed only on fast or late switch */ if (atomic_read(&bat_priv->gw.sel_class) <= 2) return false; router_gw = batadv_orig_router_get(curr_gw_orig, BATADV_IF_DEFAULT); if (!router_gw) { ret = true; goto out; } router_gw_ifinfo = batadv_neigh_ifinfo_get(router_gw, BATADV_IF_DEFAULT); if (!router_gw_ifinfo) { ret = true; goto out; } router_orig = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT); if (!router_orig) goto out; router_orig_ifinfo = batadv_neigh_ifinfo_get(router_orig, BATADV_IF_DEFAULT); if (!router_orig_ifinfo) goto out; gw_tq_avg = router_gw_ifinfo->bat_iv.tq_avg; orig_tq_avg = router_orig_ifinfo->bat_iv.tq_avg; /* the TQ value has to be better */ if (orig_tq_avg < gw_tq_avg) goto out; /* if the routing class is greater than 3 the value tells us how much * greater the TQ value of the new gateway must be */ if ((atomic_read(&bat_priv->gw.sel_class) > 3) && (orig_tq_avg - gw_tq_avg < atomic_read(&bat_priv->gw.sel_class))) goto out; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Restarting gateway selection: better gateway found (tq curr: %i, tq new: %i)\n", gw_tq_avg, orig_tq_avg); ret = true; out: batadv_neigh_ifinfo_put(router_gw_ifinfo); batadv_neigh_ifinfo_put(router_orig_ifinfo); batadv_neigh_node_put(router_gw); batadv_neigh_node_put(router_orig); return ret; } /** * batadv_iv_gw_dump_entry() - Dump a gateway into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information * @gw_node: Gateway to be dumped * * Return: Error code, or 0 on success */ static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_gw_node *gw_node) { struct batadv_neigh_ifinfo *router_ifinfo = NULL; struct batadv_neigh_node *router; struct batadv_gw_node *curr_gw = NULL; int ret = 0; void *hdr; router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT); if (!router) goto out; router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT); if (!router_ifinfo) goto out; curr_gw = batadv_gw_get_selected_gw_node(bat_priv); hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_GATEWAYS); if (!hdr) { ret = -ENOBUFS; goto out; } genl_dump_check_consistent(cb, hdr); ret = -EMSGSIZE; if (curr_gw == gw_node) if (nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) { genlmsg_cancel(msg, hdr); goto out; } if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, gw_node->orig_node->orig) || nla_put_u8(msg, BATADV_ATTR_TQ, router_ifinfo->bat_iv.tq_avg) || nla_put(msg, BATADV_ATTR_ROUTER, ETH_ALEN, router->addr) || nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, router->if_incoming->net_dev->name) || nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, router->if_incoming->net_dev->ifindex) || nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_DOWN, gw_node->bandwidth_down) || nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_UP, gw_node->bandwidth_up)) { genlmsg_cancel(msg, hdr); goto out; } genlmsg_end(msg, hdr); ret = 0; out: batadv_gw_node_put(curr_gw); batadv_neigh_ifinfo_put(router_ifinfo); batadv_neigh_node_put(router); return ret; } /** * batadv_iv_gw_dump() - Dump gateways into a message * @msg: Netlink message to dump into * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information */ static void batadv_iv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb, struct batadv_priv *bat_priv) { int portid = NETLINK_CB(cb->skb).portid; struct batadv_gw_node *gw_node; int idx_skip = cb->args[0]; int idx = 0; spin_lock_bh(&bat_priv->gw.list_lock); cb->seq = bat_priv->gw.generation << 1 | 1; hlist_for_each_entry(gw_node, &bat_priv->gw.gateway_list, list) { if (idx++ < idx_skip) continue; if (batadv_iv_gw_dump_entry(msg, portid, cb, bat_priv, gw_node)) { idx_skip = idx - 1; goto unlock; } } idx_skip = idx; unlock: spin_unlock_bh(&bat_priv->gw.list_lock); cb->args[0] = idx_skip; } static struct batadv_algo_ops batadv_batman_iv __read_mostly = { .name = "BATMAN_IV", .iface = { .enable = batadv_iv_ogm_iface_enable, .enabled = batadv_iv_iface_enabled, .disable = batadv_iv_ogm_iface_disable, .update_mac = batadv_iv_ogm_iface_update_mac, .primary_set = batadv_iv_ogm_primary_iface_set, }, .neigh = { .cmp = batadv_iv_ogm_neigh_cmp, .is_similar_or_better = batadv_iv_ogm_neigh_is_sob, .dump = batadv_iv_ogm_neigh_dump, }, .orig = { .dump = batadv_iv_ogm_orig_dump, }, .gw = { .init_sel_class = batadv_iv_init_sel_class, .sel_class_max = BATADV_TQ_MAX_VALUE, .get_best_gw_node = batadv_iv_gw_get_best_gw_node, .is_eligible = batadv_iv_gw_is_eligible, .dump = batadv_iv_gw_dump, }, }; /** * batadv_iv_init() - B.A.T.M.A.N. IV initialization function * * Return: 0 on success or negative error number in case of failure */ int __init batadv_iv_init(void) { int ret; /* batman originator packet */ ret = batadv_recv_handler_register(BATADV_IV_OGM, batadv_iv_ogm_receive); if (ret < 0) goto out; ret = batadv_algo_register(&batadv_batman_iv); if (ret < 0) goto handler_unregister; goto out; handler_unregister: batadv_recv_handler_unregister(BATADV_IV_OGM); out: return ret; }
101 101 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 // SPDX-License-Identifier: GPL-2.0 /* * Devices PM QoS constraints management * * Copyright (C) 2011 Texas Instruments, Inc. * * This module exposes the interface to kernel space for specifying * per-device PM QoS dependencies. It provides infrastructure for registration * of: * * Dependents on a QoS value : register requests * Watchers of QoS value : get notified when target QoS value changes * * This QoS design is best effort based. Dependents register their QoS needs. * Watchers register to keep track of the current QoS needs of the system. * Watchers can register a per-device notification callback using the * dev_pm_qos_*_notifier API. The notification chain data is stored in the * per-device constraint data struct. * * Note about the per-device constraint data struct allocation: * . The per-device constraints data struct ptr is stored into the device * dev_pm_info. * . To minimize the data usage by the per-device constraints, the data struct * is only allocated at the first call to dev_pm_qos_add_request. * . The data is later free'd when the device is removed from the system. * . A global mutex protects the constraints users from the data being * allocated and free'd. */ #include <linux/pm_qos.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/device.h> #include <linux/mutex.h> #include <linux/export.h> #include <linux/pm_runtime.h> #include <linux/err.h> #include <trace/events/power.h> #include "power.h" static DEFINE_MUTEX(dev_pm_qos_mtx); static DEFINE_MUTEX(dev_pm_qos_sysfs_mtx); /** * __dev_pm_qos_flags - Check PM QoS flags for a given device. * @dev: Device to check the PM QoS flags for. * @mask: Flags to check against. * * This routine must be called with dev->power.lock held. */ enum pm_qos_flags_status __dev_pm_qos_flags(struct device *dev, s32 mask) { struct dev_pm_qos *qos = dev->power.qos; struct pm_qos_flags *pqf; s32 val; lockdep_assert_held(&dev->power.lock); if (IS_ERR_OR_NULL(qos)) return PM_QOS_FLAGS_UNDEFINED; pqf = &qos->flags; if (list_empty(&pqf->list)) return PM_QOS_FLAGS_UNDEFINED; val = pqf->effective_flags & mask; if (val) return (val == mask) ? PM_QOS_FLAGS_ALL : PM_QOS_FLAGS_SOME; return PM_QOS_FLAGS_NONE; } /** * dev_pm_qos_flags - Check PM QoS flags for a given device (locked). * @dev: Device to check the PM QoS flags for. * @mask: Flags to check against. */ enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev, s32 mask) { unsigned long irqflags; enum pm_qos_flags_status ret; spin_lock_irqsave(&dev->power.lock, irqflags); ret = __dev_pm_qos_flags(dev, mask); spin_unlock_irqrestore(&dev->power.lock, irqflags); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_flags); /** * __dev_pm_qos_resume_latency - Get resume latency constraint for a given device. * @dev: Device to get the PM QoS constraint value for. * * This routine must be called with dev->power.lock held. */ s32 __dev_pm_qos_resume_latency(struct device *dev) { lockdep_assert_held(&dev->power.lock); return dev_pm_qos_raw_resume_latency(dev); } /** * dev_pm_qos_read_value - Get PM QoS constraint for a given device (locked). * @dev: Device to get the PM QoS constraint value for. * @type: QoS request type. */ s32 dev_pm_qos_read_value(struct device *dev, enum dev_pm_qos_req_type type) { struct dev_pm_qos *qos = dev->power.qos; unsigned long flags; s32 ret; spin_lock_irqsave(&dev->power.lock, flags); switch (type) { case DEV_PM_QOS_RESUME_LATENCY: ret = IS_ERR_OR_NULL(qos) ? PM_QOS_RESUME_LATENCY_NO_CONSTRAINT : pm_qos_read_value(&qos->resume_latency); break; case DEV_PM_QOS_MIN_FREQUENCY: ret = IS_ERR_OR_NULL(qos) ? PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE : freq_qos_read_value(&qos->freq, FREQ_QOS_MIN); break; case DEV_PM_QOS_MAX_FREQUENCY: ret = IS_ERR_OR_NULL(qos) ? PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE : freq_qos_read_value(&qos->freq, FREQ_QOS_MAX); break; default: WARN_ON(1); ret = 0; } spin_unlock_irqrestore(&dev->power.lock, flags); return ret; } /** * apply_constraint - Add/modify/remove device PM QoS request. * @req: Constraint request to apply * @action: Action to perform (add/update/remove). * @value: Value to assign to the QoS request. * * Internal function to update the constraints list using the PM QoS core * code and if needed call the per-device callbacks. */ static int apply_constraint(struct dev_pm_qos_request *req, enum pm_qos_req_action action, s32 value) { struct dev_pm_qos *qos = req->dev->power.qos; int ret; switch(req->type) { case DEV_PM_QOS_RESUME_LATENCY: if (WARN_ON(action != PM_QOS_REMOVE_REQ && value < 0)) value = 0; ret = pm_qos_update_target(&qos->resume_latency, &req->data.pnode, action, value); break; case DEV_PM_QOS_LATENCY_TOLERANCE: ret = pm_qos_update_target(&qos->latency_tolerance, &req->data.pnode, action, value); if (ret) { value = pm_qos_read_value(&qos->latency_tolerance); req->dev->power.set_latency_tolerance(req->dev, value); } break; case DEV_PM_QOS_MIN_FREQUENCY: case DEV_PM_QOS_MAX_FREQUENCY: ret = freq_qos_apply(&req->data.freq, action, value); break; case DEV_PM_QOS_FLAGS: ret = pm_qos_update_flags(&qos->flags, &req->data.flr, action, value); break; default: ret = -EINVAL; } return ret; } /* * dev_pm_qos_constraints_allocate * @dev: device to allocate data for * * Called at the first call to add_request, for constraint data allocation * Must be called with the dev_pm_qos_mtx mutex held */ static int dev_pm_qos_constraints_allocate(struct device *dev) { struct dev_pm_qos *qos; struct pm_qos_constraints *c; struct blocking_notifier_head *n; qos = kzalloc(sizeof(*qos), GFP_KERNEL); if (!qos) return -ENOMEM; n = kzalloc(3 * sizeof(*n), GFP_KERNEL); if (!n) { kfree(qos); return -ENOMEM; } c = &qos->resume_latency; plist_head_init(&c->list); c->target_value = PM_QOS_RESUME_LATENCY_DEFAULT_VALUE; c->default_value = PM_QOS_RESUME_LATENCY_DEFAULT_VALUE; c->no_constraint_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; c->type = PM_QOS_MIN; c->notifiers = n; BLOCKING_INIT_NOTIFIER_HEAD(n); c = &qos->latency_tolerance; plist_head_init(&c->list); c->target_value = PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE; c->default_value = PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE; c->no_constraint_value = PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT; c->type = PM_QOS_MIN; freq_constraints_init(&qos->freq); INIT_LIST_HEAD(&qos->flags.list); spin_lock_irq(&dev->power.lock); dev->power.qos = qos; spin_unlock_irq(&dev->power.lock); return 0; } static void __dev_pm_qos_hide_latency_limit(struct device *dev); static void __dev_pm_qos_hide_flags(struct device *dev); /** * dev_pm_qos_constraints_destroy * @dev: target device * * Called from the device PM subsystem on device removal under device_pm_lock(). */ void dev_pm_qos_constraints_destroy(struct device *dev) { struct dev_pm_qos *qos; struct dev_pm_qos_request *req, *tmp; struct pm_qos_constraints *c; struct pm_qos_flags *f; mutex_lock(&dev_pm_qos_sysfs_mtx); /* * If the device's PM QoS resume latency limit or PM QoS flags have been * exposed to user space, they have to be hidden at this point. */ pm_qos_sysfs_remove_resume_latency(dev); pm_qos_sysfs_remove_flags(dev); mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_hide_latency_limit(dev); __dev_pm_qos_hide_flags(dev); qos = dev->power.qos; if (!qos) goto out; /* Flush the constraints lists for the device. */ c = &qos->resume_latency; plist_for_each_entry_safe(req, tmp, &c->list, data.pnode) { /* * Update constraints list and call the notification * callbacks if needed */ apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } c = &qos->latency_tolerance; plist_for_each_entry_safe(req, tmp, &c->list, data.pnode) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } c = &qos->freq.min_freq; plist_for_each_entry_safe(req, tmp, &c->list, data.freq.pnode) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } c = &qos->freq.max_freq; plist_for_each_entry_safe(req, tmp, &c->list, data.freq.pnode) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } f = &qos->flags; list_for_each_entry_safe(req, tmp, &f->list, data.flr.node) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } spin_lock_irq(&dev->power.lock); dev->power.qos = ERR_PTR(-ENODEV); spin_unlock_irq(&dev->power.lock); kfree(qos->resume_latency.notifiers); kfree(qos); out: mutex_unlock(&dev_pm_qos_mtx); mutex_unlock(&dev_pm_qos_sysfs_mtx); } static bool dev_pm_qos_invalid_req_type(struct device *dev, enum dev_pm_qos_req_type type) { return type == DEV_PM_QOS_LATENCY_TOLERANCE && !dev->power.set_latency_tolerance; } static int __dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { int ret = 0; if (!dev || !req || dev_pm_qos_invalid_req_type(dev, type)) return -EINVAL; if (WARN(dev_pm_qos_request_active(req), "%s() called for already added request\n", __func__)) return -EINVAL; if (IS_ERR(dev->power.qos)) ret = -ENODEV; else if (!dev->power.qos) ret = dev_pm_qos_constraints_allocate(dev); trace_dev_pm_qos_add_request(dev_name(dev), type, value); if (ret) return ret; req->dev = dev; req->type = type; if (req->type == DEV_PM_QOS_MIN_FREQUENCY) ret = freq_qos_add_request(&dev->power.qos->freq, &req->data.freq, FREQ_QOS_MIN, value); else if (req->type == DEV_PM_QOS_MAX_FREQUENCY) ret = freq_qos_add_request(&dev->power.qos->freq, &req->data.freq, FREQ_QOS_MAX, value); else ret = apply_constraint(req, PM_QOS_ADD_REQ, value); return ret; } /** * dev_pm_qos_add_request - inserts new qos request into the list * @dev: target device for the constraint * @req: pointer to a preallocated handle * @type: type of the request * @value: defines the qos request * * This function inserts a new entry in the device constraints list of * requested qos performance characteristics. It recomputes the aggregate * QoS expectations of parameters and initializes the dev_pm_qos_request * handle. Caller needs to save this handle for later use in updates and * removal. * * Returns 1 if the aggregated constraint value has changed, * 0 if the aggregated constraint value has not changed, * -EINVAL in case of wrong parameters, -ENOMEM if there's not enough memory * to allocate for data structures, -ENODEV if the device has just been removed * from the system. * * Callers should ensure that the target device is not RPM_SUSPENDED before * using this function for requests of type DEV_PM_QOS_FLAGS. */ int dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { int ret; mutex_lock(&dev_pm_qos_mtx); ret = __dev_pm_qos_add_request(dev, req, type, value); mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_add_request); /** * __dev_pm_qos_update_request - Modify an existing device PM QoS request. * @req : PM QoS request to modify. * @new_value: New value to request. */ static int __dev_pm_qos_update_request(struct dev_pm_qos_request *req, s32 new_value) { s32 curr_value; int ret = 0; if (!req) /*guard against callers passing in null */ return -EINVAL; if (WARN(!dev_pm_qos_request_active(req), "%s() called for unknown object\n", __func__)) return -EINVAL; if (IS_ERR_OR_NULL(req->dev->power.qos)) return -ENODEV; switch(req->type) { case DEV_PM_QOS_RESUME_LATENCY: case DEV_PM_QOS_LATENCY_TOLERANCE: curr_value = req->data.pnode.prio; break; case DEV_PM_QOS_MIN_FREQUENCY: case DEV_PM_QOS_MAX_FREQUENCY: curr_value = req->data.freq.pnode.prio; break; case DEV_PM_QOS_FLAGS: curr_value = req->data.flr.flags; break; default: return -EINVAL; } trace_dev_pm_qos_update_request(dev_name(req->dev), req->type, new_value); if (curr_value != new_value) ret = apply_constraint(req, PM_QOS_UPDATE_REQ, new_value); return ret; } /** * dev_pm_qos_update_request - modifies an existing qos request * @req : handle to list element holding a dev_pm_qos request to use * @new_value: defines the qos request * * Updates an existing dev PM qos request along with updating the * target value. * * Attempts are made to make this code callable on hot code paths. * * Returns 1 if the aggregated constraint value has changed, * 0 if the aggregated constraint value has not changed, * -EINVAL in case of wrong parameters, -ENODEV if the device has been * removed from the system * * Callers should ensure that the target device is not RPM_SUSPENDED before * using this function for requests of type DEV_PM_QOS_FLAGS. */ int dev_pm_qos_update_request(struct dev_pm_qos_request *req, s32 new_value) { int ret; mutex_lock(&dev_pm_qos_mtx); ret = __dev_pm_qos_update_request(req, new_value); mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_update_request); static int __dev_pm_qos_remove_request(struct dev_pm_qos_request *req) { int ret; if (!req) /*guard against callers passing in null */ return -EINVAL; if (WARN(!dev_pm_qos_request_active(req), "%s() called for unknown object\n", __func__)) return -EINVAL; if (IS_ERR_OR_NULL(req->dev->power.qos)) return -ENODEV; trace_dev_pm_qos_remove_request(dev_name(req->dev), req->type, PM_QOS_DEFAULT_VALUE); ret = apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); return ret; } /** * dev_pm_qos_remove_request - modifies an existing qos request * @req: handle to request list element * * Will remove pm qos request from the list of constraints and * recompute the current target value. Call this on slow code paths. * * Returns 1 if the aggregated constraint value has changed, * 0 if the aggregated constraint value has not changed, * -EINVAL in case of wrong parameters, -ENODEV if the device has been * removed from the system * * Callers should ensure that the target device is not RPM_SUSPENDED before * using this function for requests of type DEV_PM_QOS_FLAGS. */ int dev_pm_qos_remove_request(struct dev_pm_qos_request *req) { int ret; mutex_lock(&dev_pm_qos_mtx); ret = __dev_pm_qos_remove_request(req); mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_remove_request); /** * dev_pm_qos_add_notifier - sets notification entry for changes to target value * of per-device PM QoS constraints * * @dev: target device for the constraint * @notifier: notifier block managed by caller. * @type: request type. * * Will register the notifier into a notification chain that gets called * upon changes to the target value for the device. * * If the device's constraints object doesn't exist when this routine is called, * it will be created (or error code will be returned if that fails). */ int dev_pm_qos_add_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type) { int ret = 0; mutex_lock(&dev_pm_qos_mtx); if (IS_ERR(dev->power.qos)) ret = -ENODEV; else if (!dev->power.qos) ret = dev_pm_qos_constraints_allocate(dev); if (ret) goto unlock; switch (type) { case DEV_PM_QOS_RESUME_LATENCY: ret = blocking_notifier_chain_register(dev->power.qos->resume_latency.notifiers, notifier); break; case DEV_PM_QOS_MIN_FREQUENCY: ret = freq_qos_add_notifier(&dev->power.qos->freq, FREQ_QOS_MIN, notifier); break; case DEV_PM_QOS_MAX_FREQUENCY: ret = freq_qos_add_notifier(&dev->power.qos->freq, FREQ_QOS_MAX, notifier); break; default: WARN_ON(1); ret = -EINVAL; } unlock: mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_add_notifier); /** * dev_pm_qos_remove_notifier - deletes notification for changes to target value * of per-device PM QoS constraints * * @dev: target device for the constraint * @notifier: notifier block to be removed. * @type: request type. * * Will remove the notifier from the notification chain that gets called * upon changes to the target value. */ int dev_pm_qos_remove_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type) { int ret = 0; mutex_lock(&dev_pm_qos_mtx); /* Silently return if the constraints object is not present. */ if (IS_ERR_OR_NULL(dev->power.qos)) goto unlock; switch (type) { case DEV_PM_QOS_RESUME_LATENCY: ret = blocking_notifier_chain_unregister(dev->power.qos->resume_latency.notifiers, notifier); break; case DEV_PM_QOS_MIN_FREQUENCY: ret = freq_qos_remove_notifier(&dev->power.qos->freq, FREQ_QOS_MIN, notifier); break; case DEV_PM_QOS_MAX_FREQUENCY: ret = freq_qos_remove_notifier(&dev->power.qos->freq, FREQ_QOS_MAX, notifier); break; default: WARN_ON(1); ret = -EINVAL; } unlock: mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_remove_notifier); /** * dev_pm_qos_add_ancestor_request - Add PM QoS request for device's ancestor. * @dev: Device whose ancestor to add the request for. * @req: Pointer to the preallocated handle. * @type: Type of the request. * @value: Constraint latency value. */ int dev_pm_qos_add_ancestor_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { struct device *ancestor = dev->parent; int ret = -ENODEV; switch (type) { case DEV_PM_QOS_RESUME_LATENCY: while (ancestor && !ancestor->power.ignore_children) ancestor = ancestor->parent; break; case DEV_PM_QOS_LATENCY_TOLERANCE: while (ancestor && !ancestor->power.set_latency_tolerance) ancestor = ancestor->parent; break; default: ancestor = NULL; } if (ancestor) ret = dev_pm_qos_add_request(ancestor, req, type, value); if (ret < 0) req->dev = NULL; return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_add_ancestor_request); static void __dev_pm_qos_drop_user_request(struct device *dev, enum dev_pm_qos_req_type type) { struct dev_pm_qos_request *req = NULL; switch(type) { case DEV_PM_QOS_RESUME_LATENCY: req = dev->power.qos->resume_latency_req; dev->power.qos->resume_latency_req = NULL; break; case DEV_PM_QOS_LATENCY_TOLERANCE: req = dev->power.qos->latency_tolerance_req; dev->power.qos->latency_tolerance_req = NULL; break; case DEV_PM_QOS_FLAGS: req = dev->power.qos->flags_req; dev->power.qos->flags_req = NULL; break; default: WARN_ON(1); return; } __dev_pm_qos_remove_request(req); kfree(req); } static void dev_pm_qos_drop_user_request(struct device *dev, enum dev_pm_qos_req_type type) { mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_drop_user_request(dev, type); mutex_unlock(&dev_pm_qos_mtx); } /** * dev_pm_qos_expose_latency_limit - Expose PM QoS latency limit to user space. * @dev: Device whose PM QoS latency limit is to be exposed to user space. * @value: Initial value of the latency limit. */ int dev_pm_qos_expose_latency_limit(struct device *dev, s32 value) { struct dev_pm_qos_request *req; int ret; if (!device_is_registered(dev) || value < 0) return -EINVAL; req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) return -ENOMEM; ret = dev_pm_qos_add_request(dev, req, DEV_PM_QOS_RESUME_LATENCY, value); if (ret < 0) { kfree(req); return ret; } mutex_lock(&dev_pm_qos_sysfs_mtx); mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos)) ret = -ENODEV; else if (dev->power.qos->resume_latency_req) ret = -EEXIST; if (ret < 0) { __dev_pm_qos_remove_request(req); kfree(req); mutex_unlock(&dev_pm_qos_mtx); goto out; } dev->power.qos->resume_latency_req = req; mutex_unlock(&dev_pm_qos_mtx); ret = pm_qos_sysfs_add_resume_latency(dev); if (ret) dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_RESUME_LATENCY); out: mutex_unlock(&dev_pm_qos_sysfs_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_expose_latency_limit); static void __dev_pm_qos_hide_latency_limit(struct device *dev) { if (!IS_ERR_OR_NULL(dev->power.qos) && dev->power.qos->resume_latency_req) __dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_RESUME_LATENCY); } /** * dev_pm_qos_hide_latency_limit - Hide PM QoS latency limit from user space. * @dev: Device whose PM QoS latency limit is to be hidden from user space. */ void dev_pm_qos_hide_latency_limit(struct device *dev) { mutex_lock(&dev_pm_qos_sysfs_mtx); pm_qos_sysfs_remove_resume_latency(dev); mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_hide_latency_limit(dev); mutex_unlock(&dev_pm_qos_mtx); mutex_unlock(&dev_pm_qos_sysfs_mtx); } EXPORT_SYMBOL_GPL(dev_pm_qos_hide_latency_limit); /** * dev_pm_qos_expose_flags - Expose PM QoS flags of a device to user space. * @dev: Device whose PM QoS flags are to be exposed to user space. * @val: Initial values of the flags. */ int dev_pm_qos_expose_flags(struct device *dev, s32 val) { struct dev_pm_qos_request *req; int ret; if (!device_is_registered(dev)) return -EINVAL; req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) return -ENOMEM; ret = dev_pm_qos_add_request(dev, req, DEV_PM_QOS_FLAGS, val); if (ret < 0) { kfree(req); return ret; } pm_runtime_get_sync(dev); mutex_lock(&dev_pm_qos_sysfs_mtx); mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos)) ret = -ENODEV; else if (dev->power.qos->flags_req) ret = -EEXIST; if (ret < 0) { __dev_pm_qos_remove_request(req); kfree(req); mutex_unlock(&dev_pm_qos_mtx); goto out; } dev->power.qos->flags_req = req; mutex_unlock(&dev_pm_qos_mtx); ret = pm_qos_sysfs_add_flags(dev); if (ret) dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_FLAGS); out: mutex_unlock(&dev_pm_qos_sysfs_mtx); pm_runtime_put(dev); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_expose_flags); static void __dev_pm_qos_hide_flags(struct device *dev) { if (!IS_ERR_OR_NULL(dev->power.qos) && dev->power.qos->flags_req) __dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_FLAGS); } /** * dev_pm_qos_hide_flags - Hide PM QoS flags of a device from user space. * @dev: Device whose PM QoS flags are to be hidden from user space. */ void dev_pm_qos_hide_flags(struct device *dev) { pm_runtime_get_sync(dev); mutex_lock(&dev_pm_qos_sysfs_mtx); pm_qos_sysfs_remove_flags(dev); mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_hide_flags(dev); mutex_unlock(&dev_pm_qos_mtx); mutex_unlock(&dev_pm_qos_sysfs_mtx); pm_runtime_put(dev); } EXPORT_SYMBOL_GPL(dev_pm_qos_hide_flags); /** * dev_pm_qos_update_flags - Update PM QoS flags request owned by user space. * @dev: Device to update the PM QoS flags request for. * @mask: Flags to set/clear. * @set: Whether to set or clear the flags (true means set). */ int dev_pm_qos_update_flags(struct device *dev, s32 mask, bool set) { s32 value; int ret; pm_runtime_get_sync(dev); mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos) || !dev->power.qos->flags_req) { ret = -EINVAL; goto out; } value = dev_pm_qos_requested_flags(dev); if (set) value |= mask; else value &= ~mask; ret = __dev_pm_qos_update_request(dev->power.qos->flags_req, value); out: mutex_unlock(&dev_pm_qos_mtx); pm_runtime_put(dev); return ret; } /** * dev_pm_qos_get_user_latency_tolerance - Get user space latency tolerance. * @dev: Device to obtain the user space latency tolerance for. */ s32 dev_pm_qos_get_user_latency_tolerance(struct device *dev) { s32 ret; mutex_lock(&dev_pm_qos_mtx); ret = IS_ERR_OR_NULL(dev->power.qos) || !dev->power.qos->latency_tolerance_req ? PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT : dev->power.qos->latency_tolerance_req->data.pnode.prio; mutex_unlock(&dev_pm_qos_mtx); return ret; } /** * dev_pm_qos_update_user_latency_tolerance - Update user space latency tolerance. * @dev: Device to update the user space latency tolerance for. * @val: New user space latency tolerance for @dev (negative values disable). */ int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val) { int ret; mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos) || !dev->power.qos->latency_tolerance_req) { struct dev_pm_qos_request *req; if (val < 0) { if (val == PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT) ret = 0; else ret = -EINVAL; goto out; } req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) { ret = -ENOMEM; goto out; } ret = __dev_pm_qos_add_request(dev, req, DEV_PM_QOS_LATENCY_TOLERANCE, val); if (ret < 0) { kfree(req); goto out; } dev->power.qos->latency_tolerance_req = req; } else { if (val < 0) { __dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_LATENCY_TOLERANCE); ret = 0; } else { ret = __dev_pm_qos_update_request(dev->power.qos->latency_tolerance_req, val); } } out: mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_update_user_latency_tolerance); /** * dev_pm_qos_expose_latency_tolerance - Expose latency tolerance to userspace * @dev: Device whose latency tolerance to expose */ int dev_pm_qos_expose_latency_tolerance(struct device *dev) { int ret; if (!dev->power.set_latency_tolerance) return -EINVAL; mutex_lock(&dev_pm_qos_sysfs_mtx); ret = pm_qos_sysfs_add_latency_tolerance(dev); mutex_unlock(&dev_pm_qos_sysfs_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_expose_latency_tolerance); /** * dev_pm_qos_hide_latency_tolerance - Hide latency tolerance from userspace * @dev: Device whose latency tolerance to hide */ void dev_pm_qos_hide_latency_tolerance(struct device *dev) { mutex_lock(&dev_pm_qos_sysfs_mtx); pm_qos_sysfs_remove_latency_tolerance(dev); mutex_unlock(&dev_pm_qos_sysfs_mtx); /* Remove the request from user space now */ pm_runtime_get_sync(dev); dev_pm_qos_update_user_latency_tolerance(dev, PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT); pm_runtime_put(dev); } EXPORT_SYMBOL_GPL(dev_pm_qos_hide_latency_tolerance);
2 7531 5092 2269 3014 1271 37 1 37 880 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM timer #if !defined(_TRACE_TIMER_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_TIMER_H #include <linux/tracepoint.h> #include <linux/hrtimer.h> #include <linux/timer.h> DECLARE_EVENT_CLASS(timer_class, TP_PROTO(struct timer_list *timer), TP_ARGS(timer), TP_STRUCT__entry( __field( void *, timer ) ), TP_fast_assign( __entry->timer = timer; ), TP_printk("timer=%p", __entry->timer) ); /** * timer_init - called when the timer is initialized * @timer: pointer to struct timer_list */ DEFINE_EVENT(timer_class, timer_init, TP_PROTO(struct timer_list *timer), TP_ARGS(timer) ); #define decode_timer_flags(flags) \ __print_flags(flags, "|", \ { TIMER_MIGRATING, "M" }, \ { TIMER_DEFERRABLE, "D" }, \ { TIMER_PINNED, "P" }, \ { TIMER_IRQSAFE, "I" }) /** * timer_start - called when the timer is started * @timer: pointer to struct timer_list * @expires: the timers expiry time * @flags: the timers flags */ TRACE_EVENT(timer_start, TP_PROTO(struct timer_list *timer, unsigned long expires, unsigned int flags), TP_ARGS(timer, expires, flags), TP_STRUCT__entry( __field( void *, timer ) __field( void *, function ) __field( unsigned long, expires ) __field( unsigned long, now ) __field( unsigned int, flags ) ), TP_fast_assign( __entry->timer = timer; __entry->function = timer->function; __entry->expires = expires; __entry->now = jiffies; __entry->flags = flags; ), TP_printk("timer=%p function=%ps expires=%lu [timeout=%ld] cpu=%u idx=%u flags=%s", __entry->timer, __entry->function, __entry->expires, (long)__entry->expires - __entry->now, __entry->flags & TIMER_CPUMASK, __entry->flags >> TIMER_ARRAYSHIFT, decode_timer_flags(__entry->flags & TIMER_TRACE_FLAGMASK)) ); /** * timer_expire_entry - called immediately before the timer callback * @timer: pointer to struct timer_list * @baseclk: value of timer_base::clk when timer expires * * Allows to determine the timer latency. */ TRACE_EVENT(timer_expire_entry, TP_PROTO(struct timer_list *timer, unsigned long baseclk), TP_ARGS(timer, baseclk), TP_STRUCT__entry( __field( void *, timer ) __field( unsigned long, now ) __field( void *, function) __field( unsigned long, baseclk ) ), TP_fast_assign( __entry->timer = timer; __entry->now = jiffies; __entry->function = timer->function; __entry->baseclk = baseclk; ), TP_printk("timer=%p function=%ps now=%lu baseclk=%lu", __entry->timer, __entry->function, __entry->now, __entry->baseclk) ); /** * timer_expire_exit - called immediately after the timer callback returns * @timer: pointer to struct timer_list * * When used in combination with the timer_expire_entry tracepoint we can * determine the runtime of the timer callback function. * * NOTE: Do NOT dereference timer in TP_fast_assign. The pointer might * be invalid. We solely track the pointer. */ DEFINE_EVENT(timer_class, timer_expire_exit, TP_PROTO(struct timer_list *timer), TP_ARGS(timer) ); /** * timer_cancel - called when the timer is canceled * @timer: pointer to struct timer_list */ DEFINE_EVENT(timer_class, timer_cancel, TP_PROTO(struct timer_list *timer), TP_ARGS(timer) ); #define decode_clockid(type) \ __print_symbolic(type, \ { CLOCK_REALTIME, "CLOCK_REALTIME" }, \ { CLOCK_MONOTONIC, "CLOCK_MONOTONIC" }, \ { CLOCK_BOOTTIME, "CLOCK_BOOTTIME" }, \ { CLOCK_TAI, "CLOCK_TAI" }) #define decode_hrtimer_mode(mode) \ __print_symbolic(mode, \ { HRTIMER_MODE_ABS, "ABS" }, \ { HRTIMER_MODE_REL, "REL" }, \ { HRTIMER_MODE_ABS_PINNED, "ABS|PINNED" }, \ { HRTIMER_MODE_REL_PINNED, "REL|PINNED" }, \ { HRTIMER_MODE_ABS_SOFT, "ABS|SOFT" }, \ { HRTIMER_MODE_REL_SOFT, "REL|SOFT" }, \ { HRTIMER_MODE_ABS_PINNED_SOFT, "ABS|PINNED|SOFT" }, \ { HRTIMER_MODE_REL_PINNED_SOFT, "REL|PINNED|SOFT" }, \ { HRTIMER_MODE_ABS_HARD, "ABS|HARD" }, \ { HRTIMER_MODE_REL_HARD, "REL|HARD" }, \ { HRTIMER_MODE_ABS_PINNED_HARD, "ABS|PINNED|HARD" }, \ { HRTIMER_MODE_REL_PINNED_HARD, "REL|PINNED|HARD" }) /** * hrtimer_init - called when the hrtimer is initialized * @hrtimer: pointer to struct hrtimer * @clockid: the hrtimers clock * @mode: the hrtimers mode */ TRACE_EVENT(hrtimer_init, TP_PROTO(struct hrtimer *hrtimer, clockid_t clockid, enum hrtimer_mode mode), TP_ARGS(hrtimer, clockid, mode), TP_STRUCT__entry( __field( void *, hrtimer ) __field( clockid_t, clockid ) __field( enum hrtimer_mode, mode ) ), TP_fast_assign( __entry->hrtimer = hrtimer; __entry->clockid = clockid; __entry->mode = mode; ), TP_printk("hrtimer=%p clockid=%s mode=%s", __entry->hrtimer, decode_clockid(__entry->clockid), decode_hrtimer_mode(__entry->mode)) ); /** * hrtimer_start - called when the hrtimer is started * @hrtimer: pointer to struct hrtimer * @mode: the hrtimers mode */ TRACE_EVENT(hrtimer_start, TP_PROTO(struct hrtimer *hrtimer, enum hrtimer_mode mode), TP_ARGS(hrtimer, mode), TP_STRUCT__entry( __field( void *, hrtimer ) __field( void *, function ) __field( s64, expires ) __field( s64, softexpires ) __field( enum hrtimer_mode, mode ) ), TP_fast_assign( __entry->hrtimer = hrtimer; __entry->function = hrtimer->function; __entry->expires = hrtimer_get_expires(hrtimer); __entry->softexpires = hrtimer_get_softexpires(hrtimer); __entry->mode = mode; ), TP_printk("hrtimer=%p function=%ps expires=%llu softexpires=%llu " "mode=%s", __entry->hrtimer, __entry->function, (unsigned long long) __entry->expires, (unsigned long long) __entry->softexpires, decode_hrtimer_mode(__entry->mode)) ); /** * hrtimer_expire_entry - called immediately before the hrtimer callback * @hrtimer: pointer to struct hrtimer * @now: pointer to variable which contains current time of the * timers base. * * Allows to determine the timer latency. */ TRACE_EVENT(hrtimer_expire_entry, TP_PROTO(struct hrtimer *hrtimer, ktime_t *now), TP_ARGS(hrtimer, now), TP_STRUCT__entry( __field( void *, hrtimer ) __field( s64, now ) __field( void *, function) ), TP_fast_assign( __entry->hrtimer = hrtimer; __entry->now = *now; __entry->function = hrtimer->function; ), TP_printk("hrtimer=%p function=%ps now=%llu", __entry->hrtimer, __entry->function, (unsigned long long) __entry->now) ); DECLARE_EVENT_CLASS(hrtimer_class, TP_PROTO(struct hrtimer *hrtimer), TP_ARGS(hrtimer), TP_STRUCT__entry( __field( void *, hrtimer ) ), TP_fast_assign( __entry->hrtimer = hrtimer; ), TP_printk("hrtimer=%p", __entry->hrtimer) ); /** * hrtimer_expire_exit - called immediately after the hrtimer callback returns * @hrtimer: pointer to struct hrtimer * * When used in combination with the hrtimer_expire_entry tracepoint we can * determine the runtime of the callback function. */ DEFINE_EVENT(hrtimer_class, hrtimer_expire_exit, TP_PROTO(struct hrtimer *hrtimer), TP_ARGS(hrtimer) ); /** * hrtimer_cancel - called when the hrtimer is canceled * @hrtimer: pointer to struct hrtimer */ DEFINE_EVENT(hrtimer_class, hrtimer_cancel, TP_PROTO(struct hrtimer *hrtimer), TP_ARGS(hrtimer) ); /** * itimer_state - called when itimer is started or canceled * @which: name of the interval timer * @value: the itimers value, itimer is canceled if value->it_value is * zero, otherwise it is started * @expires: the itimers expiry time */ TRACE_EVENT(itimer_state, TP_PROTO(int which, const struct itimerspec64 *const value, unsigned long long expires), TP_ARGS(which, value, expires), TP_STRUCT__entry( __field( int, which ) __field( unsigned long long, expires ) __field( long, value_sec ) __field( long, value_nsec ) __field( long, interval_sec ) __field( long, interval_nsec ) ), TP_fast_assign( __entry->which = which; __entry->expires = expires; __entry->value_sec = value->it_value.tv_sec; __entry->value_nsec = value->it_value.tv_nsec; __entry->interval_sec = value->it_interval.tv_sec; __entry->interval_nsec = value->it_interval.tv_nsec; ), TP_printk("which=%d expires=%llu it_value=%ld.%06ld it_interval=%ld.%06ld", __entry->which, __entry->expires, __entry->value_sec, __entry->value_nsec / NSEC_PER_USEC, __entry->interval_sec, __entry->interval_nsec / NSEC_PER_USEC) ); /** * itimer_expire - called when itimer expires * @which: type of the interval timer * @pid: pid of the process which owns the timer * @now: current time, used to calculate the latency of itimer */ TRACE_EVENT(itimer_expire, TP_PROTO(int which, struct pid *pid, unsigned long long now), TP_ARGS(which, pid, now), TP_STRUCT__entry( __field( int , which ) __field( pid_t, pid ) __field( unsigned long long, now ) ), TP_fast_assign( __entry->which = which; __entry->now = now; __entry->pid = pid_nr(pid); ), TP_printk("which=%d pid=%d now=%llu", __entry->which, (int) __entry->pid, __entry->now) ); #ifdef CONFIG_NO_HZ_COMMON #define TICK_DEP_NAMES \ tick_dep_mask_name(NONE) \ tick_dep_name(POSIX_TIMER) \ tick_dep_name(PERF_EVENTS) \ tick_dep_name(SCHED) \ tick_dep_name(CLOCK_UNSTABLE) \ tick_dep_name(RCU) \ tick_dep_name_end(RCU_EXP) #undef tick_dep_name #undef tick_dep_mask_name #undef tick_dep_name_end /* The MASK will convert to their bits and they need to be processed too */ #define tick_dep_name(sdep) TRACE_DEFINE_ENUM(TICK_DEP_BIT_##sdep); \ TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep); #define tick_dep_name_end(sdep) TRACE_DEFINE_ENUM(TICK_DEP_BIT_##sdep); \ TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep); /* NONE only has a mask defined for it */ #define tick_dep_mask_name(sdep) TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep); TICK_DEP_NAMES #undef tick_dep_name #undef tick_dep_mask_name #undef tick_dep_name_end #define tick_dep_name(sdep) { TICK_DEP_MASK_##sdep, #sdep }, #define tick_dep_mask_name(sdep) { TICK_DEP_MASK_##sdep, #sdep }, #define tick_dep_name_end(sdep) { TICK_DEP_MASK_##sdep, #sdep } #define show_tick_dep_name(val) \ __print_symbolic(val, TICK_DEP_NAMES) TRACE_EVENT(tick_stop, TP_PROTO(int success, int dependency), TP_ARGS(success, dependency), TP_STRUCT__entry( __field( int , success ) __field( int , dependency ) ), TP_fast_assign( __entry->success = success; __entry->dependency = dependency; ), TP_printk("success=%d dependency=%s", __entry->success, \ show_tick_dep_name(__entry->dependency)) ); #endif #endif /* _TRACE_TIMER_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
148 1549 1550 148 148 140 26 26 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2005 Voltaire Inc. All rights reserved. * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. * Copyright (c) 1999-2019, Mellanox Technologies, Inc. All rights reserved. * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. */ #include <linux/completion.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/mutex.h> #include <linux/random.h> #include <linux/rbtree.h> #include <linux/igmp.h> #include <linux/xarray.h> #include <linux/inetdevice.h> #include <linux/slab.h> #include <linux/module.h> #include <net/route.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/netevent.h> #include <net/tcp.h> #include <net/ipv6.h> #include <net/ip_fib.h> #include <net/ip6_route.h> #include <rdma/rdma_cm.h> #include <rdma/rdma_cm_ib.h> #include <rdma/rdma_netlink.h> #include <rdma/ib.h> #include <rdma/ib_cache.h> #include <rdma/ib_cm.h> #include <rdma/ib_sa.h> #include <rdma/iw_cm.h> #include "core_priv.h" #include "cma_priv.h" #include "cma_trace.h" MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("Generic RDMA CM Agent"); MODULE_LICENSE("Dual BSD/GPL"); #define CMA_CM_RESPONSE_TIMEOUT 20 #define CMA_MAX_CM_RETRIES 15 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) #define CMA_IBOE_PACKET_LIFETIME 16 #define CMA_PREFERRED_ROCE_GID_TYPE IB_GID_TYPE_ROCE_UDP_ENCAP static const char * const cma_events[] = { [RDMA_CM_EVENT_ADDR_RESOLVED] = "address resolved", [RDMA_CM_EVENT_ADDR_ERROR] = "address error", [RDMA_CM_EVENT_ROUTE_RESOLVED] = "route resolved ", [RDMA_CM_EVENT_ROUTE_ERROR] = "route error", [RDMA_CM_EVENT_CONNECT_REQUEST] = "connect request", [RDMA_CM_EVENT_CONNECT_RESPONSE] = "connect response", [RDMA_CM_EVENT_CONNECT_ERROR] = "connect error", [RDMA_CM_EVENT_UNREACHABLE] = "unreachable", [RDMA_CM_EVENT_REJECTED] = "rejected", [RDMA_CM_EVENT_ESTABLISHED] = "established", [RDMA_CM_EVENT_DISCONNECTED] = "disconnected", [RDMA_CM_EVENT_DEVICE_REMOVAL] = "device removal", [RDMA_CM_EVENT_MULTICAST_JOIN] = "multicast join", [RDMA_CM_EVENT_MULTICAST_ERROR] = "multicast error", [RDMA_CM_EVENT_ADDR_CHANGE] = "address change", [RDMA_CM_EVENT_TIMEWAIT_EXIT] = "timewait exit", }; static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, enum ib_gid_type gid_type); const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event) { size_t index = event; return (index < ARRAY_SIZE(cma_events) && cma_events[index]) ? cma_events[index] : "unrecognized event"; } EXPORT_SYMBOL(rdma_event_msg); const char *__attribute_const__ rdma_reject_msg(struct rdma_cm_id *id, int reason) { if (rdma_ib_or_roce(id->device, id->port_num)) return ibcm_reject_msg(reason); if (rdma_protocol_iwarp(id->device, id->port_num)) return iwcm_reject_msg(reason); WARN_ON_ONCE(1); return "unrecognized transport"; } EXPORT_SYMBOL(rdma_reject_msg); /** * rdma_is_consumer_reject - return true if the consumer rejected the connect * request. * @id: Communication identifier that received the REJECT event. * @reason: Value returned in the REJECT event status field. */ static bool rdma_is_consumer_reject(struct rdma_cm_id *id, int reason) { if (rdma_ib_or_roce(id->device, id->port_num)) return reason == IB_CM_REJ_CONSUMER_DEFINED; if (rdma_protocol_iwarp(id->device, id->port_num)) return reason == -ECONNREFUSED; WARN_ON_ONCE(1); return false; } const void *rdma_consumer_reject_data(struct rdma_cm_id *id, struct rdma_cm_event *ev, u8 *data_len) { const void *p; if (rdma_is_consumer_reject(id, ev->status)) { *data_len = ev->param.conn.private_data_len; p = ev->param.conn.private_data; } else { *data_len = 0; p = NULL; } return p; } EXPORT_SYMBOL(rdma_consumer_reject_data); /** * rdma_iw_cm_id() - return the iw_cm_id pointer for this cm_id. * @id: Communication Identifier */ struct iw_cm_id *rdma_iw_cm_id(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); if (id->device->node_type == RDMA_NODE_RNIC) return id_priv->cm_id.iw; return NULL; } EXPORT_SYMBOL(rdma_iw_cm_id); /** * rdma_res_to_id() - return the rdma_cm_id pointer for this restrack. * @res: rdma resource tracking entry pointer */ struct rdma_cm_id *rdma_res_to_id(struct rdma_restrack_entry *res) { struct rdma_id_private *id_priv = container_of(res, struct rdma_id_private, res); return &id_priv->id; } EXPORT_SYMBOL(rdma_res_to_id); static int cma_add_one(struct ib_device *device); static void cma_remove_one(struct ib_device *device, void *client_data); static struct ib_client cma_client = { .name = "cma", .add = cma_add_one, .remove = cma_remove_one }; static struct ib_sa_client sa_client; static LIST_HEAD(dev_list); static LIST_HEAD(listen_any_list); static DEFINE_MUTEX(lock); static struct rb_root id_table = RB_ROOT; /* Serialize operations of id_table tree */ static DEFINE_SPINLOCK(id_table_lock); static struct workqueue_struct *cma_wq; static unsigned int cma_pernet_id; struct cma_pernet { struct xarray tcp_ps; struct xarray udp_ps; struct xarray ipoib_ps; struct xarray ib_ps; }; static struct cma_pernet *cma_pernet(struct net *net) { return net_generic(net, cma_pernet_id); } static struct xarray *cma_pernet_xa(struct net *net, enum rdma_ucm_port_space ps) { struct cma_pernet *pernet = cma_pernet(net); switch (ps) { case RDMA_PS_TCP: return &pernet->tcp_ps; case RDMA_PS_UDP: return &pernet->udp_ps; case RDMA_PS_IPOIB: return &pernet->ipoib_ps; case RDMA_PS_IB: return &pernet->ib_ps; default: return NULL; } } struct id_table_entry { struct list_head id_list; struct rb_node rb_node; }; struct cma_device { struct list_head list; struct ib_device *device; struct completion comp; refcount_t refcount; struct list_head id_list; enum ib_gid_type *default_gid_type; u8 *default_roce_tos; }; struct rdma_bind_list { enum rdma_ucm_port_space ps; struct hlist_head owners; unsigned short port; }; static int cma_ps_alloc(struct net *net, enum rdma_ucm_port_space ps, struct rdma_bind_list *bind_list, int snum) { struct xarray *xa = cma_pernet_xa(net, ps); return xa_insert(xa, snum, bind_list, GFP_KERNEL); } static struct rdma_bind_list *cma_ps_find(struct net *net, enum rdma_ucm_port_space ps, int snum) { struct xarray *xa = cma_pernet_xa(net, ps); return xa_load(xa, snum); } static void cma_ps_remove(struct net *net, enum rdma_ucm_port_space ps, int snum) { struct xarray *xa = cma_pernet_xa(net, ps); xa_erase(xa, snum); } enum { CMA_OPTION_AFONLY, }; void cma_dev_get(struct cma_device *cma_dev) { refcount_inc(&cma_dev->refcount); } void cma_dev_put(struct cma_device *cma_dev) { if (refcount_dec_and_test(&cma_dev->refcount)) complete(&cma_dev->comp); } struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter, void *cookie) { struct cma_device *cma_dev; struct cma_device *found_cma_dev = NULL; mutex_lock(&lock); list_for_each_entry(cma_dev, &dev_list, list) if (filter(cma_dev->device, cookie)) { found_cma_dev = cma_dev; break; } if (found_cma_dev) cma_dev_get(found_cma_dev); mutex_unlock(&lock); return found_cma_dev; } int cma_get_default_gid_type(struct cma_device *cma_dev, u32 port) { if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; return cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)]; } int cma_set_default_gid_type(struct cma_device *cma_dev, u32 port, enum ib_gid_type default_gid_type) { unsigned long supported_gids; if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; if (default_gid_type == IB_GID_TYPE_IB && rdma_protocol_roce_eth_encap(cma_dev->device, port)) default_gid_type = IB_GID_TYPE_ROCE; supported_gids = roce_gid_type_mask_support(cma_dev->device, port); if (!(supported_gids & 1 << default_gid_type)) return -EINVAL; cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)] = default_gid_type; return 0; } int cma_get_default_roce_tos(struct cma_device *cma_dev, u32 port) { if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; return cma_dev->default_roce_tos[port - rdma_start_port(cma_dev->device)]; } int cma_set_default_roce_tos(struct cma_device *cma_dev, u32 port, u8 default_roce_tos) { if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; cma_dev->default_roce_tos[port - rdma_start_port(cma_dev->device)] = default_roce_tos; return 0; } struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev) { return cma_dev->device; } /* * Device removal can occur at anytime, so we need extra handling to * serialize notifying the user of device removal with other callbacks. * We do this by disabling removal notification while a callback is in process, * and reporting it after the callback completes. */ struct cma_multicast { struct rdma_id_private *id_priv; union { struct ib_sa_multicast *sa_mc; struct { struct work_struct work; struct rdma_cm_event event; } iboe_join; }; struct list_head list; void *context; struct sockaddr_storage addr; u8 join_state; }; struct cma_work { struct work_struct work; struct rdma_id_private *id; enum rdma_cm_state old_state; enum rdma_cm_state new_state; struct rdma_cm_event event; }; union cma_ip_addr { struct in6_addr ip6; struct { __be32 pad[3]; __be32 addr; } ip4; }; struct cma_hdr { u8 cma_version; u8 ip_version; /* IP version: 7:4 */ __be16 port; union cma_ip_addr src_addr; union cma_ip_addr dst_addr; }; #define CMA_VERSION 0x00 struct cma_req_info { struct sockaddr_storage listen_addr_storage; struct sockaddr_storage src_addr_storage; struct ib_device *device; union ib_gid local_gid; __be64 service_id; int port; bool has_gid; u16 pkey; }; static int cma_comp_exch(struct rdma_id_private *id_priv, enum rdma_cm_state comp, enum rdma_cm_state exch) { unsigned long flags; int ret; /* * The FSM uses a funny double locking where state is protected by both * the handler_mutex and the spinlock. State is not allowed to change * to/from a handler_mutex protected value without also holding * handler_mutex. */ if (comp == RDMA_CM_CONNECT || exch == RDMA_CM_CONNECT) lockdep_assert_held(&id_priv->handler_mutex); spin_lock_irqsave(&id_priv->lock, flags); if ((ret = (id_priv->state == comp))) id_priv->state = exch; spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr) { return hdr->ip_version >> 4; } static void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver) { hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF); } static struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv) { return (struct sockaddr *)&id_priv->id.route.addr.src_addr; } static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv) { return (struct sockaddr *)&id_priv->id.route.addr.dst_addr; } static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join) { struct in_device *in_dev = NULL; if (ndev) { rtnl_lock(); in_dev = __in_dev_get_rtnl(ndev); if (in_dev) { if (join) ip_mc_inc_group(in_dev, *(__be32 *)(mgid->raw + 12)); else ip_mc_dec_group(in_dev, *(__be32 *)(mgid->raw + 12)); } rtnl_unlock(); } return (in_dev) ? 0 : -ENODEV; } static int compare_netdev_and_ip(int ifindex_a, struct sockaddr *sa, struct id_table_entry *entry_b) { struct rdma_id_private *id_priv = list_first_entry( &entry_b->id_list, struct rdma_id_private, id_list_entry); int ifindex_b = id_priv->id.route.addr.dev_addr.bound_dev_if; struct sockaddr *sb = cma_dst_addr(id_priv); if (ifindex_a != ifindex_b) return (ifindex_a > ifindex_b) ? 1 : -1; if (sa->sa_family != sb->sa_family) return sa->sa_family - sb->sa_family; if (sa->sa_family == AF_INET && __builtin_object_size(sa, 0) >= sizeof(struct sockaddr_in)) { return memcmp(&((struct sockaddr_in *)sa)->sin_addr, &((struct sockaddr_in *)sb)->sin_addr, sizeof(((struct sockaddr_in *)sa)->sin_addr)); } if (sa->sa_family == AF_INET6 && __builtin_object_size(sa, 0) >= sizeof(struct sockaddr_in6)) { return ipv6_addr_cmp(&((struct sockaddr_in6 *)sa)->sin6_addr, &((struct sockaddr_in6 *)sb)->sin6_addr); } return -1; } static int cma_add_id_to_tree(struct rdma_id_private *node_id_priv) { struct rb_node **new, *parent = NULL; struct id_table_entry *this, *node; unsigned long flags; int result; node = kzalloc(sizeof(*node), GFP_KERNEL); if (!node) return -ENOMEM; spin_lock_irqsave(&id_table_lock, flags); new = &id_table.rb_node; while (*new) { this = container_of(*new, struct id_table_entry, rb_node); result = compare_netdev_and_ip( node_id_priv->id.route.addr.dev_addr.bound_dev_if, cma_dst_addr(node_id_priv), this); parent = *new; if (result < 0) new = &((*new)->rb_left); else if (result > 0) new = &((*new)->rb_right); else { list_add_tail(&node_id_priv->id_list_entry, &this->id_list); kfree(node); goto unlock; } } INIT_LIST_HEAD(&node->id_list); list_add_tail(&node_id_priv->id_list_entry, &node->id_list); rb_link_node(&node->rb_node, parent, new); rb_insert_color(&node->rb_node, &id_table); unlock: spin_unlock_irqrestore(&id_table_lock, flags); return 0; } static struct id_table_entry * node_from_ndev_ip(struct rb_root *root, int ifindex, struct sockaddr *sa) { struct rb_node *node = root->rb_node; struct id_table_entry *data; int result; while (node) { data = container_of(node, struct id_table_entry, rb_node); result = compare_netdev_and_ip(ifindex, sa, data); if (result < 0) node = node->rb_left; else if (result > 0) node = node->rb_right; else return data; } return NULL; } static void cma_remove_id_from_tree(struct rdma_id_private *id_priv) { struct id_table_entry *data; unsigned long flags; spin_lock_irqsave(&id_table_lock, flags); if (list_empty(&id_priv->id_list_entry)) goto out; data = node_from_ndev_ip(&id_table, id_priv->id.route.addr.dev_addr.bound_dev_if, cma_dst_addr(id_priv)); if (!data) goto out; list_del_init(&id_priv->id_list_entry); if (list_empty(&data->id_list)) { rb_erase(&data->rb_node, &id_table); kfree(data); } out: spin_unlock_irqrestore(&id_table_lock, flags); } static void _cma_attach_to_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev) { cma_dev_get(cma_dev); id_priv->cma_dev = cma_dev; id_priv->id.device = cma_dev->device; id_priv->id.route.addr.dev_addr.transport = rdma_node_get_transport(cma_dev->device->node_type); list_add_tail(&id_priv->device_item, &cma_dev->id_list); trace_cm_id_attach(id_priv, cma_dev->device); } static void cma_attach_to_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev) { _cma_attach_to_dev(id_priv, cma_dev); id_priv->gid_type = cma_dev->default_gid_type[id_priv->id.port_num - rdma_start_port(cma_dev->device)]; } static void cma_release_dev(struct rdma_id_private *id_priv) { mutex_lock(&lock); list_del_init(&id_priv->device_item); cma_dev_put(id_priv->cma_dev); id_priv->cma_dev = NULL; id_priv->id.device = NULL; if (id_priv->id.route.addr.dev_addr.sgid_attr) { rdma_put_gid_attr(id_priv->id.route.addr.dev_addr.sgid_attr); id_priv->id.route.addr.dev_addr.sgid_attr = NULL; } mutex_unlock(&lock); } static inline unsigned short cma_family(struct rdma_id_private *id_priv) { return id_priv->id.route.addr.src_addr.ss_family; } static int cma_set_default_qkey(struct rdma_id_private *id_priv) { struct ib_sa_mcmember_rec rec; int ret = 0; switch (id_priv->id.ps) { case RDMA_PS_UDP: case RDMA_PS_IB: id_priv->qkey = RDMA_UDP_QKEY; break; case RDMA_PS_IPOIB: ib_addr_get_mgid(&id_priv->id.route.addr.dev_addr, &rec.mgid); ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num, &rec.mgid, &rec); if (!ret) id_priv->qkey = be32_to_cpu(rec.qkey); break; default: break; } return ret; } static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey) { if (!qkey || (id_priv->qkey && (id_priv->qkey != qkey))) return -EINVAL; id_priv->qkey = qkey; return 0; } static void cma_translate_ib(struct sockaddr_ib *sib, struct rdma_dev_addr *dev_addr) { dev_addr->dev_type = ARPHRD_INFINIBAND; rdma_addr_set_sgid(dev_addr, (union ib_gid *) &sib->sib_addr); ib_addr_set_pkey(dev_addr, ntohs(sib->sib_pkey)); } static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) { int ret; if (addr->sa_family != AF_IB) { ret = rdma_translate_ip(addr, dev_addr); } else { cma_translate_ib((struct sockaddr_ib *) addr, dev_addr); ret = 0; } return ret; } static const struct ib_gid_attr * cma_validate_port(struct ib_device *device, u32 port, enum ib_gid_type gid_type, union ib_gid *gid, struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; const struct ib_gid_attr *sgid_attr = ERR_PTR(-ENODEV); int bound_if_index = dev_addr->bound_dev_if; int dev_type = dev_addr->dev_type; struct net_device *ndev = NULL; if (!rdma_dev_access_netns(device, id_priv->id.route.addr.dev_addr.net)) goto out; if ((dev_type == ARPHRD_INFINIBAND) && !rdma_protocol_ib(device, port)) goto out; if ((dev_type != ARPHRD_INFINIBAND) && rdma_protocol_ib(device, port)) goto out; /* * For drivers that do not associate more than one net device with * their gid tables, such as iWARP drivers, it is sufficient to * return the first table entry. * * Other driver classes might be included in the future. */ if (rdma_protocol_iwarp(device, port)) { sgid_attr = rdma_get_gid_attr(device, port, 0); if (IS_ERR(sgid_attr)) goto out; rcu_read_lock(); ndev = rcu_dereference(sgid_attr->ndev); if (!net_eq(dev_net(ndev), dev_addr->net) || ndev->ifindex != bound_if_index) sgid_attr = ERR_PTR(-ENODEV); rcu_read_unlock(); goto out; } if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) { ndev = dev_get_by_index(dev_addr->net, bound_if_index); if (!ndev) goto out; } else { gid_type = IB_GID_TYPE_IB; } sgid_attr = rdma_find_gid_by_port(device, gid, gid_type, port, ndev); dev_put(ndev); out: return sgid_attr; } static void cma_bind_sgid_attr(struct rdma_id_private *id_priv, const struct ib_gid_attr *sgid_attr) { WARN_ON(id_priv->id.route.addr.dev_addr.sgid_attr); id_priv->id.route.addr.dev_addr.sgid_attr = sgid_attr; } /** * cma_acquire_dev_by_src_ip - Acquire cma device, port, gid attribute * based on source ip address. * @id_priv: cm_id which should be bound to cma device * * cma_acquire_dev_by_src_ip() binds cm id to cma device, port and GID attribute * based on source IP address. It returns 0 on success or error code otherwise. * It is applicable to active and passive side cm_id. */ static int cma_acquire_dev_by_src_ip(struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; const struct ib_gid_attr *sgid_attr; union ib_gid gid, iboe_gid, *gidp; struct cma_device *cma_dev; enum ib_gid_type gid_type; int ret = -ENODEV; u32 port; if (dev_addr->dev_type != ARPHRD_INFINIBAND && id_priv->id.ps == RDMA_PS_IPOIB) return -EINVAL; rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &iboe_gid); memcpy(&gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof(gid)); mutex_lock(&lock); list_for_each_entry(cma_dev, &dev_list, list) { rdma_for_each_port (cma_dev->device, port) { gidp = rdma_protocol_roce(cma_dev->device, port) ? &iboe_gid : &gid; gid_type = cma_dev->default_gid_type[port - 1]; sgid_attr = cma_validate_port(cma_dev->device, port, gid_type, gidp, id_priv); if (!IS_ERR(sgid_attr)) { id_priv->id.port_num = port; cma_bind_sgid_attr(id_priv, sgid_attr); cma_attach_to_dev(id_priv, cma_dev); ret = 0; goto out; } } } out: mutex_unlock(&lock); return ret; } /** * cma_ib_acquire_dev - Acquire cma device, port and SGID attribute * @id_priv: cm id to bind to cma device * @listen_id_priv: listener cm id to match against * @req: Pointer to req structure containaining incoming * request information * cma_ib_acquire_dev() acquires cma device, port and SGID attribute when * rdma device matches for listen_id and incoming request. It also verifies * that a GID table entry is present for the source address. * Returns 0 on success, or returns error code otherwise. */ static int cma_ib_acquire_dev(struct rdma_id_private *id_priv, const struct rdma_id_private *listen_id_priv, struct cma_req_info *req) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; const struct ib_gid_attr *sgid_attr; enum ib_gid_type gid_type; union ib_gid gid; if (dev_addr->dev_type != ARPHRD_INFINIBAND && id_priv->id.ps == RDMA_PS_IPOIB) return -EINVAL; if (rdma_protocol_roce(req->device, req->port)) rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &gid); else memcpy(&gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof(gid)); gid_type = listen_id_priv->cma_dev->default_gid_type[req->port - 1]; sgid_attr = cma_validate_port(req->device, req->port, gid_type, &gid, id_priv); if (IS_ERR(sgid_attr)) return PTR_ERR(sgid_attr); id_priv->id.port_num = req->port; cma_bind_sgid_attr(id_priv, sgid_attr); /* Need to acquire lock to protect against reader * of cma_dev->id_list such as cma_netdev_callback() and * cma_process_remove(). */ mutex_lock(&lock); cma_attach_to_dev(id_priv, listen_id_priv->cma_dev); mutex_unlock(&lock); rdma_restrack_add(&id_priv->res); return 0; } static int cma_iw_acquire_dev(struct rdma_id_private *id_priv, const struct rdma_id_private *listen_id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; const struct ib_gid_attr *sgid_attr; struct cma_device *cma_dev; enum ib_gid_type gid_type; int ret = -ENODEV; union ib_gid gid; u32 port; if (dev_addr->dev_type != ARPHRD_INFINIBAND && id_priv->id.ps == RDMA_PS_IPOIB) return -EINVAL; memcpy(&gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof(gid)); mutex_lock(&lock); cma_dev = listen_id_priv->cma_dev; port = listen_id_priv->id.port_num; gid_type = listen_id_priv->gid_type; sgid_attr = cma_validate_port(cma_dev->device, port, gid_type, &gid, id_priv); if (!IS_ERR(sgid_attr)) { id_priv->id.port_num = port; cma_bind_sgid_attr(id_priv, sgid_attr); ret = 0; goto out; } list_for_each_entry(cma_dev, &dev_list, list) { rdma_for_each_port (cma_dev->device, port) { if (listen_id_priv->cma_dev == cma_dev && listen_id_priv->id.port_num == port) continue; gid_type = cma_dev->default_gid_type[port - 1]; sgid_attr = cma_validate_port(cma_dev->device, port, gid_type, &gid, id_priv); if (!IS_ERR(sgid_attr)) { id_priv->id.port_num = port; cma_bind_sgid_attr(id_priv, sgid_attr); ret = 0; goto out; } } } out: if (!ret) { cma_attach_to_dev(id_priv, cma_dev); rdma_restrack_add(&id_priv->res); } mutex_unlock(&lock); return ret; } /* * Select the source IB device and address to reach the destination IB address. */ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv) { struct cma_device *cma_dev, *cur_dev; struct sockaddr_ib *addr; union ib_gid gid, sgid, *dgid; unsigned int p; u16 pkey, index; enum ib_port_state port_state; int ret; int i; cma_dev = NULL; addr = (struct sockaddr_ib *) cma_dst_addr(id_priv); dgid = (union ib_gid *) &addr->sib_addr; pkey = ntohs(addr->sib_pkey); mutex_lock(&lock); list_for_each_entry(cur_dev, &dev_list, list) { rdma_for_each_port (cur_dev->device, p) { if (!rdma_cap_af_ib(cur_dev->device, p)) continue; if (ib_find_cached_pkey(cur_dev->device, p, pkey, &index)) continue; if (ib_get_cached_port_state(cur_dev->device, p, &port_state)) continue; for (i = 0; i < cur_dev->device->port_data[p].immutable.gid_tbl_len; ++i) { ret = rdma_query_gid(cur_dev->device, p, i, &gid); if (ret) continue; if (!memcmp(&gid, dgid, sizeof(gid))) { cma_dev = cur_dev; sgid = gid; id_priv->id.port_num = p; goto found; } if (!cma_dev && (gid.global.subnet_prefix == dgid->global.subnet_prefix) && port_state == IB_PORT_ACTIVE) { cma_dev = cur_dev; sgid = gid; id_priv->id.port_num = p; goto found; } } } } mutex_unlock(&lock); return -ENODEV; found: cma_attach_to_dev(id_priv, cma_dev); rdma_restrack_add(&id_priv->res); mutex_unlock(&lock); addr = (struct sockaddr_ib *)cma_src_addr(id_priv); memcpy(&addr->sib_addr, &sgid, sizeof(sgid)); cma_translate_ib(addr, &id_priv->id.route.addr.dev_addr); return 0; } static void cma_id_get(struct rdma_id_private *id_priv) { refcount_inc(&id_priv->refcount); } static void cma_id_put(struct rdma_id_private *id_priv) { if (refcount_dec_and_test(&id_priv->refcount)) complete(&id_priv->comp); } static struct rdma_id_private * __rdma_create_id(struct net *net, rdma_cm_event_handler event_handler, void *context, enum rdma_ucm_port_space ps, enum ib_qp_type qp_type, const struct rdma_id_private *parent) { struct rdma_id_private *id_priv; id_priv = kzalloc(sizeof *id_priv, GFP_KERNEL); if (!id_priv) return ERR_PTR(-ENOMEM); id_priv->state = RDMA_CM_IDLE; id_priv->id.context = context; id_priv->id.event_handler = event_handler; id_priv->id.ps = ps; id_priv->id.qp_type = qp_type; id_priv->tos_set = false; id_priv->timeout_set = false; id_priv->min_rnr_timer_set = false; id_priv->gid_type = IB_GID_TYPE_IB; spin_lock_init(&id_priv->lock); mutex_init(&id_priv->qp_mutex); init_completion(&id_priv->comp); refcount_set(&id_priv->refcount, 1); mutex_init(&id_priv->handler_mutex); INIT_LIST_HEAD(&id_priv->device_item); INIT_LIST_HEAD(&id_priv->id_list_entry); INIT_LIST_HEAD(&id_priv->listen_list); INIT_LIST_HEAD(&id_priv->mc_list); get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num); id_priv->id.route.addr.dev_addr.net = get_net(net); id_priv->seq_num &= 0x00ffffff; rdma_restrack_new(&id_priv->res, RDMA_RESTRACK_CM_ID); if (parent) rdma_restrack_parent_name(&id_priv->res, &parent->res); return id_priv; } struct rdma_cm_id * __rdma_create_kernel_id(struct net *net, rdma_cm_event_handler event_handler, void *context, enum rdma_ucm_port_space ps, enum ib_qp_type qp_type, const char *caller) { struct rdma_id_private *ret; ret = __rdma_create_id(net, event_handler, context, ps, qp_type, NULL); if (IS_ERR(ret)) return ERR_CAST(ret); rdma_restrack_set_name(&ret->res, caller); return &ret->id; } EXPORT_SYMBOL(__rdma_create_kernel_id); struct rdma_cm_id *rdma_create_user_id(rdma_cm_event_handler event_handler, void *context, enum rdma_ucm_port_space ps, enum ib_qp_type qp_type) { struct rdma_id_private *ret; ret = __rdma_create_id(current->nsproxy->net_ns, event_handler, context, ps, qp_type, NULL); if (IS_ERR(ret)) return ERR_CAST(ret); rdma_restrack_set_name(&ret->res, NULL); return &ret->id; } EXPORT_SYMBOL(rdma_create_user_id); static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; qp_attr.qp_state = IB_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) return ret; ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); if (ret) return ret; qp_attr.qp_state = IB_QPS_RTR; ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE); if (ret) return ret; qp_attr.qp_state = IB_QPS_RTS; qp_attr.sq_psn = 0; ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_SQ_PSN); return ret; } static int cma_init_conn_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; qp_attr.qp_state = IB_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) return ret; return ib_modify_qp(qp, &qp_attr, qp_attr_mask); } int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr) { struct rdma_id_private *id_priv; struct ib_qp *qp; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (id->device != pd->device) { ret = -EINVAL; goto out_err; } qp_init_attr->port_num = id->port_num; qp = ib_create_qp(pd, qp_init_attr); if (IS_ERR(qp)) { ret = PTR_ERR(qp); goto out_err; } if (id->qp_type == IB_QPT_UD) ret = cma_init_ud_qp(id_priv, qp); else ret = cma_init_conn_qp(id_priv, qp); if (ret) goto out_destroy; id->qp = qp; id_priv->qp_num = qp->qp_num; id_priv->srq = (qp->srq != NULL); trace_cm_qp_create(id_priv, pd, qp_init_attr, 0); return 0; out_destroy: ib_destroy_qp(qp); out_err: trace_cm_qp_create(id_priv, pd, qp_init_attr, ret); return ret; } EXPORT_SYMBOL(rdma_create_qp); void rdma_destroy_qp(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); trace_cm_qp_destroy(id_priv); mutex_lock(&id_priv->qp_mutex); ib_destroy_qp(id_priv->id.qp); id_priv->id.qp = NULL; mutex_unlock(&id_priv->qp_mutex); } EXPORT_SYMBOL(rdma_destroy_qp); static int cma_modify_qp_rtr(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { ret = 0; goto out; } /* Need to update QP attributes from default values. */ qp_attr.qp_state = IB_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); if (ret) goto out; qp_attr.qp_state = IB_QPS_RTR; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; BUG_ON(id_priv->cma_dev->device != id_priv->id.device); if (conn_param) qp_attr.max_dest_rd_atomic = conn_param->responder_resources; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); out: mutex_unlock(&id_priv->qp_mutex); return ret; } static int cma_modify_qp_rts(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { ret = 0; goto out; } qp_attr.qp_state = IB_QPS_RTS; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; if (conn_param) qp_attr.max_rd_atomic = conn_param->initiator_depth; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); out: mutex_unlock(&id_priv->qp_mutex); return ret; } static int cma_modify_qp_err(struct rdma_id_private *id_priv) { struct ib_qp_attr qp_attr; int ret; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { ret = 0; goto out; } qp_attr.qp_state = IB_QPS_ERR; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, IB_QP_STATE); out: mutex_unlock(&id_priv->qp_mutex); return ret; } static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; int ret; u16 pkey; if (rdma_cap_eth_ah(id_priv->id.device, id_priv->id.port_num)) pkey = 0xffff; else pkey = ib_addr_get_pkey(dev_addr); ret = ib_find_cached_pkey(id_priv->id.device, id_priv->id.port_num, pkey, &qp_attr->pkey_index); if (ret) return ret; qp_attr->port_num = id_priv->id.port_num; *qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT; if (id_priv->id.qp_type == IB_QPT_UD) { ret = cma_set_default_qkey(id_priv); if (ret) return ret; qp_attr->qkey = id_priv->qkey; *qp_attr_mask |= IB_QP_QKEY; } else { qp_attr->qp_access_flags = 0; *qp_attr_mask |= IB_QP_ACCESS_FLAGS; } return 0; } int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { struct rdma_id_private *id_priv; int ret = 0; id_priv = container_of(id, struct rdma_id_private, id); if (rdma_cap_ib_cm(id->device, id->port_num)) { if (!id_priv->cm_id.ib || (id_priv->id.qp_type == IB_QPT_UD)) ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask); else ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr, qp_attr_mask); if (qp_attr->qp_state == IB_QPS_RTR) qp_attr->rq_psn = id_priv->seq_num; } else if (rdma_cap_iw_cm(id->device, id->port_num)) { if (!id_priv->cm_id.iw) { qp_attr->qp_access_flags = 0; *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS; } else ret = iw_cm_init_qp_attr(id_priv->cm_id.iw, qp_attr, qp_attr_mask); qp_attr->port_num = id_priv->id.port_num; *qp_attr_mask |= IB_QP_PORT; } else { ret = -ENOSYS; } if ((*qp_attr_mask & IB_QP_TIMEOUT) && id_priv->timeout_set) qp_attr->timeout = id_priv->timeout; if ((*qp_attr_mask & IB_QP_MIN_RNR_TIMER) && id_priv->min_rnr_timer_set) qp_attr->min_rnr_timer = id_priv->min_rnr_timer; return ret; } EXPORT_SYMBOL(rdma_init_qp_attr); static inline bool cma_zero_addr(const struct sockaddr *addr) { switch (addr->sa_family) { case AF_INET: return ipv4_is_zeronet(((struct sockaddr_in *)addr)->sin_addr.s_addr); case AF_INET6: return ipv6_addr_any(&((struct sockaddr_in6 *)addr)->sin6_addr); case AF_IB: return ib_addr_any(&((struct sockaddr_ib *)addr)->sib_addr); default: return false; } } static inline bool cma_loopback_addr(const struct sockaddr *addr) { switch (addr->sa_family) { case AF_INET: return ipv4_is_loopback( ((struct sockaddr_in *)addr)->sin_addr.s_addr); case AF_INET6: return ipv6_addr_loopback( &((struct sockaddr_in6 *)addr)->sin6_addr); case AF_IB: return ib_addr_loopback( &((struct sockaddr_ib *)addr)->sib_addr); default: return false; } } static inline bool cma_any_addr(const struct sockaddr *addr) { return cma_zero_addr(addr) || cma_loopback_addr(addr); } static int cma_addr_cmp(const struct sockaddr *src, const struct sockaddr *dst) { if (src->sa_family != dst->sa_family) return -1; switch (src->sa_family) { case AF_INET: return ((struct sockaddr_in *)src)->sin_addr.s_addr != ((struct sockaddr_in *)dst)->sin_addr.s_addr; case AF_INET6: { struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *)src; struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *)dst; bool link_local; if (ipv6_addr_cmp(&src_addr6->sin6_addr, &dst_addr6->sin6_addr)) return 1; link_local = ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL; /* Link local must match their scope_ids */ return link_local ? (src_addr6->sin6_scope_id != dst_addr6->sin6_scope_id) : 0; } default: return ib_addr_cmp(&((struct sockaddr_ib *) src)->sib_addr, &((struct sockaddr_ib *) dst)->sib_addr); } } static __be16 cma_port(const struct sockaddr *addr) { struct sockaddr_ib *sib; switch (addr->sa_family) { case AF_INET: return ((struct sockaddr_in *) addr)->sin_port; case AF_INET6: return ((struct sockaddr_in6 *) addr)->sin6_port; case AF_IB: sib = (struct sockaddr_ib *) addr; return htons((u16) (be64_to_cpu(sib->sib_sid) & be64_to_cpu(sib->sib_sid_mask))); default: return 0; } } static inline int cma_any_port(const struct sockaddr *addr) { return !cma_port(addr); } static void cma_save_ib_info(struct sockaddr *src_addr, struct sockaddr *dst_addr, const struct rdma_cm_id *listen_id, const struct sa_path_rec *path) { struct sockaddr_ib *listen_ib, *ib; listen_ib = (struct sockaddr_ib *) &listen_id->route.addr.src_addr; if (src_addr) { ib = (struct sockaddr_ib *)src_addr; ib->sib_family = AF_IB; if (path) { ib->sib_pkey = path->pkey; ib->sib_flowinfo = path->flow_label; memcpy(&ib->sib_addr, &path->sgid, 16); ib->sib_sid = path->service_id; ib->sib_scope_id = 0; } else { ib->sib_pkey = listen_ib->sib_pkey; ib->sib_flowinfo = listen_ib->sib_flowinfo; ib->sib_addr = listen_ib->sib_addr; ib->sib_sid = listen_ib->sib_sid; ib->sib_scope_id = listen_ib->sib_scope_id; } ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL); } if (dst_addr) { ib = (struct sockaddr_ib *)dst_addr; ib->sib_family = AF_IB; if (path) { ib->sib_pkey = path->pkey; ib->sib_flowinfo = path->flow_label; memcpy(&ib->sib_addr, &path->dgid, 16); } } } static void cma_save_ip4_info(struct sockaddr_in *src_addr, struct sockaddr_in *dst_addr, struct cma_hdr *hdr, __be16 local_port) { if (src_addr) { *src_addr = (struct sockaddr_in) { .sin_family = AF_INET, .sin_addr.s_addr = hdr->dst_addr.ip4.addr, .sin_port = local_port, }; } if (dst_addr) { *dst_addr = (struct sockaddr_in) { .sin_family = AF_INET, .sin_addr.s_addr = hdr->src_addr.ip4.addr, .sin_port = hdr->port, }; } } static void cma_save_ip6_info(struct sockaddr_in6 *src_addr, struct sockaddr_in6 *dst_addr, struct cma_hdr *hdr, __be16 local_port) { if (src_addr) { *src_addr = (struct sockaddr_in6) { .sin6_family = AF_INET6, .sin6_addr = hdr->dst_addr.ip6, .sin6_port = local_port, }; } if (dst_addr) { *dst_addr = (struct sockaddr_in6) { .sin6_family = AF_INET6, .sin6_addr = hdr->src_addr.ip6, .sin6_port = hdr->port, }; } } static u16 cma_port_from_service_id(__be64 service_id) { return (u16)be64_to_cpu(service_id); } static int cma_save_ip_info(struct sockaddr *src_addr, struct sockaddr *dst_addr, const struct ib_cm_event *ib_event, __be64 service_id) { struct cma_hdr *hdr; __be16 port; hdr = ib_event->private_data; if (hdr->cma_version != CMA_VERSION) return -EINVAL; port = htons(cma_port_from_service_id(service_id)); switch (cma_get_ip_ver(hdr)) { case 4: cma_save_ip4_info((struct sockaddr_in *)src_addr, (struct sockaddr_in *)dst_addr, hdr, port); break; case 6: cma_save_ip6_info((struct sockaddr_in6 *)src_addr, (struct sockaddr_in6 *)dst_addr, hdr, port); break; default: return -EAFNOSUPPORT; } return 0; } static int cma_save_net_info(struct sockaddr *src_addr, struct sockaddr *dst_addr, const struct rdma_cm_id *listen_id, const struct ib_cm_event *ib_event, sa_family_t sa_family, __be64 service_id) { if (sa_family == AF_IB) { if (ib_event->event == IB_CM_REQ_RECEIVED) cma_save_ib_info(src_addr, dst_addr, listen_id, ib_event->param.req_rcvd.primary_path); else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) cma_save_ib_info(src_addr, dst_addr, listen_id, NULL); return 0; } return cma_save_ip_info(src_addr, dst_addr, ib_event, service_id); } static int cma_save_req_info(const struct ib_cm_event *ib_event, struct cma_req_info *req) { const struct ib_cm_req_event_param *req_param = &ib_event->param.req_rcvd; const struct ib_cm_sidr_req_event_param *sidr_param = &ib_event->param.sidr_req_rcvd; switch (ib_event->event) { case IB_CM_REQ_RECEIVED: req->device = req_param->listen_id->device; req->port = req_param->port; memcpy(&req->local_gid, &req_param->primary_path->sgid, sizeof(req->local_gid)); req->has_gid = true; req->service_id = req_param->primary_path->service_id; req->pkey = be16_to_cpu(req_param->primary_path->pkey); if (req->pkey != req_param->bth_pkey) pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and primary path P_Key (0x%x)\n" "RDMA CMA: in the future this may cause the request to be dropped\n", req_param->bth_pkey, req->pkey); break; case IB_CM_SIDR_REQ_RECEIVED: req->device = sidr_param->listen_id->device; req->port = sidr_param->port; req->has_gid = false; req->service_id = sidr_param->service_id; req->pkey = sidr_param->pkey; if (req->pkey != sidr_param->bth_pkey) pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and SIDR request payload P_Key (0x%x)\n" "RDMA CMA: in the future this may cause the request to be dropped\n", sidr_param->bth_pkey, req->pkey); break; default: return -EINVAL; } return 0; } static bool validate_ipv4_net_dev(struct net_device *net_dev, const struct sockaddr_in *dst_addr, const struct sockaddr_in *src_addr) { __be32 daddr = dst_addr->sin_addr.s_addr, saddr = src_addr->sin_addr.s_addr; struct fib_result res; struct flowi4 fl4; int err; bool ret; if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || ipv4_is_lbcast(daddr) || ipv4_is_zeronet(saddr) || ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr) || ipv4_is_loopback(saddr)) return false; memset(&fl4, 0, sizeof(fl4)); fl4.flowi4_oif = net_dev->ifindex; fl4.daddr = daddr; fl4.saddr = saddr; rcu_read_lock(); err = fib_lookup(dev_net(net_dev), &fl4, &res, 0); ret = err == 0 && FIB_RES_DEV(res) == net_dev; rcu_read_unlock(); return ret; } static bool validate_ipv6_net_dev(struct net_device *net_dev, const struct sockaddr_in6 *dst_addr, const struct sockaddr_in6 *src_addr) { #if IS_ENABLED(CONFIG_IPV6) const int strict = ipv6_addr_type(&dst_addr->sin6_addr) & IPV6_ADDR_LINKLOCAL; struct rt6_info *rt = rt6_lookup(dev_net(net_dev), &dst_addr->sin6_addr, &src_addr->sin6_addr, net_dev->ifindex, NULL, strict); bool ret; if (!rt) return false; ret = rt->rt6i_idev->dev == net_dev; ip6_rt_put(rt); return ret; #else return false; #endif } static bool validate_net_dev(struct net_device *net_dev, const struct sockaddr *daddr, const struct sockaddr *saddr) { const struct sockaddr_in *daddr4 = (const struct sockaddr_in *)daddr; const struct sockaddr_in *saddr4 = (const struct sockaddr_in *)saddr; const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr; const struct sockaddr_in6 *saddr6 = (const struct sockaddr_in6 *)saddr; switch (daddr->sa_family) { case AF_INET: return saddr->sa_family == AF_INET && validate_ipv4_net_dev(net_dev, daddr4, saddr4); case AF_INET6: return saddr->sa_family == AF_INET6 && validate_ipv6_net_dev(net_dev, daddr6, saddr6); default: return false; } } static struct net_device * roce_get_net_dev_by_cm_event(const struct ib_cm_event *ib_event) { const struct ib_gid_attr *sgid_attr = NULL; struct net_device *ndev; if (ib_event->event == IB_CM_REQ_RECEIVED) sgid_attr = ib_event->param.req_rcvd.ppath_sgid_attr; else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) sgid_attr = ib_event->param.sidr_req_rcvd.sgid_attr; if (!sgid_attr) return NULL; rcu_read_lock(); ndev = rdma_read_gid_attr_ndev_rcu(sgid_attr); if (IS_ERR(ndev)) ndev = NULL; else dev_hold(ndev); rcu_read_unlock(); return ndev; } static struct net_device *cma_get_net_dev(const struct ib_cm_event *ib_event, struct cma_req_info *req) { struct sockaddr *listen_addr = (struct sockaddr *)&req->listen_addr_storage; struct sockaddr *src_addr = (struct sockaddr *)&req->src_addr_storage; struct net_device *net_dev; const union ib_gid *gid = req->has_gid ? &req->local_gid : NULL; int err; err = cma_save_ip_info(listen_addr, src_addr, ib_event, req->service_id); if (err) return ERR_PTR(err); if (rdma_protocol_roce(req->device, req->port)) net_dev = roce_get_net_dev_by_cm_event(ib_event); else net_dev = ib_get_net_dev_by_params(req->device, req->port, req->pkey, gid, listen_addr); if (!net_dev) return ERR_PTR(-ENODEV); return net_dev; } static enum rdma_ucm_port_space rdma_ps_from_service_id(__be64 service_id) { return (be64_to_cpu(service_id) >> 16) & 0xffff; } static bool cma_match_private_data(struct rdma_id_private *id_priv, const struct cma_hdr *hdr) { struct sockaddr *addr = cma_src_addr(id_priv); __be32 ip4_addr; struct in6_addr ip6_addr; if (cma_any_addr(addr) && !id_priv->afonly) return true; switch (addr->sa_family) { case AF_INET: ip4_addr = ((struct sockaddr_in *)addr)->sin_addr.s_addr; if (cma_get_ip_ver(hdr) != 4) return false; if (!cma_any_addr(addr) && hdr->dst_addr.ip4.addr != ip4_addr) return false; break; case AF_INET6: ip6_addr = ((struct sockaddr_in6 *)addr)->sin6_addr; if (cma_get_ip_ver(hdr) != 6) return false; if (!cma_any_addr(addr) && memcmp(&hdr->dst_addr.ip6, &ip6_addr, sizeof(ip6_addr))) return false; break; case AF_IB: return true; default: return false; } return true; } static bool cma_protocol_roce(const struct rdma_cm_id *id) { struct ib_device *device = id->device; const u32 port_num = id->port_num ?: rdma_start_port(device); return rdma_protocol_roce(device, port_num); } static bool cma_is_req_ipv6_ll(const struct cma_req_info *req) { const struct sockaddr *daddr = (const struct sockaddr *)&req->listen_addr_storage; const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr; /* Returns true if the req is for IPv6 link local */ return (daddr->sa_family == AF_INET6 && (ipv6_addr_type(&daddr6->sin6_addr) & IPV6_ADDR_LINKLOCAL)); } static bool cma_match_net_dev(const struct rdma_cm_id *id, const struct net_device *net_dev, const struct cma_req_info *req) { const struct rdma_addr *addr = &id->route.addr; if (!net_dev) /* This request is an AF_IB request */ return (!id->port_num || id->port_num == req->port) && (addr->src_addr.ss_family == AF_IB); /* * If the request is not for IPv6 link local, allow matching * request to any netdevice of the one or multiport rdma device. */ if (!cma_is_req_ipv6_ll(req)) return true; /* * Net namespaces must match, and if the listner is listening * on a specific netdevice than netdevice must match as well. */ if (net_eq(dev_net(net_dev), addr->dev_addr.net) && (!!addr->dev_addr.bound_dev_if == (addr->dev_addr.bound_dev_if == net_dev->ifindex))) return true; else return false; } static struct rdma_id_private *cma_find_listener( const struct rdma_bind_list *bind_list, const struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event, const struct cma_req_info *req, const struct net_device *net_dev) { struct rdma_id_private *id_priv, *id_priv_dev; lockdep_assert_held(&lock); if (!bind_list) return ERR_PTR(-EINVAL); hlist_for_each_entry(id_priv, &bind_list->owners, node) { if (cma_match_private_data(id_priv, ib_event->private_data)) { if (id_priv->id.device == cm_id->device && cma_match_net_dev(&id_priv->id, net_dev, req)) return id_priv; list_for_each_entry(id_priv_dev, &id_priv->listen_list, listen_item) { if (id_priv_dev->id.device == cm_id->device && cma_match_net_dev(&id_priv_dev->id, net_dev, req)) return id_priv_dev; } } } return ERR_PTR(-EINVAL); } static struct rdma_id_private * cma_ib_id_from_event(struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event, struct cma_req_info *req, struct net_device **net_dev) { struct rdma_bind_list *bind_list; struct rdma_id_private *id_priv; int err; err = cma_save_req_info(ib_event, req); if (err) return ERR_PTR(err); *net_dev = cma_get_net_dev(ib_event, req); if (IS_ERR(*net_dev)) { if (PTR_ERR(*net_dev) == -EAFNOSUPPORT) { /* Assuming the protocol is AF_IB */ *net_dev = NULL; } else { return ERR_CAST(*net_dev); } } mutex_lock(&lock); /* * Net namespace might be getting deleted while route lookup, * cm_id lookup is in progress. Therefore, perform netdevice * validation, cm_id lookup under rcu lock. * RCU lock along with netdevice state check, synchronizes with * netdevice migrating to different net namespace and also avoids * case where net namespace doesn't get deleted while lookup is in * progress. * If the device state is not IFF_UP, its properties such as ifindex * and nd_net cannot be trusted to remain valid without rcu lock. * net/core/dev.c change_net_namespace() ensures to synchronize with * ongoing operations on net device after device is closed using * synchronize_net(). */ rcu_read_lock(); if (*net_dev) { /* * If netdevice is down, it is likely that it is administratively * down or it might be migrating to different namespace. * In that case avoid further processing, as the net namespace * or ifindex may change. */ if (((*net_dev)->flags & IFF_UP) == 0) { id_priv = ERR_PTR(-EHOSTUNREACH); goto err; } if (!validate_net_dev(*net_dev, (struct sockaddr *)&req->src_addr_storage, (struct sockaddr *)&req->listen_addr_storage)) { id_priv = ERR_PTR(-EHOSTUNREACH); goto err; } } bind_list = cma_ps_find(*net_dev ? dev_net(*net_dev) : &init_net, rdma_ps_from_service_id(req->service_id), cma_port_from_service_id(req->service_id)); id_priv = cma_find_listener(bind_list, cm_id, ib_event, req, *net_dev); err: rcu_read_unlock(); mutex_unlock(&lock); if (IS_ERR(id_priv) && *net_dev) { dev_put(*net_dev); *net_dev = NULL; } return id_priv; } static inline u8 cma_user_data_offset(struct rdma_id_private *id_priv) { return cma_family(id_priv) == AF_IB ? 0 : sizeof(struct cma_hdr); } static void cma_cancel_route(struct rdma_id_private *id_priv) { if (rdma_cap_ib_sa(id_priv->id.device, id_priv->id.port_num)) { if (id_priv->query) ib_sa_cancel_query(id_priv->query_id, id_priv->query); } } static void _cma_cancel_listens(struct rdma_id_private *id_priv) { struct rdma_id_private *dev_id_priv; lockdep_assert_held(&lock); /* * Remove from listen_any_list to prevent added devices from spawning * additional listen requests. */ list_del_init(&id_priv->listen_any_item); while (!list_empty(&id_priv->listen_list)) { dev_id_priv = list_first_entry(&id_priv->listen_list, struct rdma_id_private, listen_item); /* sync with device removal to avoid duplicate destruction */ list_del_init(&dev_id_priv->device_item); list_del_init(&dev_id_priv->listen_item); mutex_unlock(&lock); rdma_destroy_id(&dev_id_priv->id); mutex_lock(&lock); } } static void cma_cancel_listens(struct rdma_id_private *id_priv) { mutex_lock(&lock); _cma_cancel_listens(id_priv); mutex_unlock(&lock); } static void cma_cancel_operation(struct rdma_id_private *id_priv, enum rdma_cm_state state) { switch (state) { case RDMA_CM_ADDR_QUERY: /* * We can avoid doing the rdma_addr_cancel() based on state, * only RDMA_CM_ADDR_QUERY has a work that could still execute. * Notice that the addr_handler work could still be exiting * outside this state, however due to the interaction with the * handler_mutex the work is guaranteed not to touch id_priv * during exit. */ rdma_addr_cancel(&id_priv->id.route.addr.dev_addr); break; case RDMA_CM_ROUTE_QUERY: cma_cancel_route(id_priv); break; case RDMA_CM_LISTEN: if (cma_any_addr(cma_src_addr(id_priv)) && !id_priv->cma_dev) cma_cancel_listens(id_priv); break; default: break; } } static void cma_release_port(struct rdma_id_private *id_priv) { struct rdma_bind_list *bind_list = id_priv->bind_list; struct net *net = id_priv->id.route.addr.dev_addr.net; if (!bind_list) return; mutex_lock(&lock); hlist_del(&id_priv->node); if (hlist_empty(&bind_list->owners)) { cma_ps_remove(net, bind_list->ps, bind_list->port); kfree(bind_list); } mutex_unlock(&lock); } static void destroy_mc(struct rdma_id_private *id_priv, struct cma_multicast *mc) { bool send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN); if (rdma_cap_ib_mcast(id_priv->id.device, id_priv->id.port_num)) ib_sa_free_multicast(mc->sa_mc); if (rdma_protocol_roce(id_priv->id.device, id_priv->id.port_num)) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct net_device *ndev = NULL; if (dev_addr->bound_dev_if) ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); if (ndev && !send_only) { enum ib_gid_type gid_type; union ib_gid mgid; gid_type = id_priv->cma_dev->default_gid_type [id_priv->id.port_num - rdma_start_port( id_priv->cma_dev->device)]; cma_iboe_set_mgid((struct sockaddr *)&mc->addr, &mgid, gid_type); cma_igmp_send(ndev, &mgid, false); } dev_put(ndev); cancel_work_sync(&mc->iboe_join.work); } kfree(mc); } static void cma_leave_mc_groups(struct rdma_id_private *id_priv) { struct cma_multicast *mc; while (!list_empty(&id_priv->mc_list)) { mc = list_first_entry(&id_priv->mc_list, struct cma_multicast, list); list_del(&mc->list); destroy_mc(id_priv, mc); } } static void _destroy_id(struct rdma_id_private *id_priv, enum rdma_cm_state state) { cma_cancel_operation(id_priv, state); rdma_restrack_del(&id_priv->res); cma_remove_id_from_tree(id_priv); if (id_priv->cma_dev) { if (rdma_cap_ib_cm(id_priv->id.device, 1)) { if (id_priv->cm_id.ib) ib_destroy_cm_id(id_priv->cm_id.ib); } else if (rdma_cap_iw_cm(id_priv->id.device, 1)) { if (id_priv->cm_id.iw) iw_destroy_cm_id(id_priv->cm_id.iw); } cma_leave_mc_groups(id_priv); cma_release_dev(id_priv); } cma_release_port(id_priv); cma_id_put(id_priv); wait_for_completion(&id_priv->comp); if (id_priv->internal_id) cma_id_put(id_priv->id.context); kfree(id_priv->id.route.path_rec); kfree(id_priv->id.route.path_rec_inbound); kfree(id_priv->id.route.path_rec_outbound); put_net(id_priv->id.route.addr.dev_addr.net); kfree(id_priv); } /* * destroy an ID from within the handler_mutex. This ensures that no other * handlers can start running concurrently. */ static void destroy_id_handler_unlock(struct rdma_id_private *id_priv) __releases(&idprv->handler_mutex) { enum rdma_cm_state state; unsigned long flags; trace_cm_id_destroy(id_priv); /* * Setting the state to destroyed under the handler mutex provides a * fence against calling handler callbacks. If this is invoked due to * the failure of a handler callback then it guarentees that no future * handlers will be called. */ lockdep_assert_held(&id_priv->handler_mutex); spin_lock_irqsave(&id_priv->lock, flags); state = id_priv->state; id_priv->state = RDMA_CM_DESTROYING; spin_unlock_irqrestore(&id_priv->lock, flags); mutex_unlock(&id_priv->handler_mutex); _destroy_id(id_priv, state); } void rdma_destroy_id(struct rdma_cm_id *id) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->handler_mutex); destroy_id_handler_unlock(id_priv); } EXPORT_SYMBOL(rdma_destroy_id); static int cma_rep_recv(struct rdma_id_private *id_priv) { int ret; ret = cma_modify_qp_rtr(id_priv, NULL); if (ret) goto reject; ret = cma_modify_qp_rts(id_priv, NULL); if (ret) goto reject; trace_cm_send_rtu(id_priv); ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0); if (ret) goto reject; return 0; reject: pr_debug_ratelimited("RDMA CM: CONNECT_ERROR: failed to handle reply. status %d\n", ret); cma_modify_qp_err(id_priv); trace_cm_send_rej(id_priv); ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, NULL, 0); return ret; } static void cma_set_rep_event_data(struct rdma_cm_event *event, const struct ib_cm_rep_event_param *rep_data, void *private_data) { event->param.conn.private_data = private_data; event->param.conn.private_data_len = IB_CM_REP_PRIVATE_DATA_SIZE; event->param.conn.responder_resources = rep_data->responder_resources; event->param.conn.initiator_depth = rep_data->initiator_depth; event->param.conn.flow_control = rep_data->flow_control; event->param.conn.rnr_retry_count = rep_data->rnr_retry_count; event->param.conn.srq = rep_data->srq; event->param.conn.qp_num = rep_data->remote_qpn; event->ece.vendor_id = rep_data->ece.vendor_id; event->ece.attr_mod = rep_data->ece.attr_mod; } static int cma_cm_event_handler(struct rdma_id_private *id_priv, struct rdma_cm_event *event) { int ret; lockdep_assert_held(&id_priv->handler_mutex); trace_cm_event_handler(id_priv, event); ret = id_priv->id.event_handler(&id_priv->id, event); trace_cm_event_done(id_priv, event, ret); return ret; } static int cma_ib_handler(struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event) { struct rdma_id_private *id_priv = cm_id->context; struct rdma_cm_event event = {}; enum rdma_cm_state state; int ret; mutex_lock(&id_priv->handler_mutex); state = READ_ONCE(id_priv->state); if ((ib_event->event != IB_CM_TIMEWAIT_EXIT && state != RDMA_CM_CONNECT) || (ib_event->event == IB_CM_TIMEWAIT_EXIT && state != RDMA_CM_DISCONNECT)) goto out; switch (ib_event->event) { case IB_CM_REQ_ERROR: case IB_CM_REP_ERROR: event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = -ETIMEDOUT; break; case IB_CM_REP_RECEIVED: if (state == RDMA_CM_CONNECT && (id_priv->id.qp_type != IB_QPT_UD)) { trace_cm_send_mra(id_priv); ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); } if (id_priv->id.qp) { event.status = cma_rep_recv(id_priv); event.event = event.status ? RDMA_CM_EVENT_CONNECT_ERROR : RDMA_CM_EVENT_ESTABLISHED; } else { event.event = RDMA_CM_EVENT_CONNECT_RESPONSE; } cma_set_rep_event_data(&event, &ib_event->param.rep_rcvd, ib_event->private_data); break; case IB_CM_RTU_RECEIVED: case IB_CM_USER_ESTABLISHED: event.event = RDMA_CM_EVENT_ESTABLISHED; break; case IB_CM_DREQ_ERROR: event.status = -ETIMEDOUT; fallthrough; case IB_CM_DREQ_RECEIVED: case IB_CM_DREP_RECEIVED: if (!cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_DISCONNECT)) goto out; event.event = RDMA_CM_EVENT_DISCONNECTED; break; case IB_CM_TIMEWAIT_EXIT: event.event = RDMA_CM_EVENT_TIMEWAIT_EXIT; break; case IB_CM_MRA_RECEIVED: /* ignore event */ goto out; case IB_CM_REJ_RECEIVED: pr_debug_ratelimited("RDMA CM: REJECTED: %s\n", rdma_reject_msg(&id_priv->id, ib_event->param.rej_rcvd.reason)); cma_modify_qp_err(id_priv); event.status = ib_event->param.rej_rcvd.reason; event.event = RDMA_CM_EVENT_REJECTED; event.param.conn.private_data = ib_event->private_data; event.param.conn.private_data_len = IB_CM_REJ_PRIVATE_DATA_SIZE; break; default: pr_err("RDMA CMA: unexpected IB CM event: %d\n", ib_event->event); goto out; } ret = cma_cm_event_handler(id_priv, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.ib = NULL; destroy_id_handler_unlock(id_priv); return ret; } out: mutex_unlock(&id_priv->handler_mutex); return 0; } static struct rdma_id_private * cma_ib_new_conn_id(const struct rdma_cm_id *listen_id, const struct ib_cm_event *ib_event, struct net_device *net_dev) { struct rdma_id_private *listen_id_priv; struct rdma_id_private *id_priv; struct rdma_cm_id *id; struct rdma_route *rt; const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family; struct sa_path_rec *path = ib_event->param.req_rcvd.primary_path; const __be64 service_id = ib_event->param.req_rcvd.primary_path->service_id; int ret; listen_id_priv = container_of(listen_id, struct rdma_id_private, id); id_priv = __rdma_create_id(listen_id->route.addr.dev_addr.net, listen_id->event_handler, listen_id->context, listen_id->ps, ib_event->param.req_rcvd.qp_type, listen_id_priv); if (IS_ERR(id_priv)) return NULL; id = &id_priv->id; if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr, (struct sockaddr *)&id->route.addr.dst_addr, listen_id, ib_event, ss_family, service_id)) goto err; rt = &id->route; rt->num_pri_alt_paths = ib_event->param.req_rcvd.alternate_path ? 2 : 1; rt->path_rec = kmalloc_array(rt->num_pri_alt_paths, sizeof(*rt->path_rec), GFP_KERNEL); if (!rt->path_rec) goto err; rt->path_rec[0] = *path; if (rt->num_pri_alt_paths == 2) rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path; if (net_dev) { rdma_copy_src_l2_addr(&rt->addr.dev_addr, net_dev); } else { if (!cma_protocol_roce(listen_id) && cma_any_addr(cma_src_addr(id_priv))) { rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND; rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid); ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey)); } else if (!cma_any_addr(cma_src_addr(id_priv))) { ret = cma_translate_addr(cma_src_addr(id_priv), &rt->addr.dev_addr); if (ret) goto err; } } rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid); id_priv->state = RDMA_CM_CONNECT; return id_priv; err: rdma_destroy_id(id); return NULL; } static struct rdma_id_private * cma_ib_new_udp_id(const struct rdma_cm_id *listen_id, const struct ib_cm_event *ib_event, struct net_device *net_dev) { const struct rdma_id_private *listen_id_priv; struct rdma_id_private *id_priv; struct rdma_cm_id *id; const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family; struct net *net = listen_id->route.addr.dev_addr.net; int ret; listen_id_priv = container_of(listen_id, struct rdma_id_private, id); id_priv = __rdma_create_id(net, listen_id->event_handler, listen_id->context, listen_id->ps, IB_QPT_UD, listen_id_priv); if (IS_ERR(id_priv)) return NULL; id = &id_priv->id; if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr, (struct sockaddr *)&id->route.addr.dst_addr, listen_id, ib_event, ss_family, ib_event->param.sidr_req_rcvd.service_id)) goto err; if (net_dev) { rdma_copy_src_l2_addr(&id->route.addr.dev_addr, net_dev); } else { if (!cma_any_addr(cma_src_addr(id_priv))) { ret = cma_translate_addr(cma_src_addr(id_priv), &id->route.addr.dev_addr); if (ret) goto err; } } id_priv->state = RDMA_CM_CONNECT; return id_priv; err: rdma_destroy_id(id); return NULL; } static void cma_set_req_event_data(struct rdma_cm_event *event, const struct ib_cm_req_event_param *req_data, void *private_data, int offset) { event->param.conn.private_data = private_data + offset; event->param.conn.private_data_len = IB_CM_REQ_PRIVATE_DATA_SIZE - offset; event->param.conn.responder_resources = req_data->responder_resources; event->param.conn.initiator_depth = req_data->initiator_depth; event->param.conn.flow_control = req_data->flow_control; event->param.conn.retry_count = req_data->retry_count; event->param.conn.rnr_retry_count = req_data->rnr_retry_count; event->param.conn.srq = req_data->srq; event->param.conn.qp_num = req_data->remote_qpn; event->ece.vendor_id = req_data->ece.vendor_id; event->ece.attr_mod = req_data->ece.attr_mod; } static int cma_ib_check_req_qp_type(const struct rdma_cm_id *id, const struct ib_cm_event *ib_event) { return (((ib_event->event == IB_CM_REQ_RECEIVED) && (ib_event->param.req_rcvd.qp_type == id->qp_type)) || ((ib_event->event == IB_CM_SIDR_REQ_RECEIVED) && (id->qp_type == IB_QPT_UD)) || (!id->qp_type)); } static int cma_ib_req_handler(struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event) { struct rdma_id_private *listen_id, *conn_id = NULL; struct rdma_cm_event event = {}; struct cma_req_info req = {}; struct net_device *net_dev; u8 offset; int ret; listen_id = cma_ib_id_from_event(cm_id, ib_event, &req, &net_dev); if (IS_ERR(listen_id)) return PTR_ERR(listen_id); trace_cm_req_handler(listen_id, ib_event->event); if (!cma_ib_check_req_qp_type(&listen_id->id, ib_event)) { ret = -EINVAL; goto net_dev_put; } mutex_lock(&listen_id->handler_mutex); if (READ_ONCE(listen_id->state) != RDMA_CM_LISTEN) { ret = -ECONNABORTED; goto err_unlock; } offset = cma_user_data_offset(listen_id); event.event = RDMA_CM_EVENT_CONNECT_REQUEST; if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) { conn_id = cma_ib_new_udp_id(&listen_id->id, ib_event, net_dev); event.param.ud.private_data = ib_event->private_data + offset; event.param.ud.private_data_len = IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset; } else { conn_id = cma_ib_new_conn_id(&listen_id->id, ib_event, net_dev); cma_set_req_event_data(&event, &ib_event->param.req_rcvd, ib_event->private_data, offset); } if (!conn_id) { ret = -ENOMEM; goto err_unlock; } mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); ret = cma_ib_acquire_dev(conn_id, listen_id, &req); if (ret) { destroy_id_handler_unlock(conn_id); goto err_unlock; } conn_id->cm_id.ib = cm_id; cm_id->context = conn_id; cm_id->cm_handler = cma_ib_handler; ret = cma_cm_event_handler(conn_id, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ conn_id->cm_id.ib = NULL; mutex_unlock(&listen_id->handler_mutex); destroy_id_handler_unlock(conn_id); goto net_dev_put; } if (READ_ONCE(conn_id->state) == RDMA_CM_CONNECT && conn_id->id.qp_type != IB_QPT_UD) { trace_cm_send_mra(cm_id->context); ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); } mutex_unlock(&conn_id->handler_mutex); err_unlock: mutex_unlock(&listen_id->handler_mutex); net_dev_put: dev_put(net_dev); return ret; } __be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr) { if (addr->sa_family == AF_IB) return ((struct sockaddr_ib *) addr)->sib_sid; return cpu_to_be64(((u64)id->ps << 16) + be16_to_cpu(cma_port(addr))); } EXPORT_SYMBOL(rdma_get_service_id); void rdma_read_gids(struct rdma_cm_id *cm_id, union ib_gid *sgid, union ib_gid *dgid) { struct rdma_addr *addr = &cm_id->route.addr; if (!cm_id->device) { if (sgid) memset(sgid, 0, sizeof(*sgid)); if (dgid) memset(dgid, 0, sizeof(*dgid)); return; } if (rdma_protocol_roce(cm_id->device, cm_id->port_num)) { if (sgid) rdma_ip2gid((struct sockaddr *)&addr->src_addr, sgid); if (dgid) rdma_ip2gid((struct sockaddr *)&addr->dst_addr, dgid); } else { if (sgid) rdma_addr_get_sgid(&addr->dev_addr, sgid); if (dgid) rdma_addr_get_dgid(&addr->dev_addr, dgid); } } EXPORT_SYMBOL(rdma_read_gids); static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) { struct rdma_id_private *id_priv = iw_id->context; struct rdma_cm_event event = {}; int ret = 0; struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT) goto out; switch (iw_event->event) { case IW_CM_EVENT_CLOSE: event.event = RDMA_CM_EVENT_DISCONNECTED; break; case IW_CM_EVENT_CONNECT_REPLY: memcpy(cma_src_addr(id_priv), laddr, rdma_addr_size(laddr)); memcpy(cma_dst_addr(id_priv), raddr, rdma_addr_size(raddr)); switch (iw_event->status) { case 0: event.event = RDMA_CM_EVENT_ESTABLISHED; event.param.conn.initiator_depth = iw_event->ird; event.param.conn.responder_resources = iw_event->ord; break; case -ECONNRESET: case -ECONNREFUSED: event.event = RDMA_CM_EVENT_REJECTED; break; case -ETIMEDOUT: event.event = RDMA_CM_EVENT_UNREACHABLE; break; default: event.event = RDMA_CM_EVENT_CONNECT_ERROR; break; } break; case IW_CM_EVENT_ESTABLISHED: event.event = RDMA_CM_EVENT_ESTABLISHED; event.param.conn.initiator_depth = iw_event->ird; event.param.conn.responder_resources = iw_event->ord; break; default: goto out; } event.status = iw_event->status; event.param.conn.private_data = iw_event->private_data; event.param.conn.private_data_len = iw_event->private_data_len; ret = cma_cm_event_handler(id_priv, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.iw = NULL; destroy_id_handler_unlock(id_priv); return ret; } out: mutex_unlock(&id_priv->handler_mutex); return ret; } static int iw_conn_req_handler(struct iw_cm_id *cm_id, struct iw_cm_event *iw_event) { struct rdma_id_private *listen_id, *conn_id; struct rdma_cm_event event = {}; int ret = -ECONNABORTED; struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; event.event = RDMA_CM_EVENT_CONNECT_REQUEST; event.param.conn.private_data = iw_event->private_data; event.param.conn.private_data_len = iw_event->private_data_len; event.param.conn.initiator_depth = iw_event->ird; event.param.conn.responder_resources = iw_event->ord; listen_id = cm_id->context; mutex_lock(&listen_id->handler_mutex); if (READ_ONCE(listen_id->state) != RDMA_CM_LISTEN) goto out; /* Create a new RDMA id for the new IW CM ID */ conn_id = __rdma_create_id(listen_id->id.route.addr.dev_addr.net, listen_id->id.event_handler, listen_id->id.context, RDMA_PS_TCP, IB_QPT_RC, listen_id); if (IS_ERR(conn_id)) { ret = -ENOMEM; goto out; } mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); conn_id->state = RDMA_CM_CONNECT; ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr); if (ret) { mutex_unlock(&listen_id->handler_mutex); destroy_id_handler_unlock(conn_id); return ret; } ret = cma_iw_acquire_dev(conn_id, listen_id); if (ret) { mutex_unlock(&listen_id->handler_mutex); destroy_id_handler_unlock(conn_id); return ret; } conn_id->cm_id.iw = cm_id; cm_id->context = conn_id; cm_id->cm_handler = cma_iw_handler; memcpy(cma_src_addr(conn_id), laddr, rdma_addr_size(laddr)); memcpy(cma_dst_addr(conn_id), raddr, rdma_addr_size(raddr)); ret = cma_cm_event_handler(conn_id, &event); if (ret) { /* User wants to destroy the CM ID */ conn_id->cm_id.iw = NULL; mutex_unlock(&listen_id->handler_mutex); destroy_id_handler_unlock(conn_id); return ret; } mutex_unlock(&conn_id->handler_mutex); out: mutex_unlock(&listen_id->handler_mutex); return ret; } static int cma_ib_listen(struct rdma_id_private *id_priv) { struct sockaddr *addr; struct ib_cm_id *id; __be64 svc_id; addr = cma_src_addr(id_priv); svc_id = rdma_get_service_id(&id_priv->id, addr); id = ib_cm_insert_listen(id_priv->id.device, cma_ib_req_handler, svc_id); if (IS_ERR(id)) return PTR_ERR(id); id_priv->cm_id.ib = id; return 0; } static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog) { int ret; struct iw_cm_id *id; id = iw_create_cm_id(id_priv->id.device, iw_conn_req_handler, id_priv); if (IS_ERR(id)) return PTR_ERR(id); mutex_lock(&id_priv->qp_mutex); id->tos = id_priv->tos; id->tos_set = id_priv->tos_set; mutex_unlock(&id_priv->qp_mutex); id->afonly = id_priv->afonly; id_priv->cm_id.iw = id; memcpy(&id_priv->cm_id.iw->local_addr, cma_src_addr(id_priv), rdma_addr_size(cma_src_addr(id_priv))); ret = iw_cm_listen(id_priv->cm_id.iw, backlog); if (ret) { iw_destroy_cm_id(id_priv->cm_id.iw); id_priv->cm_id.iw = NULL; } return ret; } static int cma_listen_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { struct rdma_id_private *id_priv = id->context; /* Listening IDs are always destroyed on removal */ if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) return -1; id->context = id_priv->id.context; id->event_handler = id_priv->id.event_handler; trace_cm_event_handler(id_priv, event); return id_priv->id.event_handler(id, event); } static int cma_listen_on_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev, struct rdma_id_private **to_destroy) { struct rdma_id_private *dev_id_priv; struct net *net = id_priv->id.route.addr.dev_addr.net; int ret; lockdep_assert_held(&lock); *to_destroy = NULL; if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cma_dev->device, 1)) return 0; dev_id_priv = __rdma_create_id(net, cma_listen_handler, id_priv, id_priv->id.ps, id_priv->id.qp_type, id_priv); if (IS_ERR(dev_id_priv)) return PTR_ERR(dev_id_priv); dev_id_priv->state = RDMA_CM_ADDR_BOUND; memcpy(cma_src_addr(dev_id_priv), cma_src_addr(id_priv), rdma_addr_size(cma_src_addr(id_priv))); _cma_attach_to_dev(dev_id_priv, cma_dev); rdma_restrack_add(&dev_id_priv->res); cma_id_get(id_priv); dev_id_priv->internal_id = 1; dev_id_priv->afonly = id_priv->afonly; mutex_lock(&id_priv->qp_mutex); dev_id_priv->tos_set = id_priv->tos_set; dev_id_priv->tos = id_priv->tos; mutex_unlock(&id_priv->qp_mutex); ret = rdma_listen(&dev_id_priv->id, id_priv->backlog); if (ret) goto err_listen; list_add_tail(&dev_id_priv->listen_item, &id_priv->listen_list); return 0; err_listen: /* Caller must destroy this after releasing lock */ *to_destroy = dev_id_priv; dev_warn(&cma_dev->device->dev, "RDMA CMA: %s, error %d\n", __func__, ret); return ret; } static int cma_listen_on_all(struct rdma_id_private *id_priv) { struct rdma_id_private *to_destroy; struct cma_device *cma_dev; int ret; mutex_lock(&lock); list_add_tail(&id_priv->listen_any_item, &listen_any_list); list_for_each_entry(cma_dev, &dev_list, list) { ret = cma_listen_on_dev(id_priv, cma_dev, &to_destroy); if (ret) { /* Prevent racing with cma_process_remove() */ if (to_destroy) list_del_init(&to_destroy->device_item); goto err_listen; } } mutex_unlock(&lock); return 0; err_listen: _cma_cancel_listens(id_priv); mutex_unlock(&lock); if (to_destroy) rdma_destroy_id(&to_destroy->id); return ret; } void rdma_set_service_type(struct rdma_cm_id *id, int tos) { struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->qp_mutex); id_priv->tos = (u8) tos; id_priv->tos_set = true; mutex_unlock(&id_priv->qp_mutex); } EXPORT_SYMBOL(rdma_set_service_type); /** * rdma_set_ack_timeout() - Set the ack timeout of QP associated * with a connection identifier. * @id: Communication identifier to associated with service type. * @timeout: Ack timeout to set a QP, expressed as 4.096 * 2^(timeout) usec. * * This function should be called before rdma_connect() on active side, * and on passive side before rdma_accept(). It is applicable to primary * path only. The timeout will affect the local side of the QP, it is not * negotiated with remote side and zero disables the timer. In case it is * set before rdma_resolve_route, the value will also be used to determine * PacketLifeTime for RoCE. * * Return: 0 for success */ int rdma_set_ack_timeout(struct rdma_cm_id *id, u8 timeout) { struct rdma_id_private *id_priv; if (id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_INI) return -EINVAL; id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->qp_mutex); id_priv->timeout = timeout; id_priv->timeout_set = true; mutex_unlock(&id_priv->qp_mutex); return 0; } EXPORT_SYMBOL(rdma_set_ack_timeout); /** * rdma_set_min_rnr_timer() - Set the minimum RNR Retry timer of the * QP associated with a connection identifier. * @id: Communication identifier to associated with service type. * @min_rnr_timer: 5-bit value encoded as Table 45: "Encoding for RNR NAK * Timer Field" in the IBTA specification. * * This function should be called before rdma_connect() on active * side, and on passive side before rdma_accept(). The timer value * will be associated with the local QP. When it receives a send it is * not read to handle, typically if the receive queue is empty, an RNR * Retry NAK is returned to the requester with the min_rnr_timer * encoded. The requester will then wait at least the time specified * in the NAK before retrying. The default is zero, which translates * to a minimum RNR Timer value of 655 ms. * * Return: 0 for success */ int rdma_set_min_rnr_timer(struct rdma_cm_id *id, u8 min_rnr_timer) { struct rdma_id_private *id_priv; /* It is a five-bit value */ if (min_rnr_timer & 0xe0) return -EINVAL; if (WARN_ON(id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_TGT)) return -EINVAL; id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->qp_mutex); id_priv->min_rnr_timer = min_rnr_timer; id_priv->min_rnr_timer_set = true; mutex_unlock(&id_priv->qp_mutex); return 0; } EXPORT_SYMBOL(rdma_set_min_rnr_timer); static int route_set_path_rec_inbound(struct cma_work *work, struct sa_path_rec *path_rec) { struct rdma_route *route = &work->id->id.route; if (!route->path_rec_inbound) { route->path_rec_inbound = kzalloc(sizeof(*route->path_rec_inbound), GFP_KERNEL); if (!route->path_rec_inbound) return -ENOMEM; } *route->path_rec_inbound = *path_rec; return 0; } static int route_set_path_rec_outbound(struct cma_work *work, struct sa_path_rec *path_rec) { struct rdma_route *route = &work->id->id.route; if (!route->path_rec_outbound) { route->path_rec_outbound = kzalloc(sizeof(*route->path_rec_outbound), GFP_KERNEL); if (!route->path_rec_outbound) return -ENOMEM; } *route->path_rec_outbound = *path_rec; return 0; } static void cma_query_handler(int status, struct sa_path_rec *path_rec, unsigned int num_prs, void *context) { struct cma_work *work = context; struct rdma_route *route; int i; route = &work->id->id.route; if (status) goto fail; for (i = 0; i < num_prs; i++) { if (!path_rec[i].flags || (path_rec[i].flags & IB_PATH_GMP)) *route->path_rec = path_rec[i]; else if (path_rec[i].flags & IB_PATH_INBOUND) status = route_set_path_rec_inbound(work, &path_rec[i]); else if (path_rec[i].flags & IB_PATH_OUTBOUND) status = route_set_path_rec_outbound(work, &path_rec[i]); else status = -EINVAL; if (status) goto fail; } route->num_pri_alt_paths = 1; queue_work(cma_wq, &work->work); return; fail: work->old_state = RDMA_CM_ROUTE_QUERY; work->new_state = RDMA_CM_ADDR_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_ERROR; work->event.status = status; pr_debug_ratelimited("RDMA CM: ROUTE_ERROR: failed to query path. status %d\n", status); queue_work(cma_wq, &work->work); } static int cma_query_ib_route(struct rdma_id_private *id_priv, unsigned long timeout_ms, struct cma_work *work) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct sa_path_rec path_rec; ib_sa_comp_mask comp_mask; struct sockaddr_in6 *sin6; struct sockaddr_ib *sib; memset(&path_rec, 0, sizeof path_rec); if (rdma_cap_opa_ah(id_priv->id.device, id_priv->id.port_num)) path_rec.rec_type = SA_PATH_REC_TYPE_OPA; else path_rec.rec_type = SA_PATH_REC_TYPE_IB; rdma_addr_get_sgid(dev_addr, &path_rec.sgid); rdma_addr_get_dgid(dev_addr, &path_rec.dgid); path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); path_rec.numb_path = 1; path_rec.reversible = 1; path_rec.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH | IB_SA_PATH_REC_REVERSIBLE | IB_SA_PATH_REC_SERVICE_ID; switch (cma_family(id_priv)) { case AF_INET: path_rec.qos_class = cpu_to_be16((u16) id_priv->tos); comp_mask |= IB_SA_PATH_REC_QOS_CLASS; break; case AF_INET6: sin6 = (struct sockaddr_in6 *) cma_src_addr(id_priv); path_rec.traffic_class = (u8) (be32_to_cpu(sin6->sin6_flowinfo) >> 20); comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS; break; case AF_IB: sib = (struct sockaddr_ib *) cma_src_addr(id_priv); path_rec.traffic_class = (u8) (be32_to_cpu(sib->sib_flowinfo) >> 20); comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS; break; } id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device, id_priv->id.port_num, &path_rec, comp_mask, timeout_ms, GFP_KERNEL, cma_query_handler, work, &id_priv->query); return (id_priv->query_id < 0) ? id_priv->query_id : 0; } static void cma_iboe_join_work_handler(struct work_struct *work) { struct cma_multicast *mc = container_of(work, struct cma_multicast, iboe_join.work); struct rdma_cm_event *event = &mc->iboe_join.event; struct rdma_id_private *id_priv = mc->id_priv; int ret; mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING || READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL) goto out_unlock; ret = cma_cm_event_handler(id_priv, event); WARN_ON(ret); out_unlock: mutex_unlock(&id_priv->handler_mutex); if (event->event == RDMA_CM_EVENT_MULTICAST_JOIN) rdma_destroy_ah_attr(&event->param.ud.ah_attr); } static void cma_work_handler(struct work_struct *_work) { struct cma_work *work = container_of(_work, struct cma_work, work); struct rdma_id_private *id_priv = work->id; mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING || READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL) goto out_unlock; if (work->old_state != 0 || work->new_state != 0) { if (!cma_comp_exch(id_priv, work->old_state, work->new_state)) goto out_unlock; } if (cma_cm_event_handler(id_priv, &work->event)) { cma_id_put(id_priv); destroy_id_handler_unlock(id_priv); goto out_free; } out_unlock: mutex_unlock(&id_priv->handler_mutex); cma_id_put(id_priv); out_free: if (work->event.event == RDMA_CM_EVENT_MULTICAST_JOIN) rdma_destroy_ah_attr(&work->event.param.ud.ah_attr); kfree(work); } static void cma_init_resolve_route_work(struct cma_work *work, struct rdma_id_private *id_priv) { work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state = RDMA_CM_ROUTE_QUERY; work->new_state = RDMA_CM_ROUTE_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; } static void enqueue_resolve_addr_work(struct cma_work *work, struct rdma_id_private *id_priv) { /* Balances with cma_id_put() in cma_work_handler */ cma_id_get(id_priv); work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state = RDMA_CM_ADDR_QUERY; work->new_state = RDMA_CM_ADDR_RESOLVED; work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; queue_work(cma_wq, &work->work); } static int cma_resolve_ib_route(struct rdma_id_private *id_priv, unsigned long timeout_ms) { struct rdma_route *route = &id_priv->id.route; struct cma_work *work; int ret; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; cma_init_resolve_route_work(work, id_priv); if (!route->path_rec) route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL); if (!route->path_rec) { ret = -ENOMEM; goto err1; } ret = cma_query_ib_route(id_priv, timeout_ms, work); if (ret) goto err2; return 0; err2: kfree(route->path_rec); route->path_rec = NULL; err1: kfree(work); return ret; } static enum ib_gid_type cma_route_gid_type(enum rdma_network_type network_type, unsigned long supported_gids, enum ib_gid_type default_gid) { if ((network_type == RDMA_NETWORK_IPV4 || network_type == RDMA_NETWORK_IPV6) && test_bit(IB_GID_TYPE_ROCE_UDP_ENCAP, &supported_gids)) return IB_GID_TYPE_ROCE_UDP_ENCAP; return default_gid; } /* * cma_iboe_set_path_rec_l2_fields() is helper function which sets * path record type based on GID type. * It also sets up other L2 fields which includes destination mac address * netdev ifindex, of the path record. * It returns the netdev of the bound interface for this path record entry. */ static struct net_device * cma_iboe_set_path_rec_l2_fields(struct rdma_id_private *id_priv) { struct rdma_route *route = &id_priv->id.route; enum ib_gid_type gid_type = IB_GID_TYPE_ROCE; struct rdma_addr *addr = &route->addr; unsigned long supported_gids; struct net_device *ndev; if (!addr->dev_addr.bound_dev_if) return NULL; ndev = dev_get_by_index(addr->dev_addr.net, addr->dev_addr.bound_dev_if); if (!ndev) return NULL; supported_gids = roce_gid_type_mask_support(id_priv->id.device, id_priv->id.port_num); gid_type = cma_route_gid_type(addr->dev_addr.network, supported_gids, id_priv->gid_type); /* Use the hint from IP Stack to select GID Type */ if (gid_type < ib_network_to_gid_type(addr->dev_addr.network)) gid_type = ib_network_to_gid_type(addr->dev_addr.network); route->path_rec->rec_type = sa_conv_gid_to_pathrec_type(gid_type); route->path_rec->roce.route_resolved = true; sa_path_set_dmac(route->path_rec, addr->dev_addr.dst_dev_addr); return ndev; } int rdma_set_ib_path(struct rdma_cm_id *id, struct sa_path_rec *path_rec) { struct rdma_id_private *id_priv; struct net_device *ndev; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_RESOLVED)) return -EINVAL; id->route.path_rec = kmemdup(path_rec, sizeof(*path_rec), GFP_KERNEL); if (!id->route.path_rec) { ret = -ENOMEM; goto err; } if (rdma_protocol_roce(id->device, id->port_num)) { ndev = cma_iboe_set_path_rec_l2_fields(id_priv); if (!ndev) { ret = -ENODEV; goto err_free; } dev_put(ndev); } id->route.num_pri_alt_paths = 1; return 0; err_free: kfree(id->route.path_rec); id->route.path_rec = NULL; err: cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_ADDR_RESOLVED); return ret; } EXPORT_SYMBOL(rdma_set_ib_path); static int cma_resolve_iw_route(struct rdma_id_private *id_priv) { struct cma_work *work; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; cma_init_resolve_route_work(work, id_priv); queue_work(cma_wq, &work->work); return 0; } static int get_vlan_ndev_tc(struct net_device *vlan_ndev, int prio) { struct net_device *dev; dev = vlan_dev_real_dev(vlan_ndev); if (dev->num_tc) return netdev_get_prio_tc_map(dev, prio); return (vlan_dev_get_egress_qos_mask(vlan_ndev, prio) & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; } struct iboe_prio_tc_map { int input_prio; int output_tc; bool found; }; static int get_lower_vlan_dev_tc(struct net_device *dev, struct netdev_nested_priv *priv) { struct iboe_prio_tc_map *map = (struct iboe_prio_tc_map *)priv->data; if (is_vlan_dev(dev)) map->output_tc = get_vlan_ndev_tc(dev, map->input_prio); else if (dev->num_tc) map->output_tc = netdev_get_prio_tc_map(dev, map->input_prio); else map->output_tc = 0; /* We are interested only in first level VLAN device, so always * return 1 to stop iterating over next level devices. */ map->found = true; return 1; } static int iboe_tos_to_sl(struct net_device *ndev, int tos) { struct iboe_prio_tc_map prio_tc_map = {}; int prio = rt_tos2priority(tos); struct netdev_nested_priv priv; /* If VLAN device, get it directly from the VLAN netdev */ if (is_vlan_dev(ndev)) return get_vlan_ndev_tc(ndev, prio); prio_tc_map.input_prio = prio; priv.data = (void *)&prio_tc_map; rcu_read_lock(); netdev_walk_all_lower_dev_rcu(ndev, get_lower_vlan_dev_tc, &priv); rcu_read_unlock(); /* If map is found from lower device, use it; Otherwise * continue with the current netdevice to get priority to tc map. */ if (prio_tc_map.found) return prio_tc_map.output_tc; else if (ndev->num_tc) return netdev_get_prio_tc_map(ndev, prio); else return 0; } static __be32 cma_get_roce_udp_flow_label(struct rdma_id_private *id_priv) { struct sockaddr_in6 *addr6; u16 dport, sport; u32 hash, fl; addr6 = (struct sockaddr_in6 *)cma_src_addr(id_priv); fl = be32_to_cpu(addr6->sin6_flowinfo) & IB_GRH_FLOWLABEL_MASK; if ((cma_family(id_priv) != AF_INET6) || !fl) { dport = be16_to_cpu(cma_port(cma_dst_addr(id_priv))); sport = be16_to_cpu(cma_port(cma_src_addr(id_priv))); hash = (u32)sport * 31 + dport; fl = hash & IB_GRH_FLOWLABEL_MASK; } return cpu_to_be32(fl); } static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) { struct rdma_route *route = &id_priv->id.route; struct rdma_addr *addr = &route->addr; struct cma_work *work; int ret; struct net_device *ndev; u8 default_roce_tos = id_priv->cma_dev->default_roce_tos[id_priv->id.port_num - rdma_start_port(id_priv->cma_dev->device)]; u8 tos; mutex_lock(&id_priv->qp_mutex); tos = id_priv->tos_set ? id_priv->tos : default_roce_tos; mutex_unlock(&id_priv->qp_mutex); work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; route->path_rec = kzalloc(sizeof *route->path_rec, GFP_KERNEL); if (!route->path_rec) { ret = -ENOMEM; goto err1; } route->num_pri_alt_paths = 1; ndev = cma_iboe_set_path_rec_l2_fields(id_priv); if (!ndev) { ret = -ENODEV; goto err2; } rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &route->path_rec->sgid); rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr, &route->path_rec->dgid); if (((struct sockaddr *)&id_priv->id.route.addr.dst_addr)->sa_family != AF_IB) /* TODO: get the hoplimit from the inet/inet6 device */ route->path_rec->hop_limit = addr->dev_addr.hoplimit; else route->path_rec->hop_limit = 1; route->path_rec->reversible = 1; route->path_rec->pkey = cpu_to_be16(0xffff); route->path_rec->mtu_selector = IB_SA_EQ; route->path_rec->sl = iboe_tos_to_sl(ndev, tos); route->path_rec->traffic_class = tos; route->path_rec->mtu = iboe_get_mtu(ndev->mtu); route->path_rec->rate_selector = IB_SA_EQ; route->path_rec->rate = IB_RATE_PORT_CURRENT; dev_put(ndev); route->path_rec->packet_life_time_selector = IB_SA_EQ; /* In case ACK timeout is set, use this value to calculate * PacketLifeTime. As per IBTA 12.7.34, * local ACK timeout = (2 * PacketLifeTime + Local CA’s ACK delay). * Assuming a negligible local ACK delay, we can use * PacketLifeTime = local ACK timeout/2 * as a reasonable approximation for RoCE networks. */ mutex_lock(&id_priv->qp_mutex); if (id_priv->timeout_set && id_priv->timeout) route->path_rec->packet_life_time = id_priv->timeout - 1; else route->path_rec->packet_life_time = CMA_IBOE_PACKET_LIFETIME; mutex_unlock(&id_priv->qp_mutex); if (!route->path_rec->mtu) { ret = -EINVAL; goto err2; } if (rdma_protocol_roce_udp_encap(id_priv->id.device, id_priv->id.port_num)) route->path_rec->flow_label = cma_get_roce_udp_flow_label(id_priv); cma_init_resolve_route_work(work, id_priv); queue_work(cma_wq, &work->work); return 0; err2: kfree(route->path_rec); route->path_rec = NULL; route->num_pri_alt_paths = 0; err1: kfree(work); return ret; } int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms) { struct rdma_id_private *id_priv; int ret; if (!timeout_ms) return -EINVAL; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_QUERY)) return -EINVAL; cma_id_get(id_priv); if (rdma_cap_ib_sa(id->device, id->port_num)) ret = cma_resolve_ib_route(id_priv, timeout_ms); else if (rdma_protocol_roce(id->device, id->port_num)) { ret = cma_resolve_iboe_route(id_priv); if (!ret) cma_add_id_to_tree(id_priv); } else if (rdma_protocol_iwarp(id->device, id->port_num)) ret = cma_resolve_iw_route(id_priv); else ret = -ENOSYS; if (ret) goto err; return 0; err: cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, RDMA_CM_ADDR_RESOLVED); cma_id_put(id_priv); return ret; } EXPORT_SYMBOL(rdma_resolve_route); static void cma_set_loopback(struct sockaddr *addr) { switch (addr->sa_family) { case AF_INET: ((struct sockaddr_in *) addr)->sin_addr.s_addr = htonl(INADDR_LOOPBACK); break; case AF_INET6: ipv6_addr_set(&((struct sockaddr_in6 *) addr)->sin6_addr, 0, 0, 0, htonl(1)); break; default: ib_addr_set(&((struct sockaddr_ib *) addr)->sib_addr, 0, 0, 0, htonl(1)); break; } } static int cma_bind_loopback(struct rdma_id_private *id_priv) { struct cma_device *cma_dev, *cur_dev; union ib_gid gid; enum ib_port_state port_state; unsigned int p; u16 pkey; int ret; cma_dev = NULL; mutex_lock(&lock); list_for_each_entry(cur_dev, &dev_list, list) { if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cur_dev->device, 1)) continue; if (!cma_dev) cma_dev = cur_dev; rdma_for_each_port (cur_dev->device, p) { if (!ib_get_cached_port_state(cur_dev->device, p, &port_state) && port_state == IB_PORT_ACTIVE) { cma_dev = cur_dev; goto port_found; } } } if (!cma_dev) { ret = -ENODEV; goto out; } p = 1; port_found: ret = rdma_query_gid(cma_dev->device, p, 0, &gid); if (ret) goto out; ret = ib_get_cached_pkey(cma_dev->device, p, 0, &pkey); if (ret) goto out; id_priv->id.route.addr.dev_addr.dev_type = (rdma_protocol_ib(cma_dev->device, p)) ? ARPHRD_INFINIBAND : ARPHRD_ETHER; rdma_addr_set_sgid(&id_priv->id.route.addr.dev_addr, &gid); ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey); id_priv->id.port_num = p; cma_attach_to_dev(id_priv, cma_dev); rdma_restrack_add(&id_priv->res); cma_set_loopback(cma_src_addr(id_priv)); out: mutex_unlock(&lock); return ret; } static void addr_handler(int status, struct sockaddr *src_addr, struct rdma_dev_addr *dev_addr, void *context) { struct rdma_id_private *id_priv = context; struct rdma_cm_event event = {}; struct sockaddr *addr; struct sockaddr_storage old_addr; mutex_lock(&id_priv->handler_mutex); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_RESOLVED)) goto out; /* * Store the previous src address, so that if we fail to acquire * matching rdma device, old address can be restored back, which helps * to cancel the cma listen operation correctly. */ addr = cma_src_addr(id_priv); memcpy(&old_addr, addr, rdma_addr_size(addr)); memcpy(addr, src_addr, rdma_addr_size(src_addr)); if (!status && !id_priv->cma_dev) { status = cma_acquire_dev_by_src_ip(id_priv); if (status) pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to acquire device. status %d\n", status); rdma_restrack_add(&id_priv->res); } else if (status) { pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to resolve IP. status %d\n", status); } if (status) { memcpy(addr, &old_addr, rdma_addr_size((struct sockaddr *)&old_addr)); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ADDR_BOUND)) goto out; event.event = RDMA_CM_EVENT_ADDR_ERROR; event.status = status; } else event.event = RDMA_CM_EVENT_ADDR_RESOLVED; if (cma_cm_event_handler(id_priv, &event)) { destroy_id_handler_unlock(id_priv); return; } out: mutex_unlock(&id_priv->handler_mutex); } static int cma_resolve_loopback(struct rdma_id_private *id_priv) { struct cma_work *work; union ib_gid gid; int ret; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; if (!id_priv->cma_dev) { ret = cma_bind_loopback(id_priv); if (ret) goto err; } rdma_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid); rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid); enqueue_resolve_addr_work(work, id_priv); return 0; err: kfree(work); return ret; } static int cma_resolve_ib_addr(struct rdma_id_private *id_priv) { struct cma_work *work; int ret; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; if (!id_priv->cma_dev) { ret = cma_resolve_ib_dev(id_priv); if (ret) goto err; } rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, (union ib_gid *) &(((struct sockaddr_ib *) &id_priv->id.route.addr.dst_addr)->sib_addr)); enqueue_resolve_addr_work(work, id_priv); return 0; err: kfree(work); return ret; } int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse) { struct rdma_id_private *id_priv; unsigned long flags; int ret; id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irqsave(&id_priv->lock, flags); if ((reuse && id_priv->state != RDMA_CM_LISTEN) || id_priv->state == RDMA_CM_IDLE) { id_priv->reuseaddr = reuse; ret = 0; } else { ret = -EINVAL; } spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } EXPORT_SYMBOL(rdma_set_reuseaddr); int rdma_set_afonly(struct rdma_cm_id *id, int afonly) { struct rdma_id_private *id_priv; unsigned long flags; int ret; id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irqsave(&id_priv->lock, flags); if (id_priv->state == RDMA_CM_IDLE || id_priv->state == RDMA_CM_ADDR_BOUND) { id_priv->options |= (1 << CMA_OPTION_AFONLY); id_priv->afonly = afonly; ret = 0; } else { ret = -EINVAL; } spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } EXPORT_SYMBOL(rdma_set_afonly); static void cma_bind_port(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv) { struct sockaddr *addr; struct sockaddr_ib *sib; u64 sid, mask; __be16 port; lockdep_assert_held(&lock); addr = cma_src_addr(id_priv); port = htons(bind_list->port); switch (addr->sa_family) { case AF_INET: ((struct sockaddr_in *) addr)->sin_port = port; break; case AF_INET6: ((struct sockaddr_in6 *) addr)->sin6_port = port; break; case AF_IB: sib = (struct sockaddr_ib *) addr; sid = be64_to_cpu(sib->sib_sid); mask = be64_to_cpu(sib->sib_sid_mask); sib->sib_sid = cpu_to_be64((sid & mask) | (u64) ntohs(port)); sib->sib_sid_mask = cpu_to_be64(~0ULL); break; } id_priv->bind_list = bind_list; hlist_add_head(&id_priv->node, &bind_list->owners); } static int cma_alloc_port(enum rdma_ucm_port_space ps, struct rdma_id_private *id_priv, unsigned short snum) { struct rdma_bind_list *bind_list; int ret; lockdep_assert_held(&lock); bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL); if (!bind_list) return -ENOMEM; ret = cma_ps_alloc(id_priv->id.route.addr.dev_addr.net, ps, bind_list, snum); if (ret < 0) goto err; bind_list->ps = ps; bind_list->port = snum; cma_bind_port(bind_list, id_priv); return 0; err: kfree(bind_list); return ret == -ENOSPC ? -EADDRNOTAVAIL : ret; } static int cma_port_is_unique(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv) { struct rdma_id_private *cur_id; struct sockaddr *daddr = cma_dst_addr(id_priv); struct sockaddr *saddr = cma_src_addr(id_priv); __be16 dport = cma_port(daddr); lockdep_assert_held(&lock); hlist_for_each_entry(cur_id, &bind_list->owners, node) { struct sockaddr *cur_daddr = cma_dst_addr(cur_id); struct sockaddr *cur_saddr = cma_src_addr(cur_id); __be16 cur_dport = cma_port(cur_daddr); if (id_priv == cur_id) continue; /* different dest port -> unique */ if (!cma_any_port(daddr) && !cma_any_port(cur_daddr) && (dport != cur_dport)) continue; /* different src address -> unique */ if (!cma_any_addr(saddr) && !cma_any_addr(cur_saddr) && cma_addr_cmp(saddr, cur_saddr)) continue; /* different dst address -> unique */ if (!cma_any_addr(daddr) && !cma_any_addr(cur_daddr) && cma_addr_cmp(daddr, cur_daddr)) continue; return -EADDRNOTAVAIL; } return 0; } static int cma_alloc_any_port(enum rdma_ucm_port_space ps, struct rdma_id_private *id_priv) { static unsigned int last_used_port; int low, high, remaining; unsigned int rover; struct net *net = id_priv->id.route.addr.dev_addr.net; lockdep_assert_held(&lock); inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; rover = get_random_u32_inclusive(low, remaining + low - 1); retry: if (last_used_port != rover) { struct rdma_bind_list *bind_list; int ret; bind_list = cma_ps_find(net, ps, (unsigned short)rover); if (!bind_list) { ret = cma_alloc_port(ps, id_priv, rover); } else { ret = cma_port_is_unique(bind_list, id_priv); if (!ret) cma_bind_port(bind_list, id_priv); } /* * Remember previously used port number in order to avoid * re-using same port immediately after it is closed. */ if (!ret) last_used_port = rover; if (ret != -EADDRNOTAVAIL) return ret; } if (--remaining) { rover++; if ((rover < low) || (rover > high)) rover = low; goto retry; } return -EADDRNOTAVAIL; } /* * Check that the requested port is available. This is called when trying to * bind to a specific port, or when trying to listen on a bound port. In * the latter case, the provided id_priv may already be on the bind_list, but * we still need to check that it's okay to start listening. */ static int cma_check_port(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv, uint8_t reuseaddr) { struct rdma_id_private *cur_id; struct sockaddr *addr, *cur_addr; lockdep_assert_held(&lock); addr = cma_src_addr(id_priv); hlist_for_each_entry(cur_id, &bind_list->owners, node) { if (id_priv == cur_id) continue; if (reuseaddr && cur_id->reuseaddr) continue; cur_addr = cma_src_addr(cur_id); if (id_priv->afonly && cur_id->afonly && (addr->sa_family != cur_addr->sa_family)) continue; if (cma_any_addr(addr) || cma_any_addr(cur_addr)) return -EADDRNOTAVAIL; if (!cma_addr_cmp(addr, cur_addr)) return -EADDRINUSE; } return 0; } static int cma_use_port(enum rdma_ucm_port_space ps, struct rdma_id_private *id_priv) { struct rdma_bind_list *bind_list; unsigned short snum; int ret; lockdep_assert_held(&lock); snum = ntohs(cma_port(cma_src_addr(id_priv))); if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) return -EACCES; bind_list = cma_ps_find(id_priv->id.route.addr.dev_addr.net, ps, snum); if (!bind_list) { ret = cma_alloc_port(ps, id_priv, snum); } else { ret = cma_check_port(bind_list, id_priv, id_priv->reuseaddr); if (!ret) cma_bind_port(bind_list, id_priv); } return ret; } static enum rdma_ucm_port_space cma_select_inet_ps(struct rdma_id_private *id_priv) { switch (id_priv->id.ps) { case RDMA_PS_TCP: case RDMA_PS_UDP: case RDMA_PS_IPOIB: case RDMA_PS_IB: return id_priv->id.ps; default: return 0; } } static enum rdma_ucm_port_space cma_select_ib_ps(struct rdma_id_private *id_priv) { enum rdma_ucm_port_space ps = 0; struct sockaddr_ib *sib; u64 sid_ps, mask, sid; sib = (struct sockaddr_ib *) cma_src_addr(id_priv); mask = be64_to_cpu(sib->sib_sid_mask) & RDMA_IB_IP_PS_MASK; sid = be64_to_cpu(sib->sib_sid) & mask; if ((id_priv->id.ps == RDMA_PS_IB) && (sid == (RDMA_IB_IP_PS_IB & mask))) { sid_ps = RDMA_IB_IP_PS_IB; ps = RDMA_PS_IB; } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_TCP)) && (sid == (RDMA_IB_IP_PS_TCP & mask))) { sid_ps = RDMA_IB_IP_PS_TCP; ps = RDMA_PS_TCP; } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_UDP)) && (sid == (RDMA_IB_IP_PS_UDP & mask))) { sid_ps = RDMA_IB_IP_PS_UDP; ps = RDMA_PS_UDP; } if (ps) { sib->sib_sid = cpu_to_be64(sid_ps | ntohs(cma_port((struct sockaddr *) sib))); sib->sib_sid_mask = cpu_to_be64(RDMA_IB_IP_PS_MASK | be64_to_cpu(sib->sib_sid_mask)); } return ps; } static int cma_get_port(struct rdma_id_private *id_priv) { enum rdma_ucm_port_space ps; int ret; if (cma_family(id_priv) != AF_IB) ps = cma_select_inet_ps(id_priv); else ps = cma_select_ib_ps(id_priv); if (!ps) return -EPROTONOSUPPORT; mutex_lock(&lock); if (cma_any_port(cma_src_addr(id_priv))) ret = cma_alloc_any_port(ps, id_priv); else ret = cma_use_port(ps, id_priv); mutex_unlock(&lock); return ret; } static int cma_check_linklocal(struct rdma_dev_addr *dev_addr, struct sockaddr *addr) { #if IS_ENABLED(CONFIG_IPV6) struct sockaddr_in6 *sin6; if (addr->sa_family != AF_INET6) return 0; sin6 = (struct sockaddr_in6 *) addr; if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)) return 0; if (!sin6->sin6_scope_id) return -EINVAL; dev_addr->bound_dev_if = sin6->sin6_scope_id; #endif return 0; } int rdma_listen(struct rdma_cm_id *id, int backlog) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); int ret; if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN)) { struct sockaddr_in any_in = { .sin_family = AF_INET, .sin_addr.s_addr = htonl(INADDR_ANY), }; /* For a well behaved ULP state will be RDMA_CM_IDLE */ ret = rdma_bind_addr(id, (struct sockaddr *)&any_in); if (ret) return ret; if (WARN_ON(!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN))) return -EINVAL; } /* * Once the ID reaches RDMA_CM_LISTEN it is not allowed to be reusable * any more, and has to be unique in the bind list. */ if (id_priv->reuseaddr) { mutex_lock(&lock); ret = cma_check_port(id_priv->bind_list, id_priv, 0); if (!ret) id_priv->reuseaddr = 0; mutex_unlock(&lock); if (ret) goto err; } id_priv->backlog = backlog; if (id_priv->cma_dev) { if (rdma_cap_ib_cm(id->device, 1)) { ret = cma_ib_listen(id_priv); if (ret) goto err; } else if (rdma_cap_iw_cm(id->device, 1)) { ret = cma_iw_listen(id_priv, backlog); if (ret) goto err; } else { ret = -ENOSYS; goto err; } } else { ret = cma_listen_on_all(id_priv); if (ret) goto err; } return 0; err: id_priv->backlog = 0; /* * All the failure paths that lead here will not allow the req_handler's * to have run. */ cma_comp_exch(id_priv, RDMA_CM_LISTEN, RDMA_CM_ADDR_BOUND); return ret; } EXPORT_SYMBOL(rdma_listen); static int rdma_bind_addr_dst(struct rdma_id_private *id_priv, struct sockaddr *addr, const struct sockaddr *daddr) { struct sockaddr *id_daddr; int ret; if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6 && addr->sa_family != AF_IB) return -EAFNOSUPPORT; if (!cma_comp_exch(id_priv, RDMA_CM_IDLE, RDMA_CM_ADDR_BOUND)) return -EINVAL; ret = cma_check_linklocal(&id_priv->id.route.addr.dev_addr, addr); if (ret) goto err1; memcpy(cma_src_addr(id_priv), addr, rdma_addr_size(addr)); if (!cma_any_addr(addr)) { ret = cma_translate_addr(addr, &id_priv->id.route.addr.dev_addr); if (ret) goto err1; ret = cma_acquire_dev_by_src_ip(id_priv); if (ret) goto err1; } if (!(id_priv->options & (1 << CMA_OPTION_AFONLY))) { if (addr->sa_family == AF_INET) id_priv->afonly = 1; #if IS_ENABLED(CONFIG_IPV6) else if (addr->sa_family == AF_INET6) { struct net *net = id_priv->id.route.addr.dev_addr.net; id_priv->afonly = net->ipv6.sysctl.bindv6only; } #endif } id_daddr = cma_dst_addr(id_priv); if (daddr != id_daddr) memcpy(id_daddr, daddr, rdma_addr_size(addr)); id_daddr->sa_family = addr->sa_family; ret = cma_get_port(id_priv); if (ret) goto err2; if (!cma_any_addr(addr)) rdma_restrack_add(&id_priv->res); return 0; err2: if (id_priv->cma_dev) cma_release_dev(id_priv); err1: cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE); return ret; } static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, const struct sockaddr *dst_addr) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); struct sockaddr_storage zero_sock = {}; if (src_addr && src_addr->sa_family) return rdma_bind_addr_dst(id_priv, src_addr, dst_addr); /* * When the src_addr is not specified, automatically supply an any addr */ zero_sock.ss_family = dst_addr->sa_family; if (IS_ENABLED(CONFIG_IPV6) && dst_addr->sa_family == AF_INET6) { struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *)&zero_sock; struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *)dst_addr; src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id; if (ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL) id->route.addr.dev_addr.bound_dev_if = dst_addr6->sin6_scope_id; } else if (dst_addr->sa_family == AF_IB) { ((struct sockaddr_ib *)&zero_sock)->sib_pkey = ((struct sockaddr_ib *)dst_addr)->sib_pkey; } return rdma_bind_addr_dst(id_priv, (struct sockaddr *)&zero_sock, dst_addr); } /* * If required, resolve the source address for bind and leave the id_priv in * state RDMA_CM_ADDR_BOUND. This oddly uses the state to determine the prior * calls made by ULP, a previously bound ID will not be re-bound and src_addr is * ignored. */ static int resolve_prepare_src(struct rdma_id_private *id_priv, struct sockaddr *src_addr, const struct sockaddr *dst_addr) { int ret; if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) { /* For a well behaved ULP state will be RDMA_CM_IDLE */ ret = cma_bind_addr(&id_priv->id, src_addr, dst_addr); if (ret) return ret; if (WARN_ON(!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY))) return -EINVAL; } else { memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr)); } if (cma_family(id_priv) != dst_addr->sa_family) { ret = -EINVAL; goto err_state; } return 0; err_state: cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); return ret; } int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, const struct sockaddr *dst_addr, unsigned long timeout_ms) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); int ret; ret = resolve_prepare_src(id_priv, src_addr, dst_addr); if (ret) return ret; if (cma_any_addr(dst_addr)) { ret = cma_resolve_loopback(id_priv); } else { if (dst_addr->sa_family == AF_IB) { ret = cma_resolve_ib_addr(id_priv); } else { /* * The FSM can return back to RDMA_CM_ADDR_BOUND after * rdma_resolve_ip() is called, eg through the error * path in addr_handler(). If this happens the existing * request must be canceled before issuing a new one. * Since canceling a request is a bit slow and this * oddball path is rare, keep track once a request has * been issued. The track turns out to be a permanent * state since this is the only cancel as it is * immediately before rdma_resolve_ip(). */ if (id_priv->used_resolve_ip) rdma_addr_cancel(&id->route.addr.dev_addr); else id_priv->used_resolve_ip = 1; ret = rdma_resolve_ip(cma_src_addr(id_priv), dst_addr, &id->route.addr.dev_addr, timeout_ms, addr_handler, false, id_priv); } } if (ret) goto err; return 0; err: cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); return ret; } EXPORT_SYMBOL(rdma_resolve_addr); int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); return rdma_bind_addr_dst(id_priv, addr, cma_dst_addr(id_priv)); } EXPORT_SYMBOL(rdma_bind_addr); static int cma_format_hdr(void *hdr, struct rdma_id_private *id_priv) { struct cma_hdr *cma_hdr; cma_hdr = hdr; cma_hdr->cma_version = CMA_VERSION; if (cma_family(id_priv) == AF_INET) { struct sockaddr_in *src4, *dst4; src4 = (struct sockaddr_in *) cma_src_addr(id_priv); dst4 = (struct sockaddr_in *) cma_dst_addr(id_priv); cma_set_ip_ver(cma_hdr, 4); cma_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr; cma_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr; cma_hdr->port = src4->sin_port; } else if (cma_family(id_priv) == AF_INET6) { struct sockaddr_in6 *src6, *dst6; src6 = (struct sockaddr_in6 *) cma_src_addr(id_priv); dst6 = (struct sockaddr_in6 *) cma_dst_addr(id_priv); cma_set_ip_ver(cma_hdr, 6); cma_hdr->src_addr.ip6 = src6->sin6_addr; cma_hdr->dst_addr.ip6 = dst6->sin6_addr; cma_hdr->port = src6->sin6_port; } return 0; } static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event) { struct rdma_id_private *id_priv = cm_id->context; struct rdma_cm_event event = {}; const struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd; int ret; mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT) goto out; switch (ib_event->event) { case IB_CM_SIDR_REQ_ERROR: event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = -ETIMEDOUT; break; case IB_CM_SIDR_REP_RECEIVED: event.param.ud.private_data = ib_event->private_data; event.param.ud.private_data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE; if (rep->status != IB_SIDR_SUCCESS) { event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = ib_event->param.sidr_rep_rcvd.status; pr_debug_ratelimited("RDMA CM: UNREACHABLE: bad SIDR reply. status %d\n", event.status); break; } ret = cma_set_qkey(id_priv, rep->qkey); if (ret) { pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to set qkey. status %d\n", ret); event.event = RDMA_CM_EVENT_ADDR_ERROR; event.status = ret; break; } ib_init_ah_attr_from_path(id_priv->id.device, id_priv->id.port_num, id_priv->id.route.path_rec, &event.param.ud.ah_attr, rep->sgid_attr); event.param.ud.qp_num = rep->qpn; event.param.ud.qkey = rep->qkey; event.event = RDMA_CM_EVENT_ESTABLISHED; event.status = 0; break; default: pr_err("RDMA CMA: unexpected IB CM event: %d\n", ib_event->event); goto out; } ret = cma_cm_event_handler(id_priv, &event); rdma_destroy_ah_attr(&event.param.ud.ah_attr); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.ib = NULL; destroy_id_handler_unlock(id_priv); return ret; } out: mutex_unlock(&id_priv->handler_mutex); return 0; } static int cma_resolve_ib_udp(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_cm_sidr_req_param req; struct ib_cm_id *id; void *private_data; u8 offset; int ret; memset(&req, 0, sizeof req); offset = cma_user_data_offset(id_priv); if (check_add_overflow(offset, conn_param->private_data_len, &req.private_data_len)) return -EINVAL; if (req.private_data_len) { private_data = kzalloc(req.private_data_len, GFP_ATOMIC); if (!private_data) return -ENOMEM; } else { private_data = NULL; } if (conn_param->private_data && conn_param->private_data_len) memcpy(private_data + offset, conn_param->private_data, conn_param->private_data_len); if (private_data) { ret = cma_format_hdr(private_data, id_priv); if (ret) goto out; req.private_data = private_data; } id = ib_create_cm_id(id_priv->id.device, cma_sidr_rep_handler, id_priv); if (IS_ERR(id)) { ret = PTR_ERR(id); goto out; } id_priv->cm_id.ib = id; req.path = id_priv->id.route.path_rec; req.sgid_attr = id_priv->id.route.addr.dev_addr.sgid_attr; req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); req.timeout_ms = 1 << (CMA_CM_RESPONSE_TIMEOUT - 8); req.max_cm_retries = CMA_MAX_CM_RETRIES; trace_cm_send_sidr_req(id_priv); ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req); if (ret) { ib_destroy_cm_id(id_priv->cm_id.ib); id_priv->cm_id.ib = NULL; } out: kfree(private_data); return ret; } static int cma_connect_ib(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_cm_req_param req; struct rdma_route *route; void *private_data; struct ib_cm_id *id; u8 offset; int ret; memset(&req, 0, sizeof req); offset = cma_user_data_offset(id_priv); if (check_add_overflow(offset, conn_param->private_data_len, &req.private_data_len)) return -EINVAL; if (req.private_data_len) { private_data = kzalloc(req.private_data_len, GFP_ATOMIC); if (!private_data) return -ENOMEM; } else { private_data = NULL; } if (conn_param->private_data && conn_param->private_data_len) memcpy(private_data + offset, conn_param->private_data, conn_param->private_data_len); id = ib_create_cm_id(id_priv->id.device, cma_ib_handler, id_priv); if (IS_ERR(id)) { ret = PTR_ERR(id); goto out; } id_priv->cm_id.ib = id; route = &id_priv->id.route; if (private_data) { ret = cma_format_hdr(private_data, id_priv); if (ret) goto out; req.private_data = private_data; } req.primary_path = &route->path_rec[0]; req.primary_path_inbound = route->path_rec_inbound; req.primary_path_outbound = route->path_rec_outbound; if (route->num_pri_alt_paths == 2) req.alternate_path = &route->path_rec[1]; req.ppath_sgid_attr = id_priv->id.route.addr.dev_addr.sgid_attr; /* Alternate path SGID attribute currently unsupported */ req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); req.qp_num = id_priv->qp_num; req.qp_type = id_priv->id.qp_type; req.starting_psn = id_priv->seq_num; req.responder_resources = conn_param->responder_resources; req.initiator_depth = conn_param->initiator_depth; req.flow_control = conn_param->flow_control; req.retry_count = min_t(u8, 7, conn_param->retry_count); req.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count); req.remote_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT; req.local_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT; req.max_cm_retries = CMA_MAX_CM_RETRIES; req.srq = id_priv->srq ? 1 : 0; req.ece.vendor_id = id_priv->ece.vendor_id; req.ece.attr_mod = id_priv->ece.attr_mod; trace_cm_send_req(id_priv); ret = ib_send_cm_req(id_priv->cm_id.ib, &req); out: if (ret && !IS_ERR(id)) { ib_destroy_cm_id(id); id_priv->cm_id.ib = NULL; } kfree(private_data); return ret; } static int cma_connect_iw(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct iw_cm_id *cm_id; int ret; struct iw_cm_conn_param iw_param; cm_id = iw_create_cm_id(id_priv->id.device, cma_iw_handler, id_priv); if (IS_ERR(cm_id)) return PTR_ERR(cm_id); mutex_lock(&id_priv->qp_mutex); cm_id->tos = id_priv->tos; cm_id->tos_set = id_priv->tos_set; mutex_unlock(&id_priv->qp_mutex); id_priv->cm_id.iw = cm_id; memcpy(&cm_id->local_addr, cma_src_addr(id_priv), rdma_addr_size(cma_src_addr(id_priv))); memcpy(&cm_id->remote_addr, cma_dst_addr(id_priv), rdma_addr_size(cma_dst_addr(id_priv))); ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) goto out; if (conn_param) { iw_param.ord = conn_param->initiator_depth; iw_param.ird = conn_param->responder_resources; iw_param.private_data = conn_param->private_data; iw_param.private_data_len = conn_param->private_data_len; iw_param.qpn = id_priv->id.qp ? id_priv->qp_num : conn_param->qp_num; } else { memset(&iw_param, 0, sizeof iw_param); iw_param.qpn = id_priv->qp_num; } ret = iw_cm_connect(cm_id, &iw_param); out: if (ret) { iw_destroy_cm_id(cm_id); id_priv->cm_id.iw = NULL; } return ret; } /** * rdma_connect_locked - Initiate an active connection request. * @id: Connection identifier to connect. * @conn_param: Connection information used for connected QPs. * * Same as rdma_connect() but can only be called from the * RDMA_CM_EVENT_ROUTE_RESOLVED handler callback. */ int rdma_connect_locked(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); int ret; if (!cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT)) return -EINVAL; if (!id->qp) { id_priv->qp_num = conn_param->qp_num; id_priv->srq = conn_param->srq; } if (rdma_cap_ib_cm(id->device, id->port_num)) { if (id->qp_type == IB_QPT_UD) ret = cma_resolve_ib_udp(id_priv, conn_param); else ret = cma_connect_ib(id_priv, conn_param); } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = cma_connect_iw(id_priv, conn_param); } else { ret = -ENOSYS; } if (ret) goto err_state; return 0; err_state: cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_ROUTE_RESOLVED); return ret; } EXPORT_SYMBOL(rdma_connect_locked); /** * rdma_connect - Initiate an active connection request. * @id: Connection identifier to connect. * @conn_param: Connection information used for connected QPs. * * Users must have resolved a route for the rdma_cm_id to connect with by having * called rdma_resolve_route before calling this routine. * * This call will either connect to a remote QP or obtain remote QP information * for unconnected rdma_cm_id's. The actual operation is based on the * rdma_cm_id's port space. */ int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); int ret; mutex_lock(&id_priv->handler_mutex); ret = rdma_connect_locked(id, conn_param); mutex_unlock(&id_priv->handler_mutex); return ret; } EXPORT_SYMBOL(rdma_connect); /** * rdma_connect_ece - Initiate an active connection request with ECE data. * @id: Connection identifier to connect. * @conn_param: Connection information used for connected QPs. * @ece: ECE parameters * * See rdma_connect() explanation. */ int rdma_connect_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, struct rdma_ucm_ece *ece) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); id_priv->ece.vendor_id = ece->vendor_id; id_priv->ece.attr_mod = ece->attr_mod; return rdma_connect(id, conn_param); } EXPORT_SYMBOL(rdma_connect_ece); static int cma_accept_ib(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_cm_rep_param rep; int ret; ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) goto out; ret = cma_modify_qp_rts(id_priv, conn_param); if (ret) goto out; memset(&rep, 0, sizeof rep); rep.qp_num = id_priv->qp_num; rep.starting_psn = id_priv->seq_num; rep.private_data = conn_param->private_data; rep.private_data_len = conn_param->private_data_len; rep.responder_resources = conn_param->responder_resources; rep.initiator_depth = conn_param->initiator_depth; rep.failover_accepted = 0; rep.flow_control = conn_param->flow_control; rep.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count); rep.srq = id_priv->srq ? 1 : 0; rep.ece.vendor_id = id_priv->ece.vendor_id; rep.ece.attr_mod = id_priv->ece.attr_mod; trace_cm_send_rep(id_priv); ret = ib_send_cm_rep(id_priv->cm_id.ib, &rep); out: return ret; } static int cma_accept_iw(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct iw_cm_conn_param iw_param; int ret; if (!conn_param) return -EINVAL; ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) return ret; iw_param.ord = conn_param->initiator_depth; iw_param.ird = conn_param->responder_resources; iw_param.private_data = conn_param->private_data; iw_param.private_data_len = conn_param->private_data_len; if (id_priv->id.qp) iw_param.qpn = id_priv->qp_num; else iw_param.qpn = conn_param->qp_num; return iw_cm_accept(id_priv->cm_id.iw, &iw_param); } static int cma_send_sidr_rep(struct rdma_id_private *id_priv, enum ib_cm_sidr_status status, u32 qkey, const void *private_data, int private_data_len) { struct ib_cm_sidr_rep_param rep; int ret; memset(&rep, 0, sizeof rep); rep.status = status; if (status == IB_SIDR_SUCCESS) { if (qkey) ret = cma_set_qkey(id_priv, qkey); else ret = cma_set_default_qkey(id_priv); if (ret) return ret; rep.qp_num = id_priv->qp_num; rep.qkey = id_priv->qkey; rep.ece.vendor_id = id_priv->ece.vendor_id; rep.ece.attr_mod = id_priv->ece.attr_mod; } rep.private_data = private_data; rep.private_data_len = private_data_len; trace_cm_send_sidr_rep(id_priv); return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep); } /** * rdma_accept - Called to accept a connection request or response. * @id: Connection identifier associated with the request. * @conn_param: Information needed to establish the connection. This must be * provided if accepting a connection request. If accepting a connection * response, this parameter must be NULL. * * Typically, this routine is only called by the listener to accept a connection * request. It must also be called on the active side of a connection if the * user is performing their own QP transitions. * * In the case of error, a reject message is sent to the remote side and the * state of the qp associated with the id is modified to error, such that any * previously posted receive buffers would be flushed. * * This function is for use by kernel ULPs and must be called from under the * handler callback. */ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); int ret; lockdep_assert_held(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT) return -EINVAL; if (!id->qp && conn_param) { id_priv->qp_num = conn_param->qp_num; id_priv->srq = conn_param->srq; } if (rdma_cap_ib_cm(id->device, id->port_num)) { if (id->qp_type == IB_QPT_UD) { if (conn_param) ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, conn_param->qkey, conn_param->private_data, conn_param->private_data_len); else ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, 0, NULL, 0); } else { if (conn_param) ret = cma_accept_ib(id_priv, conn_param); else ret = cma_rep_recv(id_priv); } } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = cma_accept_iw(id_priv, conn_param); } else { ret = -ENOSYS; } if (ret) goto reject; return 0; reject: cma_modify_qp_err(id_priv); rdma_reject(id, NULL, 0, IB_CM_REJ_CONSUMER_DEFINED); return ret; } EXPORT_SYMBOL(rdma_accept); int rdma_accept_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, struct rdma_ucm_ece *ece) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); id_priv->ece.vendor_id = ece->vendor_id; id_priv->ece.attr_mod = ece->attr_mod; return rdma_accept(id, conn_param); } EXPORT_SYMBOL(rdma_accept_ece); void rdma_lock_handler(struct rdma_cm_id *id) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->handler_mutex); } EXPORT_SYMBOL(rdma_lock_handler); void rdma_unlock_handler(struct rdma_cm_id *id) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); mutex_unlock(&id_priv->handler_mutex); } EXPORT_SYMBOL(rdma_unlock_handler); int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!id_priv->cm_id.ib) return -EINVAL; switch (id->device->node_type) { case RDMA_NODE_IB_CA: ret = ib_cm_notify(id_priv->cm_id.ib, event); break; default: ret = 0; break; } return ret; } EXPORT_SYMBOL(rdma_notify); int rdma_reject(struct rdma_cm_id *id, const void *private_data, u8 private_data_len, u8 reason) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!id_priv->cm_id.ib) return -EINVAL; if (rdma_cap_ib_cm(id->device, id->port_num)) { if (id->qp_type == IB_QPT_UD) { ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, 0, private_data, private_data_len); } else { trace_cm_send_rej(id_priv); ret = ib_send_cm_rej(id_priv->cm_id.ib, reason, NULL, 0, private_data, private_data_len); } } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = iw_cm_reject(id_priv->cm_id.iw, private_data, private_data_len); } else { ret = -ENOSYS; } return ret; } EXPORT_SYMBOL(rdma_reject); int rdma_disconnect(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!id_priv->cm_id.ib) return -EINVAL; if (rdma_cap_ib_cm(id->device, id->port_num)) { ret = cma_modify_qp_err(id_priv); if (ret) goto out; /* Initiate or respond to a disconnect. */ trace_cm_disconnect(id_priv); if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0)) { if (!ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0)) trace_cm_sent_drep(id_priv); } else { trace_cm_sent_dreq(id_priv); } } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = iw_cm_disconnect(id_priv->cm_id.iw, 0); } else ret = -EINVAL; out: return ret; } EXPORT_SYMBOL(rdma_disconnect); static void cma_make_mc_event(int status, struct rdma_id_private *id_priv, struct ib_sa_multicast *multicast, struct rdma_cm_event *event, struct cma_multicast *mc) { struct rdma_dev_addr *dev_addr; enum ib_gid_type gid_type; struct net_device *ndev; if (status) pr_debug_ratelimited("RDMA CM: MULTICAST_ERROR: failed to join multicast. status %d\n", status); event->status = status; event->param.ud.private_data = mc->context; if (status) { event->event = RDMA_CM_EVENT_MULTICAST_ERROR; return; } dev_addr = &id_priv->id.route.addr.dev_addr; ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); gid_type = id_priv->cma_dev ->default_gid_type[id_priv->id.port_num - rdma_start_port( id_priv->cma_dev->device)]; event->event = RDMA_CM_EVENT_MULTICAST_JOIN; if (ib_init_ah_from_mcmember(id_priv->id.device, id_priv->id.port_num, &multicast->rec, ndev, gid_type, &event->param.ud.ah_attr)) { event->event = RDMA_CM_EVENT_MULTICAST_ERROR; goto out; } event->param.ud.qp_num = 0xFFFFFF; event->param.ud.qkey = id_priv->qkey; out: dev_put(ndev); } static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) { struct cma_multicast *mc = multicast->context; struct rdma_id_private *id_priv = mc->id_priv; struct rdma_cm_event event = {}; int ret = 0; mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL || READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING) goto out; ret = cma_set_qkey(id_priv, be32_to_cpu(multicast->rec.qkey)); if (!ret) { cma_make_mc_event(status, id_priv, multicast, &event, mc); ret = cma_cm_event_handler(id_priv, &event); } rdma_destroy_ah_attr(&event.param.ud.ah_attr); WARN_ON(ret); out: mutex_unlock(&id_priv->handler_mutex); return 0; } static void cma_set_mgid(struct rdma_id_private *id_priv, struct sockaddr *addr, union ib_gid *mgid) { unsigned char mc_map[MAX_ADDR_LEN]; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct sockaddr_in *sin = (struct sockaddr_in *) addr; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) addr; if (cma_any_addr(addr)) { memset(mgid, 0, sizeof *mgid); } else if ((addr->sa_family == AF_INET6) && ((be32_to_cpu(sin6->sin6_addr.s6_addr32[0]) & 0xFFF0FFFF) == 0xFF10A01B)) { /* IPv6 address is an SA assigned MGID. */ memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); } else if (addr->sa_family == AF_IB) { memcpy(mgid, &((struct sockaddr_ib *) addr)->sib_addr, sizeof *mgid); } else if (addr->sa_family == AF_INET6) { ipv6_ib_mc_map(&sin6->sin6_addr, dev_addr->broadcast, mc_map); if (id_priv->id.ps == RDMA_PS_UDP) mc_map[7] = 0x01; /* Use RDMA CM signature */ *mgid = *(union ib_gid *) (mc_map + 4); } else { ip_ib_mc_map(sin->sin_addr.s_addr, dev_addr->broadcast, mc_map); if (id_priv->id.ps == RDMA_PS_UDP) mc_map[7] = 0x01; /* Use RDMA CM signature */ *mgid = *(union ib_gid *) (mc_map + 4); } } static int cma_join_ib_multicast(struct rdma_id_private *id_priv, struct cma_multicast *mc) { struct ib_sa_mcmember_rec rec; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; ib_sa_comp_mask comp_mask; int ret; ib_addr_get_mgid(dev_addr, &rec.mgid); ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num, &rec.mgid, &rec); if (ret) return ret; if (!id_priv->qkey) { ret = cma_set_default_qkey(id_priv); if (ret) return ret; } cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid); rec.qkey = cpu_to_be32(id_priv->qkey); rdma_addr_get_sgid(dev_addr, &rec.port_gid); rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); rec.join_state = mc->join_state; comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE | IB_SA_MCMEMBER_REC_QKEY | IB_SA_MCMEMBER_REC_SL | IB_SA_MCMEMBER_REC_FLOW_LABEL | IB_SA_MCMEMBER_REC_TRAFFIC_CLASS; if (id_priv->id.ps == RDMA_PS_IPOIB) comp_mask |= IB_SA_MCMEMBER_REC_RATE | IB_SA_MCMEMBER_REC_RATE_SELECTOR | IB_SA_MCMEMBER_REC_MTU_SELECTOR | IB_SA_MCMEMBER_REC_MTU | IB_SA_MCMEMBER_REC_HOP_LIMIT; mc->sa_mc = ib_sa_join_multicast(&sa_client, id_priv->id.device, id_priv->id.port_num, &rec, comp_mask, GFP_KERNEL, cma_ib_mc_handler, mc); return PTR_ERR_OR_ZERO(mc->sa_mc); } static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, enum ib_gid_type gid_type) { struct sockaddr_in *sin = (struct sockaddr_in *)addr; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr; if (cma_any_addr(addr)) { memset(mgid, 0, sizeof *mgid); } else if (addr->sa_family == AF_INET6) { memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); } else { mgid->raw[0] = (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? 0 : 0xff; mgid->raw[1] = (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? 0 : 0x0e; mgid->raw[2] = 0; mgid->raw[3] = 0; mgid->raw[4] = 0; mgid->raw[5] = 0; mgid->raw[6] = 0; mgid->raw[7] = 0; mgid->raw[8] = 0; mgid->raw[9] = 0; mgid->raw[10] = 0xff; mgid->raw[11] = 0xff; *(__be32 *)(&mgid->raw[12]) = sin->sin_addr.s_addr; } } static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, struct cma_multicast *mc) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; int err = 0; struct sockaddr *addr = (struct sockaddr *)&mc->addr; struct net_device *ndev = NULL; struct ib_sa_multicast ib; enum ib_gid_type gid_type; bool send_only; send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN); if (cma_zero_addr(addr)) return -EINVAL; gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num - rdma_start_port(id_priv->cma_dev->device)]; cma_iboe_set_mgid(addr, &ib.rec.mgid, gid_type); ib.rec.pkey = cpu_to_be16(0xffff); if (dev_addr->bound_dev_if) ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); if (!ndev) return -ENODEV; ib.rec.rate = IB_RATE_PORT_CURRENT; ib.rec.hop_limit = 1; ib.rec.mtu = iboe_get_mtu(ndev->mtu); if (addr->sa_family == AF_INET) { if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { ib.rec.hop_limit = IPV6_DEFAULT_HOPLIMIT; if (!send_only) { err = cma_igmp_send(ndev, &ib.rec.mgid, true); } } } else { if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) err = -ENOTSUPP; } dev_put(ndev); if (err || !ib.rec.mtu) return err ?: -EINVAL; if (!id_priv->qkey) cma_set_default_qkey(id_priv); rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &ib.rec.port_gid); INIT_WORK(&mc->iboe_join.work, cma_iboe_join_work_handler); cma_make_mc_event(0, id_priv, &ib, &mc->iboe_join.event, mc); queue_work(cma_wq, &mc->iboe_join.work); return 0; } int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, u8 join_state, void *context) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); struct cma_multicast *mc; int ret; /* Not supported for kernel QPs */ if (WARN_ON(id->qp)) return -EINVAL; /* ULP is calling this wrong. */ if (!id->device || (READ_ONCE(id_priv->state) != RDMA_CM_ADDR_BOUND && READ_ONCE(id_priv->state) != RDMA_CM_ADDR_RESOLVED)) return -EINVAL; if (id_priv->id.qp_type != IB_QPT_UD) return -EINVAL; mc = kzalloc(sizeof(*mc), GFP_KERNEL); if (!mc) return -ENOMEM; memcpy(&mc->addr, addr, rdma_addr_size(addr)); mc->context = context; mc->id_priv = id_priv; mc->join_state = join_state; if (rdma_protocol_roce(id->device, id->port_num)) { ret = cma_iboe_join_multicast(id_priv, mc); if (ret) goto out_err; } else if (rdma_cap_ib_mcast(id->device, id->port_num)) { ret = cma_join_ib_multicast(id_priv, mc); if (ret) goto out_err; } else { ret = -ENOSYS; goto out_err; } spin_lock(&id_priv->lock); list_add(&mc->list, &id_priv->mc_list); spin_unlock(&id_priv->lock); return 0; out_err: kfree(mc); return ret; } EXPORT_SYMBOL(rdma_join_multicast); void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) { struct rdma_id_private *id_priv; struct cma_multicast *mc; id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irq(&id_priv->lock); list_for_each_entry(mc, &id_priv->mc_list, list) { if (memcmp(&mc->addr, addr, rdma_addr_size(addr)) != 0) continue; list_del(&mc->list); spin_unlock_irq(&id_priv->lock); WARN_ON(id_priv->cma_dev->device != id->device); destroy_mc(id_priv, mc); return; } spin_unlock_irq(&id_priv->lock); } EXPORT_SYMBOL(rdma_leave_multicast); static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr; struct cma_work *work; dev_addr = &id_priv->id.route.addr.dev_addr; if ((dev_addr->bound_dev_if == ndev->ifindex) && (net_eq(dev_net(ndev), dev_addr->net)) && memcmp(dev_addr->src_dev_addr, ndev->dev_addr, ndev->addr_len)) { pr_info("RDMA CM addr change for ndev %s used by id %p\n", ndev->name, &id_priv->id); work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; INIT_WORK(&work->work, cma_work_handler); work->id = id_priv; work->event.event = RDMA_CM_EVENT_ADDR_CHANGE; cma_id_get(id_priv); queue_work(cma_wq, &work->work); } return 0; } static int cma_netdev_callback(struct notifier_block *self, unsigned long event, void *ptr) { struct net_device *ndev = netdev_notifier_info_to_dev(ptr); struct cma_device *cma_dev; struct rdma_id_private *id_priv; int ret = NOTIFY_DONE; if (event != NETDEV_BONDING_FAILOVER) return NOTIFY_DONE; if (!netif_is_bond_master(ndev)) return NOTIFY_DONE; mutex_lock(&lock); list_for_each_entry(cma_dev, &dev_list, list) list_for_each_entry(id_priv, &cma_dev->id_list, device_item) { ret = cma_netdev_change(ndev, id_priv); if (ret) goto out; } out: mutex_unlock(&lock); return ret; } static void cma_netevent_work_handler(struct work_struct *_work) { struct rdma_id_private *id_priv = container_of(_work, struct rdma_id_private, id.net_work); struct rdma_cm_event event = {}; mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING || READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL) goto out_unlock; event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = -ETIMEDOUT; if (cma_cm_event_handler(id_priv, &event)) { __acquire(&id_priv->handler_mutex); id_priv->cm_id.ib = NULL; cma_id_put(id_priv); destroy_id_handler_unlock(id_priv); return; } out_unlock: mutex_unlock(&id_priv->handler_mutex); cma_id_put(id_priv); } static int cma_netevent_callback(struct notifier_block *self, unsigned long event, void *ctx) { struct id_table_entry *ips_node = NULL; struct rdma_id_private *current_id; struct neighbour *neigh = ctx; unsigned long flags; if (event != NETEVENT_NEIGH_UPDATE) return NOTIFY_DONE; spin_lock_irqsave(&id_table_lock, flags); if (neigh->tbl->family == AF_INET6) { struct sockaddr_in6 neigh_sock_6; neigh_sock_6.sin6_family = AF_INET6; neigh_sock_6.sin6_addr = *(struct in6_addr *)neigh->primary_key; ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex, (struct sockaddr *)&neigh_sock_6); } else if (neigh->tbl->family == AF_INET) { struct sockaddr_in neigh_sock_4; neigh_sock_4.sin_family = AF_INET; neigh_sock_4.sin_addr.s_addr = *(__be32 *)(neigh->primary_key); ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex, (struct sockaddr *)&neigh_sock_4); } else goto out; if (!ips_node) goto out; list_for_each_entry(current_id, &ips_node->id_list, id_list_entry) { if (!memcmp(current_id->id.route.addr.dev_addr.dst_dev_addr, neigh->ha, ETH_ALEN)) continue; INIT_WORK(&current_id->id.net_work, cma_netevent_work_handler); cma_id_get(current_id); queue_work(cma_wq, &current_id->id.net_work); } out: spin_unlock_irqrestore(&id_table_lock, flags); return NOTIFY_DONE; } static struct notifier_block cma_nb = { .notifier_call = cma_netdev_callback }; static struct notifier_block cma_netevent_cb = { .notifier_call = cma_netevent_callback }; static void cma_send_device_removal_put(struct rdma_id_private *id_priv) { struct rdma_cm_event event = { .event = RDMA_CM_EVENT_DEVICE_REMOVAL }; enum rdma_cm_state state; unsigned long flags; mutex_lock(&id_priv->handler_mutex); /* Record that we want to remove the device */ spin_lock_irqsave(&id_priv->lock, flags); state = id_priv->state; if (state == RDMA_CM_DESTROYING || state == RDMA_CM_DEVICE_REMOVAL) { spin_unlock_irqrestore(&id_priv->lock, flags); mutex_unlock(&id_priv->handler_mutex); cma_id_put(id_priv); return; } id_priv->state = RDMA_CM_DEVICE_REMOVAL; spin_unlock_irqrestore(&id_priv->lock, flags); if (cma_cm_event_handler(id_priv, &event)) { /* * At this point the ULP promises it won't call * rdma_destroy_id() concurrently */ cma_id_put(id_priv); mutex_unlock(&id_priv->handler_mutex); trace_cm_id_destroy(id_priv); _destroy_id(id_priv, state); return; } mutex_unlock(&id_priv->handler_mutex); /* * If this races with destroy then the thread that first assigns state * to a destroying does the cancel. */ cma_cancel_operation(id_priv, state); cma_id_put(id_priv); } static void cma_process_remove(struct cma_device *cma_dev) { mutex_lock(&lock); while (!list_empty(&cma_dev->id_list)) { struct rdma_id_private *id_priv = list_first_entry( &cma_dev->id_list, struct rdma_id_private, device_item); list_del_init(&id_priv->listen_item); list_del_init(&id_priv->device_item); cma_id_get(id_priv); mutex_unlock(&lock); cma_send_device_removal_put(id_priv); mutex_lock(&lock); } mutex_unlock(&lock); cma_dev_put(cma_dev); wait_for_completion(&cma_dev->comp); } static bool cma_supported(struct ib_device *device) { u32 i; rdma_for_each_port(device, i) { if (rdma_cap_ib_cm(device, i) || rdma_cap_iw_cm(device, i)) return true; } return false; } static int cma_add_one(struct ib_device *device) { struct rdma_id_private *to_destroy; struct cma_device *cma_dev; struct rdma_id_private *id_priv; unsigned long supported_gids = 0; int ret; u32 i; if (!cma_supported(device)) return -EOPNOTSUPP; cma_dev = kmalloc(sizeof(*cma_dev), GFP_KERNEL); if (!cma_dev) return -ENOMEM; cma_dev->device = device; cma_dev->default_gid_type = kcalloc(device->phys_port_cnt, sizeof(*cma_dev->default_gid_type), GFP_KERNEL); if (!cma_dev->default_gid_type) { ret = -ENOMEM; goto free_cma_dev; } cma_dev->default_roce_tos = kcalloc(device->phys_port_cnt, sizeof(*cma_dev->default_roce_tos), GFP_KERNEL); if (!cma_dev->default_roce_tos) { ret = -ENOMEM; goto free_gid_type; } rdma_for_each_port (device, i) { supported_gids = roce_gid_type_mask_support(device, i); WARN_ON(!supported_gids); if (supported_gids & (1 << CMA_PREFERRED_ROCE_GID_TYPE)) cma_dev->default_gid_type[i - rdma_start_port(device)] = CMA_PREFERRED_ROCE_GID_TYPE; else cma_dev->default_gid_type[i - rdma_start_port(device)] = find_first_bit(&supported_gids, BITS_PER_LONG); cma_dev->default_roce_tos[i - rdma_start_port(device)] = 0; } init_completion(&cma_dev->comp); refcount_set(&cma_dev->refcount, 1); INIT_LIST_HEAD(&cma_dev->id_list); ib_set_client_data(device, &cma_client, cma_dev); mutex_lock(&lock); list_add_tail(&cma_dev->list, &dev_list); list_for_each_entry(id_priv, &listen_any_list, listen_any_item) { ret = cma_listen_on_dev(id_priv, cma_dev, &to_destroy); if (ret) goto free_listen; } mutex_unlock(&lock); trace_cm_add_one(device); return 0; free_listen: list_del(&cma_dev->list); mutex_unlock(&lock); /* cma_process_remove() will delete to_destroy */ cma_process_remove(cma_dev); kfree(cma_dev->default_roce_tos); free_gid_type: kfree(cma_dev->default_gid_type); free_cma_dev: kfree(cma_dev); return ret; } static void cma_remove_one(struct ib_device *device, void *client_data) { struct cma_device *cma_dev = client_data; trace_cm_remove_one(device); mutex_lock(&lock); list_del(&cma_dev->list); mutex_unlock(&lock); cma_process_remove(cma_dev); kfree(cma_dev->default_roce_tos); kfree(cma_dev->default_gid_type); kfree(cma_dev); } static int cma_init_net(struct net *net) { struct cma_pernet *pernet = cma_pernet(net); xa_init(&pernet->tcp_ps); xa_init(&pernet->udp_ps); xa_init(&pernet->ipoib_ps); xa_init(&pernet->ib_ps); return 0; } static void cma_exit_net(struct net *net) { struct cma_pernet *pernet = cma_pernet(net); WARN_ON(!xa_empty(&pernet->tcp_ps)); WARN_ON(!xa_empty(&pernet->udp_ps)); WARN_ON(!xa_empty(&pernet->ipoib_ps)); WARN_ON(!xa_empty(&pernet->ib_ps)); } static struct pernet_operations cma_pernet_operations = { .init = cma_init_net, .exit = cma_exit_net, .id = &cma_pernet_id, .size = sizeof(struct cma_pernet), }; static int __init cma_init(void) { int ret; /* * There is a rare lock ordering dependency in cma_netdev_callback() * that only happens when bonding is enabled. Teach lockdep that rtnl * must never be nested under lock so it can find these without having * to test with bonding. */ if (IS_ENABLED(CONFIG_LOCKDEP)) { rtnl_lock(); mutex_lock(&lock); mutex_unlock(&lock); rtnl_unlock(); } cma_wq = alloc_ordered_workqueue("rdma_cm", WQ_MEM_RECLAIM); if (!cma_wq) return -ENOMEM; ret = register_pernet_subsys(&cma_pernet_operations); if (ret) goto err_wq; ib_sa_register_client(&sa_client); register_netdevice_notifier(&cma_nb); register_netevent_notifier(&cma_netevent_cb); ret = ib_register_client(&cma_client); if (ret) goto err; ret = cma_configfs_init(); if (ret) goto err_ib; return 0; err_ib: ib_unregister_client(&cma_client); err: unregister_netevent_notifier(&cma_netevent_cb); unregister_netdevice_notifier(&cma_nb); ib_sa_unregister_client(&sa_client); unregister_pernet_subsys(&cma_pernet_operations); err_wq: destroy_workqueue(cma_wq); return ret; } static void __exit cma_cleanup(void) { cma_configfs_exit(); ib_unregister_client(&cma_client); unregister_netevent_notifier(&cma_netevent_cb); unregister_netdevice_notifier(&cma_nb); ib_sa_unregister_client(&sa_client); unregister_pernet_subsys(&cma_pernet_operations); destroy_workqueue(cma_wq); } module_init(cma_init); module_exit(cma_cleanup);
1527 5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 // SPDX-License-Identifier: GPL-2.0-only // Copyright (c) 2020 Facebook Inc. #include <linux/ethtool_netlink.h> #include <linux/netdevice.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/workqueue.h> #include <net/udp_tunnel.h> #include <net/vxlan.h> enum udp_tunnel_nic_table_entry_flags { UDP_TUNNEL_NIC_ENTRY_ADD = BIT(0), UDP_TUNNEL_NIC_ENTRY_DEL = BIT(1), UDP_TUNNEL_NIC_ENTRY_OP_FAIL = BIT(2), UDP_TUNNEL_NIC_ENTRY_FROZEN = BIT(3), }; struct udp_tunnel_nic_table_entry { __be16 port; u8 type; u8 flags; u16 use_cnt; #define UDP_TUNNEL_NIC_USE_CNT_MAX U16_MAX u8 hw_priv; }; /** * struct udp_tunnel_nic - UDP tunnel port offload state * @work: async work for talking to hardware from process context * @dev: netdev pointer * @need_sync: at least one port start changed * @need_replay: space was freed, we need a replay of all ports * @work_pending: @work is currently scheduled * @n_tables: number of tables under @entries * @missed: bitmap of tables which overflown * @entries: table of tables of ports currently offloaded */ struct udp_tunnel_nic { struct work_struct work; struct net_device *dev; u8 need_sync:1; u8 need_replay:1; u8 work_pending:1; unsigned int n_tables; unsigned long missed; struct udp_tunnel_nic_table_entry **entries; }; /* We ensure all work structs are done using driver state, but not the code. * We need a workqueue we can flush before module gets removed. */ static struct workqueue_struct *udp_tunnel_nic_workqueue; static const char *udp_tunnel_nic_tunnel_type_name(unsigned int type) { switch (type) { case UDP_TUNNEL_TYPE_VXLAN: return "vxlan"; case UDP_TUNNEL_TYPE_GENEVE: return "geneve"; case UDP_TUNNEL_TYPE_VXLAN_GPE: return "vxlan-gpe"; default: return "unknown"; } } static bool udp_tunnel_nic_entry_is_free(struct udp_tunnel_nic_table_entry *entry) { return entry->use_cnt == 0 && !entry->flags; } static bool udp_tunnel_nic_entry_is_present(struct udp_tunnel_nic_table_entry *entry) { return entry->use_cnt && !(entry->flags & ~UDP_TUNNEL_NIC_ENTRY_FROZEN); } static bool udp_tunnel_nic_entry_is_frozen(struct udp_tunnel_nic_table_entry *entry) { return entry->flags & UDP_TUNNEL_NIC_ENTRY_FROZEN; } static void udp_tunnel_nic_entry_freeze_used(struct udp_tunnel_nic_table_entry *entry) { if (!udp_tunnel_nic_entry_is_free(entry)) entry->flags |= UDP_TUNNEL_NIC_ENTRY_FROZEN; } static void udp_tunnel_nic_entry_unfreeze(struct udp_tunnel_nic_table_entry *entry) { entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_FROZEN; } static bool udp_tunnel_nic_entry_is_queued(struct udp_tunnel_nic_table_entry *entry) { return entry->flags & (UDP_TUNNEL_NIC_ENTRY_ADD | UDP_TUNNEL_NIC_ENTRY_DEL); } static void udp_tunnel_nic_entry_queue(struct udp_tunnel_nic *utn, struct udp_tunnel_nic_table_entry *entry, unsigned int flag) { entry->flags |= flag; utn->need_sync = 1; } static void udp_tunnel_nic_ti_from_entry(struct udp_tunnel_nic_table_entry *entry, struct udp_tunnel_info *ti) { memset(ti, 0, sizeof(*ti)); ti->port = entry->port; ti->type = entry->type; ti->hw_priv = entry->hw_priv; } static bool udp_tunnel_nic_is_empty(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) if (!udp_tunnel_nic_entry_is_free(&utn->entries[i][j])) return false; return true; } static bool udp_tunnel_nic_should_replay(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_table_info *table; unsigned int i, j; if (!utn->missed) return false; for (i = 0; i < utn->n_tables; i++) { table = &dev->udp_tunnel_nic_info->tables[i]; if (!test_bit(i, &utn->missed)) continue; for (j = 0; j < table->n_entries; j++) if (udp_tunnel_nic_entry_is_free(&utn->entries[i][j])) return true; } return false; } static void __udp_tunnel_nic_get_port(struct net_device *dev, unsigned int table, unsigned int idx, struct udp_tunnel_info *ti) { struct udp_tunnel_nic_table_entry *entry; struct udp_tunnel_nic *utn; utn = dev->udp_tunnel_nic; entry = &utn->entries[table][idx]; if (entry->use_cnt) udp_tunnel_nic_ti_from_entry(entry, ti); } static void __udp_tunnel_nic_set_port_priv(struct net_device *dev, unsigned int table, unsigned int idx, u8 priv) { dev->udp_tunnel_nic->entries[table][idx].hw_priv = priv; } static void udp_tunnel_nic_entry_update_done(struct udp_tunnel_nic_table_entry *entry, int err) { bool dodgy = entry->flags & UDP_TUNNEL_NIC_ENTRY_OP_FAIL; WARN_ON_ONCE(entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD && entry->flags & UDP_TUNNEL_NIC_ENTRY_DEL); if (entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD && (!err || (err == -EEXIST && dodgy))) entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_ADD; if (entry->flags & UDP_TUNNEL_NIC_ENTRY_DEL && (!err || (err == -ENOENT && dodgy))) entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_DEL; if (!err) entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_OP_FAIL; else entry->flags |= UDP_TUNNEL_NIC_ENTRY_OP_FAIL; } static void udp_tunnel_nic_device_sync_one(struct net_device *dev, struct udp_tunnel_nic *utn, unsigned int table, unsigned int idx) { struct udp_tunnel_nic_table_entry *entry; struct udp_tunnel_info ti; int err; entry = &utn->entries[table][idx]; if (!udp_tunnel_nic_entry_is_queued(entry)) return; udp_tunnel_nic_ti_from_entry(entry, &ti); if (entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD) err = dev->udp_tunnel_nic_info->set_port(dev, table, idx, &ti); else err = dev->udp_tunnel_nic_info->unset_port(dev, table, idx, &ti); udp_tunnel_nic_entry_update_done(entry, err); if (err) netdev_warn(dev, "UDP tunnel port sync failed port %d type %s: %d\n", be16_to_cpu(entry->port), udp_tunnel_nic_tunnel_type_name(entry->type), err); } static void udp_tunnel_nic_device_sync_by_port(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) udp_tunnel_nic_device_sync_one(dev, utn, i, j); } static void udp_tunnel_nic_device_sync_by_table(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i, j; int err; for (i = 0; i < utn->n_tables; i++) { /* Find something that needs sync in this table */ for (j = 0; j < info->tables[i].n_entries; j++) if (udp_tunnel_nic_entry_is_queued(&utn->entries[i][j])) break; if (j == info->tables[i].n_entries) continue; err = info->sync_table(dev, i); if (err) netdev_warn(dev, "UDP tunnel port sync failed for table %d: %d\n", i, err); for (j = 0; j < info->tables[i].n_entries; j++) { struct udp_tunnel_nic_table_entry *entry; entry = &utn->entries[i][j]; if (udp_tunnel_nic_entry_is_queued(entry)) udp_tunnel_nic_entry_update_done(entry, err); } } } static void __udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn) { if (!utn->need_sync) return; if (dev->udp_tunnel_nic_info->sync_table) udp_tunnel_nic_device_sync_by_table(dev, utn); else udp_tunnel_nic_device_sync_by_port(dev, utn); utn->need_sync = 0; /* Can't replay directly here, in case we come from the tunnel driver's * notification - trying to replay may deadlock inside tunnel driver. */ utn->need_replay = udp_tunnel_nic_should_replay(dev, utn); } static void udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; bool may_sleep; if (!utn->need_sync) return; /* Drivers which sleep in the callback need to update from * the workqueue, if we come from the tunnel driver's notification. */ may_sleep = info->flags & UDP_TUNNEL_NIC_INFO_MAY_SLEEP; if (!may_sleep) __udp_tunnel_nic_device_sync(dev, utn); if (may_sleep || utn->need_replay) { queue_work(udp_tunnel_nic_workqueue, &utn->work); utn->work_pending = 1; } } static bool udp_tunnel_nic_table_is_capable(const struct udp_tunnel_nic_table_info *table, struct udp_tunnel_info *ti) { return table->tunnel_types & ti->type; } static bool udp_tunnel_nic_is_capable(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i; /* Special case IPv4-only NICs */ if (info->flags & UDP_TUNNEL_NIC_INFO_IPV4_ONLY && ti->sa_family != AF_INET) return false; for (i = 0; i < utn->n_tables; i++) if (udp_tunnel_nic_table_is_capable(&info->tables[i], ti)) return true; return false; } static int udp_tunnel_nic_has_collision(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic_table_entry *entry; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) { entry = &utn->entries[i][j]; if (!udp_tunnel_nic_entry_is_free(entry) && entry->port == ti->port && entry->type != ti->type) { __set_bit(i, &utn->missed); return true; } } return false; } static void udp_tunnel_nic_entry_adj(struct udp_tunnel_nic *utn, unsigned int table, unsigned int idx, int use_cnt_adj) { struct udp_tunnel_nic_table_entry *entry = &utn->entries[table][idx]; bool dodgy = entry->flags & UDP_TUNNEL_NIC_ENTRY_OP_FAIL; unsigned int from, to; WARN_ON(entry->use_cnt + (u32)use_cnt_adj > U16_MAX); /* If not going from used to unused or vice versa - all done. * For dodgy entries make sure we try to sync again (queue the entry). */ entry->use_cnt += use_cnt_adj; if (!dodgy && !entry->use_cnt == !(entry->use_cnt - use_cnt_adj)) return; /* Cancel the op before it was sent to the device, if possible, * otherwise we'd need to take special care to issue commands * in the same order the ports arrived. */ if (use_cnt_adj < 0) { from = UDP_TUNNEL_NIC_ENTRY_ADD; to = UDP_TUNNEL_NIC_ENTRY_DEL; } else { from = UDP_TUNNEL_NIC_ENTRY_DEL; to = UDP_TUNNEL_NIC_ENTRY_ADD; } if (entry->flags & from) { entry->flags &= ~from; if (!dodgy) return; } udp_tunnel_nic_entry_queue(utn, entry, to); } static bool udp_tunnel_nic_entry_try_adj(struct udp_tunnel_nic *utn, unsigned int table, unsigned int idx, struct udp_tunnel_info *ti, int use_cnt_adj) { struct udp_tunnel_nic_table_entry *entry = &utn->entries[table][idx]; if (udp_tunnel_nic_entry_is_free(entry) || entry->port != ti->port || entry->type != ti->type) return false; if (udp_tunnel_nic_entry_is_frozen(entry)) return true; udp_tunnel_nic_entry_adj(utn, table, idx, use_cnt_adj); return true; } /* Try to find existing matching entry and adjust its use count, instead of * adding a new one. Returns true if entry was found. In case of delete the * entry may have gotten removed in the process, in which case it will be * queued for removal. */ static bool udp_tunnel_nic_try_existing(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti, int use_cnt_adj) { const struct udp_tunnel_nic_table_info *table; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) { table = &dev->udp_tunnel_nic_info->tables[i]; if (!udp_tunnel_nic_table_is_capable(table, ti)) continue; for (j = 0; j < table->n_entries; j++) if (udp_tunnel_nic_entry_try_adj(utn, i, j, ti, use_cnt_adj)) return true; } return false; } static bool udp_tunnel_nic_add_existing(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { return udp_tunnel_nic_try_existing(dev, utn, ti, +1); } static bool udp_tunnel_nic_del_existing(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { return udp_tunnel_nic_try_existing(dev, utn, ti, -1); } static bool udp_tunnel_nic_add_new(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { const struct udp_tunnel_nic_table_info *table; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) { table = &dev->udp_tunnel_nic_info->tables[i]; if (!udp_tunnel_nic_table_is_capable(table, ti)) continue; for (j = 0; j < table->n_entries; j++) { struct udp_tunnel_nic_table_entry *entry; entry = &utn->entries[i][j]; if (!udp_tunnel_nic_entry_is_free(entry)) continue; entry->port = ti->port; entry->type = ti->type; entry->use_cnt = 1; udp_tunnel_nic_entry_queue(utn, entry, UDP_TUNNEL_NIC_ENTRY_ADD); return true; } /* The different table may still fit this port in, but there * are no devices currently which have multiple tables accepting * the same tunnel type, and false positives are okay. */ __set_bit(i, &utn->missed); } return false; } static void __udp_tunnel_nic_add_port(struct net_device *dev, struct udp_tunnel_info *ti) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic *utn; utn = dev->udp_tunnel_nic; if (!utn) return; if (!netif_running(dev) && info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY) return; if (info->flags & UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN && ti->port == htons(IANA_VXLAN_UDP_PORT)) { if (ti->type != UDP_TUNNEL_TYPE_VXLAN) netdev_warn(dev, "device assumes port 4789 will be used by vxlan tunnels\n"); return; } if (!udp_tunnel_nic_is_capable(dev, utn, ti)) return; /* It may happen that a tunnel of one type is removed and different * tunnel type tries to reuse its port before the device was informed. * Rely on utn->missed to re-add this port later. */ if (udp_tunnel_nic_has_collision(dev, utn, ti)) return; if (!udp_tunnel_nic_add_existing(dev, utn, ti)) udp_tunnel_nic_add_new(dev, utn, ti); udp_tunnel_nic_device_sync(dev, utn); } static void __udp_tunnel_nic_del_port(struct net_device *dev, struct udp_tunnel_info *ti) { struct udp_tunnel_nic *utn; utn = dev->udp_tunnel_nic; if (!utn) return; if (!udp_tunnel_nic_is_capable(dev, utn, ti)) return; udp_tunnel_nic_del_existing(dev, utn, ti); udp_tunnel_nic_device_sync(dev, utn); } static void __udp_tunnel_nic_reset_ntf(struct net_device *dev) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic *utn; unsigned int i, j; ASSERT_RTNL(); utn = dev->udp_tunnel_nic; if (!utn) return; utn->need_sync = false; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) { struct udp_tunnel_nic_table_entry *entry; entry = &utn->entries[i][j]; entry->flags &= ~(UDP_TUNNEL_NIC_ENTRY_DEL | UDP_TUNNEL_NIC_ENTRY_OP_FAIL); /* We don't release rtnl across ops */ WARN_ON(entry->flags & UDP_TUNNEL_NIC_ENTRY_FROZEN); if (!entry->use_cnt) continue; udp_tunnel_nic_entry_queue(utn, entry, UDP_TUNNEL_NIC_ENTRY_ADD); } __udp_tunnel_nic_device_sync(dev, utn); } static size_t __udp_tunnel_nic_dump_size(struct net_device *dev, unsigned int table) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic *utn; unsigned int j; size_t size; utn = dev->udp_tunnel_nic; if (!utn) return 0; size = 0; for (j = 0; j < info->tables[table].n_entries; j++) { if (!udp_tunnel_nic_entry_is_present(&utn->entries[table][j])) continue; size += nla_total_size(0) + /* _TABLE_ENTRY */ nla_total_size(sizeof(__be16)) + /* _ENTRY_PORT */ nla_total_size(sizeof(u32)); /* _ENTRY_TYPE */ } return size; } static int __udp_tunnel_nic_dump_write(struct net_device *dev, unsigned int table, struct sk_buff *skb) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic *utn; struct nlattr *nest; unsigned int j; utn = dev->udp_tunnel_nic; if (!utn) return 0; for (j = 0; j < info->tables[table].n_entries; j++) { if (!udp_tunnel_nic_entry_is_present(&utn->entries[table][j])) continue; nest = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY); if (!nest) return -EMSGSIZE; if (nla_put_be16(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT, utn->entries[table][j].port) || nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE, ilog2(utn->entries[table][j].type))) goto err_cancel; nla_nest_end(skb, nest); } return 0; err_cancel: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static const struct udp_tunnel_nic_ops __udp_tunnel_nic_ops = { .get_port = __udp_tunnel_nic_get_port, .set_port_priv = __udp_tunnel_nic_set_port_priv, .add_port = __udp_tunnel_nic_add_port, .del_port = __udp_tunnel_nic_del_port, .reset_ntf = __udp_tunnel_nic_reset_ntf, .dump_size = __udp_tunnel_nic_dump_size, .dump_write = __udp_tunnel_nic_dump_write, }; static void udp_tunnel_nic_flush(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) { int adj_cnt = -utn->entries[i][j].use_cnt; if (adj_cnt) udp_tunnel_nic_entry_adj(utn, i, j, adj_cnt); } __udp_tunnel_nic_device_sync(dev, utn); for (i = 0; i < utn->n_tables; i++) memset(utn->entries[i], 0, array_size(info->tables[i].n_entries, sizeof(**utn->entries))); WARN_ON(utn->need_sync); utn->need_replay = 0; } static void udp_tunnel_nic_replay(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic_shared_node *node; unsigned int i, j; /* Freeze all the ports we are already tracking so that the replay * does not double up the refcount. */ for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) udp_tunnel_nic_entry_freeze_used(&utn->entries[i][j]); utn->missed = 0; utn->need_replay = 0; if (!info->shared) { udp_tunnel_get_rx_info(dev); } else { list_for_each_entry(node, &info->shared->devices, list) udp_tunnel_get_rx_info(node->dev); } for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) udp_tunnel_nic_entry_unfreeze(&utn->entries[i][j]); } static void udp_tunnel_nic_device_sync_work(struct work_struct *work) { struct udp_tunnel_nic *utn = container_of(work, struct udp_tunnel_nic, work); rtnl_lock(); utn->work_pending = 0; __udp_tunnel_nic_device_sync(utn->dev, utn); if (utn->need_replay) udp_tunnel_nic_replay(utn->dev, utn); rtnl_unlock(); } static struct udp_tunnel_nic * udp_tunnel_nic_alloc(const struct udp_tunnel_nic_info *info, unsigned int n_tables) { struct udp_tunnel_nic *utn; unsigned int i; utn = kzalloc(sizeof(*utn), GFP_KERNEL); if (!utn) return NULL; utn->n_tables = n_tables; INIT_WORK(&utn->work, udp_tunnel_nic_device_sync_work); utn->entries = kmalloc_array(n_tables, sizeof(void *), GFP_KERNEL); if (!utn->entries) goto err_free_utn; for (i = 0; i < n_tables; i++) { utn->entries[i] = kcalloc(info->tables[i].n_entries, sizeof(*utn->entries[i]), GFP_KERNEL); if (!utn->entries[i]) goto err_free_prev_entries; } return utn; err_free_prev_entries: while (i--) kfree(utn->entries[i]); kfree(utn->entries); err_free_utn: kfree(utn); return NULL; } static void udp_tunnel_nic_free(struct udp_tunnel_nic *utn) { unsigned int i; for (i = 0; i < utn->n_tables; i++) kfree(utn->entries[i]); kfree(utn->entries); kfree(utn); } static int udp_tunnel_nic_register(struct net_device *dev) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic_shared_node *node = NULL; struct udp_tunnel_nic *utn; unsigned int n_tables, i; BUILD_BUG_ON(sizeof(utn->missed) * BITS_PER_BYTE < UDP_TUNNEL_NIC_MAX_TABLES); /* Expect use count of at most 2 (IPv4, IPv6) per device */ BUILD_BUG_ON(UDP_TUNNEL_NIC_USE_CNT_MAX < UDP_TUNNEL_NIC_MAX_SHARING_DEVICES * 2); /* Check that the driver info is sane */ if (WARN_ON(!info->set_port != !info->unset_port) || WARN_ON(!info->set_port == !info->sync_table) || WARN_ON(!info->tables[0].n_entries)) return -EINVAL; if (WARN_ON(info->shared && info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) return -EINVAL; n_tables = 1; for (i = 1; i < UDP_TUNNEL_NIC_MAX_TABLES; i++) { if (!info->tables[i].n_entries) continue; n_tables++; if (WARN_ON(!info->tables[i - 1].n_entries)) return -EINVAL; } /* Create UDP tunnel state structures */ if (info->shared) { node = kzalloc(sizeof(*node), GFP_KERNEL); if (!node) return -ENOMEM; node->dev = dev; } if (info->shared && info->shared->udp_tunnel_nic_info) { utn = info->shared->udp_tunnel_nic_info; } else { utn = udp_tunnel_nic_alloc(info, n_tables); if (!utn) { kfree(node); return -ENOMEM; } } if (info->shared) { if (!info->shared->udp_tunnel_nic_info) { INIT_LIST_HEAD(&info->shared->devices); info->shared->udp_tunnel_nic_info = utn; } list_add_tail(&node->list, &info->shared->devices); } utn->dev = dev; dev_hold(dev); dev->udp_tunnel_nic = utn; if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) udp_tunnel_get_rx_info(dev); return 0; } static void udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; /* For a shared table remove this dev from the list of sharing devices * and if there are other devices just detach. */ if (info->shared) { struct udp_tunnel_nic_shared_node *node, *first; list_for_each_entry(node, &info->shared->devices, list) if (node->dev == dev) break; if (list_entry_is_head(node, &info->shared->devices, list)) return; list_del(&node->list); kfree(node); first = list_first_entry_or_null(&info->shared->devices, typeof(*first), list); if (first) { udp_tunnel_drop_rx_info(dev); utn->dev = first->dev; goto release_dev; } info->shared->udp_tunnel_nic_info = NULL; } /* Flush before we check work, so we don't waste time adding entries * from the work which we will boot immediately. */ udp_tunnel_nic_flush(dev, utn); /* Wait for the work to be done using the state, netdev core will * retry unregister until we give up our reference on this device. */ if (utn->work_pending) return; udp_tunnel_nic_free(utn); release_dev: dev->udp_tunnel_nic = NULL; dev_put(dev); } static int udp_tunnel_nic_netdevice_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); const struct udp_tunnel_nic_info *info; struct udp_tunnel_nic *utn; info = dev->udp_tunnel_nic_info; if (!info) return NOTIFY_DONE; if (event == NETDEV_REGISTER) { int err; err = udp_tunnel_nic_register(dev); if (err) netdev_WARN(dev, "failed to register for UDP tunnel offloads: %d", err); return notifier_from_errno(err); } /* All other events will need the udp_tunnel_nic state */ utn = dev->udp_tunnel_nic; if (!utn) return NOTIFY_DONE; if (event == NETDEV_UNREGISTER) { udp_tunnel_nic_unregister(dev, utn); return NOTIFY_OK; } /* All other events only matter if NIC has to be programmed open */ if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) return NOTIFY_DONE; if (event == NETDEV_UP) { WARN_ON(!udp_tunnel_nic_is_empty(dev, utn)); udp_tunnel_get_rx_info(dev); return NOTIFY_OK; } if (event == NETDEV_GOING_DOWN) { udp_tunnel_nic_flush(dev, utn); return NOTIFY_OK; } return NOTIFY_DONE; } static struct notifier_block udp_tunnel_nic_notifier_block __read_mostly = { .notifier_call = udp_tunnel_nic_netdevice_event, }; static int __init udp_tunnel_nic_init_module(void) { int err; udp_tunnel_nic_workqueue = alloc_ordered_workqueue("udp_tunnel_nic", 0); if (!udp_tunnel_nic_workqueue) return -ENOMEM; rtnl_lock(); udp_tunnel_nic_ops = &__udp_tunnel_nic_ops; rtnl_unlock(); err = register_netdevice_notifier(&udp_tunnel_nic_notifier_block); if (err) goto err_unset_ops; return 0; err_unset_ops: rtnl_lock(); udp_tunnel_nic_ops = NULL; rtnl_unlock(); destroy_workqueue(udp_tunnel_nic_workqueue); return err; } late_initcall(udp_tunnel_nic_init_module); static void __exit udp_tunnel_nic_cleanup_module(void) { unregister_netdevice_notifier(&udp_tunnel_nic_notifier_block); rtnl_lock(); udp_tunnel_nic_ops = NULL; rtnl_unlock(); destroy_workqueue(udp_tunnel_nic_workqueue); } module_exit(udp_tunnel_nic_cleanup_module); MODULE_LICENSE("GPL");
1191 1191 1191 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 // SPDX-License-Identifier: GPL-2.0-only /* * fs/dax.c - Direct Access filesystem code * Copyright (c) 2013-2014 Intel Corporation * Author: Matthew Wilcox <matthew.r.wilcox@intel.com> * Author: Ross Zwisler <ross.zwisler@linux.intel.com> */ #include <linux/atomic.h> #include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/dax.h> #include <linux/fs.h> #include <linux/highmem.h> #include <linux/memcontrol.h> #include <linux/mm.h> #include <linux/mutex.h> #include <linux/pagevec.h> #include <linux/sched.h> #include <linux/sched/signal.h> #include <linux/uio.h> #include <linux/vmstat.h> #include <linux/pfn_t.h> #include <linux/sizes.h> #include <linux/mmu_notifier.h> #include <linux/iomap.h> #include <linux/rmap.h> #include <asm/pgalloc.h> #define CREATE_TRACE_POINTS #include <trace/events/fs_dax.h> /* We choose 4096 entries - same as per-zone page wait tables */ #define DAX_WAIT_TABLE_BITS 12 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) /* The 'colour' (ie low bits) within a PMD of a page offset. */ #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) #define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT) static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; static int __init init_dax_wait_table(void) { int i; for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++) init_waitqueue_head(wait_table + i); return 0; } fs_initcall(init_dax_wait_table); /* * DAX pagecache entries use XArray value entries so they can't be mistaken * for pages. We use one bit for locking, one bit for the entry size (PMD) * and two more to tell us if the entry is a zero page or an empty entry that * is just used for locking. In total four special bits. * * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem * block allocation. */ #define DAX_SHIFT (4) #define DAX_LOCKED (1UL << 0) #define DAX_PMD (1UL << 1) #define DAX_ZERO_PAGE (1UL << 2) #define DAX_EMPTY (1UL << 3) static unsigned long dax_to_pfn(void *entry) { return xa_to_value(entry) >> DAX_SHIFT; } static void *dax_make_entry(pfn_t pfn, unsigned long flags) { return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT)); } static bool dax_is_locked(void *entry) { return xa_to_value(entry) & DAX_LOCKED; } static unsigned int dax_entry_order(void *entry) { if (xa_to_value(entry) & DAX_PMD) return PMD_ORDER; return 0; } static unsigned long dax_is_pmd_entry(void *entry) { return xa_to_value(entry) & DAX_PMD; } static bool dax_is_pte_entry(void *entry) { return !(xa_to_value(entry) & DAX_PMD); } static int dax_is_zero_entry(void *entry) { return xa_to_value(entry) & DAX_ZERO_PAGE; } static int dax_is_empty_entry(void *entry) { return xa_to_value(entry) & DAX_EMPTY; } /* * true if the entry that was found is of a smaller order than the entry * we were looking for */ static bool dax_is_conflict(void *entry) { return entry == XA_RETRY_ENTRY; } /* * DAX page cache entry locking */ struct exceptional_entry_key { struct xarray *xa; pgoff_t entry_start; }; struct wait_exceptional_entry_queue { wait_queue_entry_t wait; struct exceptional_entry_key key; }; /** * enum dax_wake_mode: waitqueue wakeup behaviour * @WAKE_ALL: wake all waiters in the waitqueue * @WAKE_NEXT: wake only the first waiter in the waitqueue */ enum dax_wake_mode { WAKE_ALL, WAKE_NEXT, }; static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas, void *entry, struct exceptional_entry_key *key) { unsigned long hash; unsigned long index = xas->xa_index; /* * If 'entry' is a PMD, align the 'index' that we use for the wait * queue to the start of that PMD. This ensures that all offsets in * the range covered by the PMD map to the same bit lock. */ if (dax_is_pmd_entry(entry)) index &= ~PG_PMD_COLOUR; key->xa = xas->xa; key->entry_start = index; hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS); return wait_table + hash; } static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mode, int sync, void *keyp) { struct exceptional_entry_key *key = keyp; struct wait_exceptional_entry_queue *ewait = container_of(wait, struct wait_exceptional_entry_queue, wait); if (key->xa != ewait->key.xa || key->entry_start != ewait->key.entry_start) return 0; return autoremove_wake_function(wait, mode, sync, NULL); } /* * @entry may no longer be the entry at the index in the mapping. * The important information it's conveying is whether the entry at * this index used to be a PMD entry. */ static void dax_wake_entry(struct xa_state *xas, void *entry, enum dax_wake_mode mode) { struct exceptional_entry_key key; wait_queue_head_t *wq; wq = dax_entry_waitqueue(xas, entry, &key); /* * Checking for locked entry and prepare_to_wait_exclusive() happens * under the i_pages lock, ditto for entry handling in our callers. * So at this point all tasks that could have seen our entry locked * must be in the waitqueue and the following check will see them. */ if (waitqueue_active(wq)) __wake_up(wq, TASK_NORMAL, mode == WAKE_ALL ? 0 : 1, &key); } /* * Look up entry in page cache, wait for it to become unlocked if it * is a DAX entry and return it. The caller must subsequently call * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry() * if it did. The entry returned may have a larger order than @order. * If @order is larger than the order of the entry found in i_pages, this * function returns a dax_is_conflict entry. * * Must be called with the i_pages lock held. */ static void *get_unlocked_entry(struct xa_state *xas, unsigned int order) { void *entry; struct wait_exceptional_entry_queue ewait; wait_queue_head_t *wq; init_wait(&ewait.wait); ewait.wait.func = wake_exceptional_entry_func; for (;;) { entry = xas_find_conflict(xas); if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) return entry; if (dax_entry_order(entry) < order) return XA_RETRY_ENTRY; if (!dax_is_locked(entry)) return entry; wq = dax_entry_waitqueue(xas, entry, &ewait.key); prepare_to_wait_exclusive(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); xas_unlock_irq(xas); xas_reset(xas); schedule(); finish_wait(wq, &ewait.wait); xas_lock_irq(xas); } } /* * The only thing keeping the address space around is the i_pages lock * (it's cycled in clear_inode() after removing the entries from i_pages) * After we call xas_unlock_irq(), we cannot touch xas->xa. */ static void wait_entry_unlocked(struct xa_state *xas, void *entry) { struct wait_exceptional_entry_queue ewait; wait_queue_head_t *wq; init_wait(&ewait.wait); ewait.wait.func = wake_exceptional_entry_func; wq = dax_entry_waitqueue(xas, entry, &ewait.key); /* * Unlike get_unlocked_entry() there is no guarantee that this * path ever successfully retrieves an unlocked entry before an * inode dies. Perform a non-exclusive wait in case this path * never successfully performs its own wake up. */ prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); xas_unlock_irq(xas); schedule(); finish_wait(wq, &ewait.wait); } static void put_unlocked_entry(struct xa_state *xas, void *entry, enum dax_wake_mode mode) { if (entry && !dax_is_conflict(entry)) dax_wake_entry(xas, entry, mode); } /* * We used the xa_state to get the entry, but then we locked the entry and * dropped the xa_lock, so we know the xa_state is stale and must be reset * before use. */ static void dax_unlock_entry(struct xa_state *xas, void *entry) { void *old; BUG_ON(dax_is_locked(entry)); xas_reset(xas); xas_lock_irq(xas); old = xas_store(xas, entry); xas_unlock_irq(xas); BUG_ON(!dax_is_locked(old)); dax_wake_entry(xas, entry, WAKE_NEXT); } /* * Return: The entry stored at this location before it was locked. */ static void *dax_lock_entry(struct xa_state *xas, void *entry) { unsigned long v = xa_to_value(entry); return xas_store(xas, xa_mk_value(v | DAX_LOCKED)); } static unsigned long dax_entry_size(void *entry) { if (dax_is_zero_entry(entry)) return 0; else if (dax_is_empty_entry(entry)) return 0; else if (dax_is_pmd_entry(entry)) return PMD_SIZE; else return PAGE_SIZE; } static unsigned long dax_end_pfn(void *entry) { return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE; } /* * Iterate through all mapped pfns represented by an entry, i.e. skip * 'empty' and 'zero' entries. */ #define for_each_mapped_pfn(entry, pfn) \ for (pfn = dax_to_pfn(entry); \ pfn < dax_end_pfn(entry); pfn++) static inline bool dax_page_is_shared(struct page *page) { return page->mapping == PAGE_MAPPING_DAX_SHARED; } /* * Set the page->mapping with PAGE_MAPPING_DAX_SHARED flag, increase the * refcount. */ static inline void dax_page_share_get(struct page *page) { if (page->mapping != PAGE_MAPPING_DAX_SHARED) { /* * Reset the index if the page was already mapped * regularly before. */ if (page->mapping) page->share = 1; page->mapping = PAGE_MAPPING_DAX_SHARED; } page->share++; } static inline unsigned long dax_page_share_put(struct page *page) { return --page->share; } /* * When it is called in dax_insert_entry(), the shared flag will indicate that * whether this entry is shared by multiple files. If so, set the page->mapping * PAGE_MAPPING_DAX_SHARED, and use page->share as refcount. */ static void dax_associate_entry(void *entry, struct address_space *mapping, struct vm_area_struct *vma, unsigned long address, bool shared) { unsigned long size = dax_entry_size(entry), pfn, index; int i = 0; if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) return; index = linear_page_index(vma, address & ~(size - 1)); for_each_mapped_pfn(entry, pfn) { struct page *page = pfn_to_page(pfn); if (shared) { dax_page_share_get(page); } else { WARN_ON_ONCE(page->mapping); page->mapping = mapping; page->index = index + i++; } } } static void dax_disassociate_entry(void *entry, struct address_space *mapping, bool trunc) { unsigned long pfn; if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) return; for_each_mapped_pfn(entry, pfn) { struct page *page = pfn_to_page(pfn); WARN_ON_ONCE(trunc && page_ref_count(page) > 1); if (dax_page_is_shared(page)) { /* keep the shared flag if this page is still shared */ if (dax_page_share_put(page) > 0) continue; } else WARN_ON_ONCE(page->mapping && page->mapping != mapping); page->mapping = NULL; page->index = 0; } } static struct page *dax_busy_page(void *entry) { unsigned long pfn; for_each_mapped_pfn(entry, pfn) { struct page *page = pfn_to_page(pfn); if (page_ref_count(page) > 1) return page; } return NULL; } /* * dax_lock_page - Lock the DAX entry corresponding to a page * @page: The page whose entry we want to lock * * Context: Process context. * Return: A cookie to pass to dax_unlock_page() or 0 if the entry could * not be locked. */ dax_entry_t dax_lock_page(struct page *page) { XA_STATE(xas, NULL, 0); void *entry; /* Ensure page->mapping isn't freed while we look at it */ rcu_read_lock(); for (;;) { struct address_space *mapping = READ_ONCE(page->mapping); entry = NULL; if (!mapping || !dax_mapping(mapping)) break; /* * In the device-dax case there's no need to lock, a * struct dev_pagemap pin is sufficient to keep the * inode alive, and we assume we have dev_pagemap pin * otherwise we would not have a valid pfn_to_page() * translation. */ entry = (void *)~0UL; if (S_ISCHR(mapping->host->i_mode)) break; xas.xa = &mapping->i_pages; xas_lock_irq(&xas); if (mapping != page->mapping) { xas_unlock_irq(&xas); continue; } xas_set(&xas, page->index); entry = xas_load(&xas); if (dax_is_locked(entry)) { rcu_read_unlock(); wait_entry_unlocked(&xas, entry); rcu_read_lock(); continue; } dax_lock_entry(&xas, entry); xas_unlock_irq(&xas); break; } rcu_read_unlock(); return (dax_entry_t)entry; } void dax_unlock_page(struct page *page, dax_entry_t cookie) { struct address_space *mapping = page->mapping; XA_STATE(xas, &mapping->i_pages, page->index); if (S_ISCHR(mapping->host->i_mode)) return; dax_unlock_entry(&xas, (void *)cookie); } /* * dax_lock_mapping_entry - Lock the DAX entry corresponding to a mapping * @mapping: the file's mapping whose entry we want to lock * @index: the offset within this file * @page: output the dax page corresponding to this dax entry * * Return: A cookie to pass to dax_unlock_mapping_entry() or 0 if the entry * could not be locked. */ dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, pgoff_t index, struct page **page) { XA_STATE(xas, NULL, 0); void *entry; rcu_read_lock(); for (;;) { entry = NULL; if (!dax_mapping(mapping)) break; xas.xa = &mapping->i_pages; xas_lock_irq(&xas); xas_set(&xas, index); entry = xas_load(&xas); if (dax_is_locked(entry)) { rcu_read_unlock(); wait_entry_unlocked(&xas, entry); rcu_read_lock(); continue; } if (!entry || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { /* * Because we are looking for entry from file's mapping * and index, so the entry may not be inserted for now, * or even a zero/empty entry. We don't think this is * an error case. So, return a special value and do * not output @page. */ entry = (void *)~0UL; } else { *page = pfn_to_page(dax_to_pfn(entry)); dax_lock_entry(&xas, entry); } xas_unlock_irq(&xas); break; } rcu_read_unlock(); return (dax_entry_t)entry; } void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index, dax_entry_t cookie) { XA_STATE(xas, &mapping->i_pages, index); if (cookie == ~0UL) return; dax_unlock_entry(&xas, (void *)cookie); } /* * Find page cache entry at given index. If it is a DAX entry, return it * with the entry locked. If the page cache doesn't contain an entry at * that index, add a locked empty entry. * * When requesting an entry with size DAX_PMD, grab_mapping_entry() will * either return that locked entry or will return VM_FAULT_FALLBACK. * This will happen if there are any PTE entries within the PMD range * that we are requesting. * * We always favor PTE entries over PMD entries. There isn't a flow where we * evict PTE entries in order to 'upgrade' them to a PMD entry. A PMD * insertion will fail if it finds any PTE entries already in the tree, and a * PTE insertion will cause an existing PMD entry to be unmapped and * downgraded to PTE entries. This happens for both PMD zero pages as * well as PMD empty entries. * * The exception to this downgrade path is for PMD entries that have * real storage backing them. We will leave these real PMD entries in * the tree, and PTE writes will simply dirty the entire PMD entry. * * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For * persistent memory the benefit is doubtful. We can add that later if we can * show it helps. * * On error, this function does not return an ERR_PTR. Instead it returns * a VM_FAULT code, encoded as an xarray internal entry. The ERR_PTR values * overlap with xarray value entries. */ static void *grab_mapping_entry(struct xa_state *xas, struct address_space *mapping, unsigned int order) { unsigned long index = xas->xa_index; bool pmd_downgrade; /* splitting PMD entry into PTE entries? */ void *entry; retry: pmd_downgrade = false; xas_lock_irq(xas); entry = get_unlocked_entry(xas, order); if (entry) { if (dax_is_conflict(entry)) goto fallback; if (!xa_is_value(entry)) { xas_set_err(xas, -EIO); goto out_unlock; } if (order == 0) { if (dax_is_pmd_entry(entry) && (dax_is_zero_entry(entry) || dax_is_empty_entry(entry))) { pmd_downgrade = true; } } } if (pmd_downgrade) { /* * Make sure 'entry' remains valid while we drop * the i_pages lock. */ dax_lock_entry(xas, entry); /* * Besides huge zero pages the only other thing that gets * downgraded are empty entries which don't need to be * unmapped. */ if (dax_is_zero_entry(entry)) { xas_unlock_irq(xas); unmap_mapping_pages(mapping, xas->xa_index & ~PG_PMD_COLOUR, PG_PMD_NR, false); xas_reset(xas); xas_lock_irq(xas); } dax_disassociate_entry(entry, mapping, false); xas_store(xas, NULL); /* undo the PMD join */ dax_wake_entry(xas, entry, WAKE_ALL); mapping->nrpages -= PG_PMD_NR; entry = NULL; xas_set(xas, index); } if (entry) { dax_lock_entry(xas, entry); } else { unsigned long flags = DAX_EMPTY; if (order > 0) flags |= DAX_PMD; entry = dax_make_entry(pfn_to_pfn_t(0), flags); dax_lock_entry(xas, entry); if (xas_error(xas)) goto out_unlock; mapping->nrpages += 1UL << order; } out_unlock: xas_unlock_irq(xas); if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM)) goto retry; if (xas->xa_node == XA_ERROR(-ENOMEM)) return xa_mk_internal(VM_FAULT_OOM); if (xas_error(xas)) return xa_mk_internal(VM_FAULT_SIGBUS); return entry; fallback: xas_unlock_irq(xas); return xa_mk_internal(VM_FAULT_FALLBACK); } /** * dax_layout_busy_page_range - find first pinned page in @mapping * @mapping: address space to scan for a page with ref count > 1 * @start: Starting offset. Page containing 'start' is included. * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX, * pages from 'start' till the end of file are included. * * DAX requires ZONE_DEVICE mapped pages. These pages are never * 'onlined' to the page allocator so they are considered idle when * page->count == 1. A filesystem uses this interface to determine if * any page in the mapping is busy, i.e. for DMA, or other * get_user_pages() usages. * * It is expected that the filesystem is holding locks to block the * establishment of new mappings in this address_space. I.e. it expects * to be able to run unmap_mapping_range() and subsequently not race * mapping_mapped() becoming true. */ struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end) { void *entry; unsigned int scanned = 0; struct page *page = NULL; pgoff_t start_idx = start >> PAGE_SHIFT; pgoff_t end_idx; XA_STATE(xas, &mapping->i_pages, start_idx); /* * In the 'limited' case get_user_pages() for dax is disabled. */ if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) return NULL; if (!dax_mapping(mapping) || !mapping_mapped(mapping)) return NULL; /* If end == LLONG_MAX, all pages from start to till end of file */ if (end == LLONG_MAX) end_idx = ULONG_MAX; else end_idx = end >> PAGE_SHIFT; /* * If we race get_user_pages_fast() here either we'll see the * elevated page count in the iteration and wait, or * get_user_pages_fast() will see that the page it took a reference * against is no longer mapped in the page tables and bail to the * get_user_pages() slow path. The slow path is protected by * pte_lock() and pmd_lock(). New references are not taken without * holding those locks, and unmap_mapping_pages() will not zero the * pte or pmd without holding the respective lock, so we are * guaranteed to either see new references or prevent new * references from being established. */ unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0); xas_lock_irq(&xas); xas_for_each(&xas, entry, end_idx) { if (WARN_ON_ONCE(!xa_is_value(entry))) continue; if (unlikely(dax_is_locked(entry))) entry = get_unlocked_entry(&xas, 0); if (entry) page = dax_busy_page(entry); put_unlocked_entry(&xas, entry, WAKE_NEXT); if (page) break; if (++scanned % XA_CHECK_SCHED) continue; xas_pause(&xas); xas_unlock_irq(&xas); cond_resched(); xas_lock_irq(&xas); } xas_unlock_irq(&xas); return page; } EXPORT_SYMBOL_GPL(dax_layout_busy_page_range); struct page *dax_layout_busy_page(struct address_space *mapping) { return dax_layout_busy_page_range(mapping, 0, LLONG_MAX); } EXPORT_SYMBOL_GPL(dax_layout_busy_page); static int __dax_invalidate_entry(struct address_space *mapping, pgoff_t index, bool trunc) { XA_STATE(xas, &mapping->i_pages, index); int ret = 0; void *entry; xas_lock_irq(&xas); entry = get_unlocked_entry(&xas, 0); if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) goto out; if (!trunc && (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) || xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE))) goto out; dax_disassociate_entry(entry, mapping, trunc); xas_store(&xas, NULL); mapping->nrpages -= 1UL << dax_entry_order(entry); ret = 1; out: put_unlocked_entry(&xas, entry, WAKE_ALL); xas_unlock_irq(&xas); return ret; } static int __dax_clear_dirty_range(struct address_space *mapping, pgoff_t start, pgoff_t end) { XA_STATE(xas, &mapping->i_pages, start); unsigned int scanned = 0; void *entry; xas_lock_irq(&xas); xas_for_each(&xas, entry, end) { entry = get_unlocked_entry(&xas, 0); xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); put_unlocked_entry(&xas, entry, WAKE_NEXT); if (++scanned % XA_CHECK_SCHED) continue; xas_pause(&xas); xas_unlock_irq(&xas); cond_resched(); xas_lock_irq(&xas); } xas_unlock_irq(&xas); return 0; } /* * Delete DAX entry at @index from @mapping. Wait for it * to be unlocked before deleting it. */ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) { int ret = __dax_invalidate_entry(mapping, index, true); /* * This gets called from truncate / punch_hole path. As such, the caller * must hold locks protecting against concurrent modifications of the * page cache (usually fs-private i_mmap_sem for writing). Since the * caller has seen a DAX entry for this index, we better find it * at that index as well... */ WARN_ON_ONCE(!ret); return ret; } /* * Invalidate DAX entry if it is clean. */ int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index) { return __dax_invalidate_entry(mapping, index, false); } static pgoff_t dax_iomap_pgoff(const struct iomap *iomap, loff_t pos) { return PHYS_PFN(iomap->addr + (pos & PAGE_MASK) - iomap->offset); } static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter) { pgoff_t pgoff = dax_iomap_pgoff(&iter->iomap, iter->pos); void *vto, *kaddr; long rc; int id; id = dax_read_lock(); rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, DAX_ACCESS, &kaddr, NULL); if (rc < 0) { dax_read_unlock(id); return rc; } vto = kmap_atomic(vmf->cow_page); copy_user_page(vto, kaddr, vmf->address, vmf->cow_page); kunmap_atomic(vto); dax_read_unlock(id); return 0; } /* * MAP_SYNC on a dax mapping guarantees dirty metadata is * flushed on write-faults (non-cow), but not read-faults. */ static bool dax_fault_is_synchronous(const struct iomap_iter *iter, struct vm_area_struct *vma) { return (iter->flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) && (iter->iomap.flags & IOMAP_F_DIRTY); } /* * By this point grab_mapping_entry() has ensured that we have a locked entry * of the appropriate size so we don't have to worry about downgrading PMDs to * PTEs. If we happen to be trying to insert a PTE and there is a PMD * already in the tree, we will skip the insertion and just dirty the PMD as * appropriate. */ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf, const struct iomap_iter *iter, void *entry, pfn_t pfn, unsigned long flags) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; void *new_entry = dax_make_entry(pfn, flags); bool write = iter->flags & IOMAP_WRITE; bool dirty = write && !dax_fault_is_synchronous(iter, vmf->vma); bool shared = iter->iomap.flags & IOMAP_F_SHARED; if (dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); if (shared || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) { unsigned long index = xas->xa_index; /* we are replacing a zero page with block mapping */ if (dax_is_pmd_entry(entry)) unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR, PG_PMD_NR, false); else /* pte entry */ unmap_mapping_pages(mapping, index, 1, false); } xas_reset(xas); xas_lock_irq(xas); if (shared || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { void *old; dax_disassociate_entry(entry, mapping, false); dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address, shared); /* * Only swap our new entry into the page cache if the current * entry is a zero page or an empty entry. If a normal PTE or * PMD entry is already in the cache, we leave it alone. This * means that if we are trying to insert a PTE and the * existing entry is a PMD, we will just leave the PMD in the * tree and dirty it if necessary. */ old = dax_lock_entry(xas, new_entry); WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) | DAX_LOCKED)); entry = new_entry; } else { xas_load(xas); /* Walk the xa_state */ } if (dirty) xas_set_mark(xas, PAGECACHE_TAG_DIRTY); if (write && shared) xas_set_mark(xas, PAGECACHE_TAG_TOWRITE); xas_unlock_irq(xas); return entry; } static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, struct address_space *mapping, void *entry) { unsigned long pfn, index, count, end; long ret = 0; struct vm_area_struct *vma; /* * A page got tagged dirty in DAX mapping? Something is seriously * wrong. */ if (WARN_ON(!xa_is_value(entry))) return -EIO; if (unlikely(dax_is_locked(entry))) { void *old_entry = entry; entry = get_unlocked_entry(xas, 0); /* Entry got punched out / reallocated? */ if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) goto put_unlocked; /* * Entry got reallocated elsewhere? No need to writeback. * We have to compare pfns as we must not bail out due to * difference in lockbit or entry type. */ if (dax_to_pfn(old_entry) != dax_to_pfn(entry)) goto put_unlocked; if (WARN_ON_ONCE(dax_is_empty_entry(entry) || dax_is_zero_entry(entry))) { ret = -EIO; goto put_unlocked; } /* Another fsync thread may have already done this entry */ if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE)) goto put_unlocked; } /* Lock the entry to serialize with page faults */ dax_lock_entry(xas, entry); /* * We can clear the tag now but we have to be careful so that concurrent * dax_writeback_one() calls for the same index cannot finish before we * actually flush the caches. This is achieved as the calls will look * at the entry only under the i_pages lock and once they do that * they will see the entry locked and wait for it to unlock. */ xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE); xas_unlock_irq(xas); /* * If dax_writeback_mapping_range() was given a wbc->range_start * in the middle of a PMD, the 'index' we use needs to be * aligned to the start of the PMD. * This allows us to flush for PMD_SIZE and not have to worry about * partial PMD writebacks. */ pfn = dax_to_pfn(entry); count = 1UL << dax_entry_order(entry); index = xas->xa_index & ~(count - 1); end = index + count - 1; /* Walk all mappings of a given index of a file and writeprotect them */ i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, index, end) { pfn_mkclean_range(pfn, count, index, vma); cond_resched(); } i_mmap_unlock_read(mapping); dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE); /* * After we have flushed the cache, we can clear the dirty tag. There * cannot be new dirty data in the pfn after the flush has completed as * the pfn mappings are writeprotected and fault waits for mapping * entry lock. */ xas_reset(xas); xas_lock_irq(xas); xas_store(xas, entry); xas_clear_mark(xas, PAGECACHE_TAG_DIRTY); dax_wake_entry(xas, entry, WAKE_NEXT); trace_dax_writeback_one(mapping->host, index, count); return ret; put_unlocked: put_unlocked_entry(xas, entry, WAKE_NEXT); return ret; } /* * Flush the mapping to the persistent domain within the byte range of [start, * end]. This is required by data integrity operations to ensure file data is * on persistent storage prior to completion of the operation. */ int dax_writeback_mapping_range(struct address_space *mapping, struct dax_device *dax_dev, struct writeback_control *wbc) { XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT); struct inode *inode = mapping->host; pgoff_t end_index = wbc->range_end >> PAGE_SHIFT; void *entry; int ret = 0; unsigned int scanned = 0; if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) return -EIO; if (mapping_empty(mapping) || wbc->sync_mode != WB_SYNC_ALL) return 0; trace_dax_writeback_range(inode, xas.xa_index, end_index); tag_pages_for_writeback(mapping, xas.xa_index, end_index); xas_lock_irq(&xas); xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) { ret = dax_writeback_one(&xas, dax_dev, mapping, entry); if (ret < 0) { mapping_set_error(mapping, ret); break; } if (++scanned % XA_CHECK_SCHED) continue; xas_pause(&xas); xas_unlock_irq(&xas); cond_resched(); xas_lock_irq(&xas); } xas_unlock_irq(&xas); trace_dax_writeback_range_done(inode, xas.xa_index, end_index); return ret; } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos, size_t size, void **kaddr, pfn_t *pfnp) { pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); int id, rc = 0; long length; id = dax_read_lock(); length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size), DAX_ACCESS, kaddr, pfnp); if (length < 0) { rc = length; goto out; } if (!pfnp) goto out_check_addr; rc = -EINVAL; if (PFN_PHYS(length) < size) goto out; if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1)) goto out; /* For larger pages we need devmap */ if (length > 1 && !pfn_t_devmap(*pfnp)) goto out; rc = 0; out_check_addr: if (!kaddr) goto out; if (!*kaddr) rc = -EFAULT; out: dax_read_unlock(id); return rc; } /** * dax_iomap_copy_around - Prepare for an unaligned write to a shared/cow page * by copying the data before and after the range to be written. * @pos: address to do copy from. * @length: size of copy operation. * @align_size: aligned w.r.t align_size (either PMD_SIZE or PAGE_SIZE) * @srcmap: iomap srcmap * @daddr: destination address to copy to. * * This can be called from two places. Either during DAX write fault (page * aligned), to copy the length size data to daddr. Or, while doing normal DAX * write operation, dax_iomap_iter() might call this to do the copy of either * start or end unaligned address. In the latter case the rest of the copy of * aligned ranges is taken care by dax_iomap_iter() itself. * If the srcmap contains invalid data, such as HOLE and UNWRITTEN, zero the * area to make sure no old data remains. */ static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t align_size, const struct iomap *srcmap, void *daddr) { loff_t head_off = pos & (align_size - 1); size_t size = ALIGN(head_off + length, align_size); loff_t end = pos + length; loff_t pg_end = round_up(end, align_size); /* copy_all is usually in page fault case */ bool copy_all = head_off == 0 && end == pg_end; /* zero the edges if srcmap is a HOLE or IOMAP_UNWRITTEN */ bool zero_edge = srcmap->flags & IOMAP_F_SHARED || srcmap->type == IOMAP_UNWRITTEN; void *saddr = 0; int ret = 0; if (!zero_edge) { ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL); if (ret) return dax_mem2blk_err(ret); } if (copy_all) { if (zero_edge) memset(daddr, 0, size); else ret = copy_mc_to_kernel(daddr, saddr, length); goto out; } /* Copy the head part of the range */ if (head_off) { if (zero_edge) memset(daddr, 0, head_off); else { ret = copy_mc_to_kernel(daddr, saddr, head_off); if (ret) return -EIO; } } /* Copy the tail part of the range */ if (end < pg_end) { loff_t tail_off = head_off + length; loff_t tail_len = pg_end - end; if (zero_edge) memset(daddr + tail_off, 0, tail_len); else { ret = copy_mc_to_kernel(daddr + tail_off, saddr + tail_off, tail_len); if (ret) return -EIO; } } out: if (zero_edge) dax_flush(srcmap->dax_dev, daddr, size); return ret ? -EIO : 0; } /* * The user has performed a load from a hole in the file. Allocating a new * page in the file would cause excessive storage usage for workloads with * sparse files. Instead we insert a read-only mapping of the 4k zero page. * If this page is ever written to we will re-fault and change the mapping to * point to real DAX storage instead. */ static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf, const struct iomap_iter *iter, void **entry) { struct inode *inode = iter->inode; unsigned long vaddr = vmf->address; pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr)); vm_fault_t ret; *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE); ret = vmf_insert_mixed(vmf->vma, vaddr, pfn); trace_dax_load_hole(inode, vmf, ret); return ret; } #ifdef CONFIG_FS_DAX_PMD static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, const struct iomap_iter *iter, void **entry) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; unsigned long pmd_addr = vmf->address & PMD_MASK; struct vm_area_struct *vma = vmf->vma; struct inode *inode = mapping->host; pgtable_t pgtable = NULL; struct page *zero_page; spinlock_t *ptl; pmd_t pmd_entry; pfn_t pfn; zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm); if (unlikely(!zero_page)) goto fallback; pfn = page_to_pfn_t(zero_page); *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_PMD | DAX_ZERO_PAGE); if (arch_needs_pgtable_deposit()) { pgtable = pte_alloc_one(vma->vm_mm); if (!pgtable) return VM_FAULT_OOM; } ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); if (!pmd_none(*(vmf->pmd))) { spin_unlock(ptl); goto fallback; } if (pgtable) { pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); mm_inc_nr_ptes(vma->vm_mm); } pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot); pmd_entry = pmd_mkhuge(pmd_entry); set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry); spin_unlock(ptl); trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry); return VM_FAULT_NOPAGE; fallback: if (pgtable) pte_free(vma->vm_mm, pgtable); trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry); return VM_FAULT_FALLBACK; } #else static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, const struct iomap_iter *iter, void **entry) { return VM_FAULT_FALLBACK; } #endif /* CONFIG_FS_DAX_PMD */ static s64 dax_unshare_iter(struct iomap_iter *iter) { struct iomap *iomap = &iter->iomap; const struct iomap *srcmap = iomap_iter_srcmap(iter); loff_t pos = iter->pos; loff_t length = iomap_length(iter); int id = 0; s64 ret = 0; void *daddr = NULL, *saddr = NULL; /* don't bother with blocks that are not shared to start with */ if (!(iomap->flags & IOMAP_F_SHARED)) return length; id = dax_read_lock(); ret = dax_iomap_direct_access(iomap, pos, length, &daddr, NULL); if (ret < 0) goto out_unlock; /* zero the distance if srcmap is HOLE or UNWRITTEN */ if (srcmap->flags & IOMAP_F_SHARED || srcmap->type == IOMAP_UNWRITTEN) { memset(daddr, 0, length); dax_flush(iomap->dax_dev, daddr, length); ret = length; goto out_unlock; } ret = dax_iomap_direct_access(srcmap, pos, length, &saddr, NULL); if (ret < 0) goto out_unlock; if (copy_mc_to_kernel(daddr, saddr, length) == 0) ret = length; else ret = -EIO; out_unlock: dax_read_unlock(id); return dax_mem2blk_err(ret); } int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops) { struct iomap_iter iter = { .inode = inode, .pos = pos, .len = len, .flags = IOMAP_WRITE | IOMAP_UNSHARE | IOMAP_DAX, }; int ret; while ((ret = iomap_iter(&iter, ops)) > 0) iter.processed = dax_unshare_iter(&iter); return ret; } EXPORT_SYMBOL_GPL(dax_file_unshare); static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size) { const struct iomap *iomap = &iter->iomap; const struct iomap *srcmap = iomap_iter_srcmap(iter); unsigned offset = offset_in_page(pos); pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); void *kaddr; long ret; ret = dax_direct_access(iomap->dax_dev, pgoff, 1, DAX_ACCESS, &kaddr, NULL); if (ret < 0) return dax_mem2blk_err(ret); memset(kaddr + offset, 0, size); if (iomap->flags & IOMAP_F_SHARED) ret = dax_iomap_copy_around(pos, size, PAGE_SIZE, srcmap, kaddr); else dax_flush(iomap->dax_dev, kaddr + offset, size); return ret; } static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero) { const struct iomap *iomap = &iter->iomap; const struct iomap *srcmap = iomap_iter_srcmap(iter); loff_t pos = iter->pos; u64 length = iomap_length(iter); s64 written = 0; /* already zeroed? we're done. */ if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) return length; /* * invalidate the pages whose sharing state is to be changed * because of CoW. */ if (iomap->flags & IOMAP_F_SHARED) invalidate_inode_pages2_range(iter->inode->i_mapping, pos >> PAGE_SHIFT, (pos + length - 1) >> PAGE_SHIFT); do { unsigned offset = offset_in_page(pos); unsigned size = min_t(u64, PAGE_SIZE - offset, length); pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); long rc; int id; id = dax_read_lock(); if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE) rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1); else rc = dax_memzero(iter, pos, size); dax_read_unlock(id); if (rc < 0) return rc; pos += size; length -= size; written += size; } while (length > 0); if (did_zero) *did_zero = true; return written; } int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops) { struct iomap_iter iter = { .inode = inode, .pos = pos, .len = len, .flags = IOMAP_DAX | IOMAP_ZERO, }; int ret; while ((ret = iomap_iter(&iter, ops)) > 0) iter.processed = dax_zero_iter(&iter, did_zero); return ret; } EXPORT_SYMBOL_GPL(dax_zero_range); int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops) { unsigned int blocksize = i_blocksize(inode); unsigned int off = pos & (blocksize - 1); /* Block boundary? Nothing to do */ if (!off) return 0; return dax_zero_range(inode, pos, blocksize - off, did_zero, ops); } EXPORT_SYMBOL_GPL(dax_truncate_page); static loff_t dax_iomap_iter(const struct iomap_iter *iomi, struct iov_iter *iter) { const struct iomap *iomap = &iomi->iomap; const struct iomap *srcmap = iomap_iter_srcmap(iomi); loff_t length = iomap_length(iomi); loff_t pos = iomi->pos; struct dax_device *dax_dev = iomap->dax_dev; loff_t end = pos + length, done = 0; bool write = iov_iter_rw(iter) == WRITE; bool cow = write && iomap->flags & IOMAP_F_SHARED; ssize_t ret = 0; size_t xfer; int id; if (!write) { end = min(end, i_size_read(iomi->inode)); if (pos >= end) return 0; if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) return iov_iter_zero(min(length, end - pos), iter); } /* * In DAX mode, enforce either pure overwrites of written extents, or * writes to unwritten extents as part of a copy-on-write operation. */ if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED && !(iomap->flags & IOMAP_F_SHARED))) return -EIO; /* * Write can allocate block for an area which has a hole page mapped * into page tables. We have to tear down these mappings so that data * written by write(2) is visible in mmap. */ if (iomap->flags & IOMAP_F_NEW || cow) { /* * Filesystem allows CoW on non-shared extents. The src extents * may have been mmapped with dirty mark before. To be able to * invalidate its dax entries, we need to clear the dirty mark * in advance. */ if (cow) __dax_clear_dirty_range(iomi->inode->i_mapping, pos >> PAGE_SHIFT, (end - 1) >> PAGE_SHIFT); invalidate_inode_pages2_range(iomi->inode->i_mapping, pos >> PAGE_SHIFT, (end - 1) >> PAGE_SHIFT); } id = dax_read_lock(); while (pos < end) { unsigned offset = pos & (PAGE_SIZE - 1); const size_t size = ALIGN(length + offset, PAGE_SIZE); pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); ssize_t map_len; bool recovery = false; void *kaddr; if (fatal_signal_pending(current)) { ret = -EINTR; break; } map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), DAX_ACCESS, &kaddr, NULL); if (map_len == -EHWPOISON && iov_iter_rw(iter) == WRITE) { map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), DAX_RECOVERY_WRITE, &kaddr, NULL); if (map_len > 0) recovery = true; } if (map_len < 0) { ret = dax_mem2blk_err(map_len); break; } if (cow) { ret = dax_iomap_copy_around(pos, length, PAGE_SIZE, srcmap, kaddr); if (ret) break; } map_len = PFN_PHYS(map_len); kaddr += offset; map_len -= offset; if (map_len > end - pos) map_len = end - pos; if (recovery) xfer = dax_recovery_write(dax_dev, pgoff, kaddr, map_len, iter); else if (write) xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr, map_len, iter); else xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr, map_len, iter); pos += xfer; length -= xfer; done += xfer; if (xfer == 0) ret = -EFAULT; if (xfer < map_len) break; } dax_read_unlock(id); return done ? done : ret; } /** * dax_iomap_rw - Perform I/O to a DAX file * @iocb: The control block for this I/O * @iter: The addresses to do I/O from or to * @ops: iomap ops passed from the file system * * This function performs read and write operations to directly mapped * persistent memory. The callers needs to take care of read/write exclusion * and evicting any page cache pages in the region under I/O. */ ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops) { struct iomap_iter iomi = { .inode = iocb->ki_filp->f_mapping->host, .pos = iocb->ki_pos, .len = iov_iter_count(iter), .flags = IOMAP_DAX, }; loff_t done = 0; int ret; if (!iomi.len) return 0; if (iov_iter_rw(iter) == WRITE) { lockdep_assert_held_write(&iomi.inode->i_rwsem); iomi.flags |= IOMAP_WRITE; } else { lockdep_assert_held(&iomi.inode->i_rwsem); } if (iocb->ki_flags & IOCB_NOWAIT) iomi.flags |= IOMAP_NOWAIT; while ((ret = iomap_iter(&iomi, ops)) > 0) iomi.processed = dax_iomap_iter(&iomi, iter); done = iomi.pos - iocb->ki_pos; iocb->ki_pos = iomi.pos; return done ? done : ret; } EXPORT_SYMBOL_GPL(dax_iomap_rw); static vm_fault_t dax_fault_return(int error) { if (error == 0) return VM_FAULT_NOPAGE; return vmf_error(error); } /* * When handling a synchronous page fault and the inode need a fsync, we can * insert the PTE/PMD into page tables only after that fsync happened. Skip * insertion for now and return the pfn so that caller can insert it after the * fsync is done. */ static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn) { if (WARN_ON_ONCE(!pfnp)) return VM_FAULT_SIGBUS; *pfnp = pfn; return VM_FAULT_NEEDDSYNC; } static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf, const struct iomap_iter *iter) { vm_fault_t ret; int error = 0; switch (iter->iomap.type) { case IOMAP_HOLE: case IOMAP_UNWRITTEN: clear_user_highpage(vmf->cow_page, vmf->address); break; case IOMAP_MAPPED: error = copy_cow_page_dax(vmf, iter); break; default: WARN_ON_ONCE(1); error = -EIO; break; } if (error) return dax_fault_return(error); __SetPageUptodate(vmf->cow_page); ret = finish_fault(vmf); if (!ret) return VM_FAULT_DONE_COW; return ret; } /** * dax_fault_iter - Common actor to handle pfn insertion in PTE/PMD fault. * @vmf: vm fault instance * @iter: iomap iter * @pfnp: pfn to be returned * @xas: the dax mapping tree of a file * @entry: an unlocked dax entry to be inserted * @pmd: distinguish whether it is a pmd fault */ static vm_fault_t dax_fault_iter(struct vm_fault *vmf, const struct iomap_iter *iter, pfn_t *pfnp, struct xa_state *xas, void **entry, bool pmd) { const struct iomap *iomap = &iter->iomap; const struct iomap *srcmap = iomap_iter_srcmap(iter); size_t size = pmd ? PMD_SIZE : PAGE_SIZE; loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT; bool write = iter->flags & IOMAP_WRITE; unsigned long entry_flags = pmd ? DAX_PMD : 0; int err = 0; pfn_t pfn; void *kaddr; if (!pmd && vmf->cow_page) return dax_fault_cow_page(vmf, iter); /* if we are reading UNWRITTEN and HOLE, return a hole. */ if (!write && (iomap->type == IOMAP_UNWRITTEN || iomap->type == IOMAP_HOLE)) { if (!pmd) return dax_load_hole(xas, vmf, iter, entry); return dax_pmd_load_hole(xas, vmf, iter, entry); } if (iomap->type != IOMAP_MAPPED && !(iomap->flags & IOMAP_F_SHARED)) { WARN_ON_ONCE(1); return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS; } err = dax_iomap_direct_access(iomap, pos, size, &kaddr, &pfn); if (err) return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err); *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, entry_flags); if (write && iomap->flags & IOMAP_F_SHARED) { err = dax_iomap_copy_around(pos, size, size, srcmap, kaddr); if (err) return dax_fault_return(err); } if (dax_fault_is_synchronous(iter, vmf->vma)) return dax_fault_synchronous_pfnp(pfnp, pfn); /* insert PMD pfn */ if (pmd) return vmf_insert_pfn_pmd(vmf, pfn, write); /* insert PTE pfn */ if (write) return vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); return vmf_insert_mixed(vmf->vma, vmf->address, pfn); } static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; XA_STATE(xas, &mapping->i_pages, vmf->pgoff); struct iomap_iter iter = { .inode = mapping->host, .pos = (loff_t)vmf->pgoff << PAGE_SHIFT, .len = PAGE_SIZE, .flags = IOMAP_DAX | IOMAP_FAULT, }; vm_fault_t ret = 0; void *entry; int error; trace_dax_pte_fault(iter.inode, vmf, ret); /* * Check whether offset isn't beyond end of file now. Caller is supposed * to hold locks serializing us with truncate / punch hole so this is * a reliable test. */ if (iter.pos >= i_size_read(iter.inode)) { ret = VM_FAULT_SIGBUS; goto out; } if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) iter.flags |= IOMAP_WRITE; entry = grab_mapping_entry(&xas, mapping, 0); if (xa_is_internal(entry)) { ret = xa_to_internal(entry); goto out; } /* * It is possible, particularly with mixed reads & writes to private * mappings, that we have raced with a PMD fault that overlaps with * the PTE we need to set up. If so just return and the fault will be * retried. */ if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) { ret = VM_FAULT_NOPAGE; goto unlock_entry; } while ((error = iomap_iter(&iter, ops)) > 0) { if (WARN_ON_ONCE(iomap_length(&iter) < PAGE_SIZE)) { iter.processed = -EIO; /* fs corruption? */ continue; } ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, false); if (ret != VM_FAULT_SIGBUS && (iter.iomap.flags & IOMAP_F_NEW)) { count_vm_event(PGMAJFAULT); count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); ret |= VM_FAULT_MAJOR; } if (!(ret & VM_FAULT_ERROR)) iter.processed = PAGE_SIZE; } if (iomap_errp) *iomap_errp = error; if (!ret && error) ret = dax_fault_return(error); unlock_entry: dax_unlock_entry(&xas, entry); out: trace_dax_pte_fault_done(iter.inode, vmf, ret); return ret; } #ifdef CONFIG_FS_DAX_PMD static bool dax_fault_check_fallback(struct vm_fault *vmf, struct xa_state *xas, pgoff_t max_pgoff) { unsigned long pmd_addr = vmf->address & PMD_MASK; bool write = vmf->flags & FAULT_FLAG_WRITE; /* * Make sure that the faulting address's PMD offset (color) matches * the PMD offset from the start of the file. This is necessary so * that a PMD range in the page table overlaps exactly with a PMD * range in the page cache. */ if ((vmf->pgoff & PG_PMD_COLOUR) != ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR)) return true; /* Fall back to PTEs if we're going to COW */ if (write && !(vmf->vma->vm_flags & VM_SHARED)) return true; /* If the PMD would extend outside the VMA */ if (pmd_addr < vmf->vma->vm_start) return true; if ((pmd_addr + PMD_SIZE) > vmf->vma->vm_end) return true; /* If the PMD would extend beyond the file size */ if ((xas->xa_index | PG_PMD_COLOUR) >= max_pgoff) return true; return false; } static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER); struct iomap_iter iter = { .inode = mapping->host, .len = PMD_SIZE, .flags = IOMAP_DAX | IOMAP_FAULT, }; vm_fault_t ret = VM_FAULT_FALLBACK; pgoff_t max_pgoff; void *entry; if (vmf->flags & FAULT_FLAG_WRITE) iter.flags |= IOMAP_WRITE; /* * Check whether offset isn't beyond end of file now. Caller is * supposed to hold locks serializing us with truncate / punch hole so * this is a reliable test. */ max_pgoff = DIV_ROUND_UP(i_size_read(iter.inode), PAGE_SIZE); trace_dax_pmd_fault(iter.inode, vmf, max_pgoff, 0); if (xas.xa_index >= max_pgoff) { ret = VM_FAULT_SIGBUS; goto out; } if (dax_fault_check_fallback(vmf, &xas, max_pgoff)) goto fallback; /* * grab_mapping_entry() will make sure we get an empty PMD entry, * a zero PMD entry or a DAX PMD. If it can't (because a PTE * entry is already in the array, for instance), it will return * VM_FAULT_FALLBACK. */ entry = grab_mapping_entry(&xas, mapping, PMD_ORDER); if (xa_is_internal(entry)) { ret = xa_to_internal(entry); goto fallback; } /* * It is possible, particularly with mixed reads & writes to private * mappings, that we have raced with a PTE fault that overlaps with * the PMD we need to set up. If so just return and the fault will be * retried. */ if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) && !pmd_devmap(*vmf->pmd)) { ret = 0; goto unlock_entry; } iter.pos = (loff_t)xas.xa_index << PAGE_SHIFT; while (iomap_iter(&iter, ops) > 0) { if (iomap_length(&iter) < PMD_SIZE) continue; /* actually breaks out of the loop */ ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true); if (ret != VM_FAULT_FALLBACK) iter.processed = PMD_SIZE; } unlock_entry: dax_unlock_entry(&xas, entry); fallback: if (ret == VM_FAULT_FALLBACK) { split_huge_pmd(vmf->vma, vmf->pmd, vmf->address); count_vm_event(THP_FAULT_FALLBACK); } out: trace_dax_pmd_fault_done(iter.inode, vmf, max_pgoff, ret); return ret; } #else static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { return VM_FAULT_FALLBACK; } #endif /* CONFIG_FS_DAX_PMD */ /** * dax_iomap_fault - handle a page fault on a DAX file * @vmf: The description of the fault * @order: Order of the page to fault in * @pfnp: PFN to insert for synchronous faults if fsync is required * @iomap_errp: Storage for detailed error code in case of error * @ops: Iomap ops passed from the file system * * When a page fault occurs, filesystems may call this helper in * their fault handler for DAX files. dax_iomap_fault() assumes the caller * has done all the necessary locking for page fault to proceed * successfully. */ vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order, pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops) { if (order == 0) return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops); else if (order == PMD_ORDER) return dax_iomap_pmd_fault(vmf, pfnp, ops); else return VM_FAULT_FALLBACK; } EXPORT_SYMBOL_GPL(dax_iomap_fault); /* * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables * @vmf: The description of the fault * @pfn: PFN to insert * @order: Order of entry to insert. * * This function inserts a writeable PTE or PMD entry into the page tables * for an mmaped DAX file. It also marks the page cache entry as dirty. */ static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order); void *entry; vm_fault_t ret; xas_lock_irq(&xas); entry = get_unlocked_entry(&xas, order); /* Did we race with someone splitting entry or so? */ if (!entry || dax_is_conflict(entry) || (order == 0 && !dax_is_pte_entry(entry))) { put_unlocked_entry(&xas, entry, WAKE_NEXT); xas_unlock_irq(&xas); trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf, VM_FAULT_NOPAGE); return VM_FAULT_NOPAGE; } xas_set_mark(&xas, PAGECACHE_TAG_DIRTY); dax_lock_entry(&xas, entry); xas_unlock_irq(&xas); if (order == 0) ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); #ifdef CONFIG_FS_DAX_PMD else if (order == PMD_ORDER) ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE); #endif else ret = VM_FAULT_FALLBACK; dax_unlock_entry(&xas, entry); trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret); return ret; } /** * dax_finish_sync_fault - finish synchronous page fault * @vmf: The description of the fault * @order: Order of entry to be inserted * @pfn: PFN to insert * * This function ensures that the file range touched by the page fault is * stored persistently on the media and handles inserting of appropriate page * table entry. */ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order, pfn_t pfn) { int err; loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT; size_t len = PAGE_SIZE << order; err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1); if (err) return VM_FAULT_SIGBUS; return dax_insert_pfn_mkwrite(vmf, pfn, order); } EXPORT_SYMBOL_GPL(dax_finish_sync_fault); static loff_t dax_range_compare_iter(struct iomap_iter *it_src, struct iomap_iter *it_dest, u64 len, bool *same) { const struct iomap *smap = &it_src->iomap; const struct iomap *dmap = &it_dest->iomap; loff_t pos1 = it_src->pos, pos2 = it_dest->pos; void *saddr, *daddr; int id, ret; len = min(len, min(smap->length, dmap->length)); if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) { *same = true; return len; } if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) { *same = false; return 0; } id = dax_read_lock(); ret = dax_iomap_direct_access(smap, pos1, ALIGN(pos1 + len, PAGE_SIZE), &saddr, NULL); if (ret < 0) goto out_unlock; ret = dax_iomap_direct_access(dmap, pos2, ALIGN(pos2 + len, PAGE_SIZE), &daddr, NULL); if (ret < 0) goto out_unlock; *same = !memcmp(saddr, daddr, len); if (!*same) len = 0; dax_read_unlock(id); return len; out_unlock: dax_read_unlock(id); return -EIO; } int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, struct inode *dst, loff_t dstoff, loff_t len, bool *same, const struct iomap_ops *ops) { struct iomap_iter src_iter = { .inode = src, .pos = srcoff, .len = len, .flags = IOMAP_DAX, }; struct iomap_iter dst_iter = { .inode = dst, .pos = dstoff, .len = len, .flags = IOMAP_DAX, }; int ret, compared = 0; while ((ret = iomap_iter(&src_iter, ops)) > 0 && (ret = iomap_iter(&dst_iter, ops)) > 0) { compared = dax_range_compare_iter(&src_iter, &dst_iter, min(src_iter.len, dst_iter.len), same); if (compared < 0) return ret; src_iter.processed = dst_iter.processed = compared; } return ret; } int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags, const struct iomap_ops *ops) { return __generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, len, remap_flags, ops); } EXPORT_SYMBOL_GPL(dax_remap_file_range_prep);
28 28 28 28 28 28 28 34 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 // SPDX-License-Identifier: GPL-2.0-only /* * Off-channel operation helpers * * Copyright 2003, Jouni Malinen <jkmaline@cc.hut.fi> * Copyright 2004, Instant802 Networks, Inc. * Copyright 2005, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2009 Johannes Berg <johannes@sipsolutions.net> * Copyright (C) 2019, 2022-2023 Intel Corporation */ #include <linux/export.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "driver-ops.h" /* * Tell our hardware to disable PS. * Optionally inform AP that we will go to sleep so that it will buffer * the frames while we are doing off-channel work. This is optional * because we *may* be doing work on-operating channel, and want our * hardware unconditionally awake, but still let the AP send us normal frames. */ static void ieee80211_offchannel_ps_enable(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; bool offchannel_ps_enabled = false; /* FIXME: what to do when local->pspolling is true? */ del_timer_sync(&local->dynamic_ps_timer); del_timer_sync(&ifmgd->bcn_mon_timer); del_timer_sync(&ifmgd->conn_mon_timer); cancel_work_sync(&local->dynamic_ps_enable_work); if (local->hw.conf.flags & IEEE80211_CONF_PS) { offchannel_ps_enabled = true; local->hw.conf.flags &= ~IEEE80211_CONF_PS; ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); } if (!offchannel_ps_enabled || !ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK)) /* * If power save was enabled, no need to send a nullfunc * frame because AP knows that we are sleeping. But if the * hardware is creating the nullfunc frame for power save * status (ie. IEEE80211_HW_PS_NULLFUNC_STACK is not * enabled) and power save was enabled, the firmware just * sent a null frame with power save disabled. So we need * to send a new nullfunc frame to inform the AP that we * are again sleeping. */ ieee80211_send_nullfunc(local, sdata, true); } /* inform AP that we are awake again */ static void ieee80211_offchannel_ps_disable(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; if (!local->ps_sdata) ieee80211_send_nullfunc(local, sdata, false); else if (local->hw.conf.dynamic_ps_timeout > 0) { /* * the dynamic_ps_timer had been running before leaving the * operating channel, restart the timer now and send a nullfunc * frame to inform the AP that we are awake so that AP sends * the buffered packets (if any). */ ieee80211_send_nullfunc(local, sdata, false); mod_timer(&local->dynamic_ps_timer, jiffies + msecs_to_jiffies(local->hw.conf.dynamic_ps_timeout)); } ieee80211_sta_reset_beacon_monitor(sdata); ieee80211_sta_reset_conn_monitor(sdata); } void ieee80211_offchannel_stop_vifs(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; if (WARN_ON(local->use_chanctx)) return; /* * notify the AP about us leaving the channel and stop all * STA interfaces. */ /* * Stop queues and transmit all frames queued by the driver * before sending nullfunc to enable powersave at the AP. */ ieee80211_stop_queues_by_reason(&local->hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_OFFCHANNEL, false); ieee80211_flush_queues(local, NULL, false); mutex_lock(&local->iflist_mtx); list_for_each_entry(sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(sdata)) continue; if (sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE || sdata->vif.type == NL80211_IFTYPE_NAN) continue; if (sdata->vif.type != NL80211_IFTYPE_MONITOR) set_bit(SDATA_STATE_OFFCHANNEL, &sdata->state); /* Check to see if we should disable beaconing. */ if (sdata->vif.bss_conf.enable_beacon) { set_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state); sdata->vif.bss_conf.enable_beacon = false; ieee80211_link_info_change_notify( sdata, &sdata->deflink, BSS_CHANGED_BEACON_ENABLED); } if (sdata->vif.type == NL80211_IFTYPE_STATION && sdata->u.mgd.associated) ieee80211_offchannel_ps_enable(sdata); } mutex_unlock(&local->iflist_mtx); } void ieee80211_offchannel_return(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; if (WARN_ON(local->use_chanctx)) return; mutex_lock(&local->iflist_mtx); list_for_each_entry(sdata, &local->interfaces, list) { if (sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE) continue; if (sdata->vif.type != NL80211_IFTYPE_MONITOR) clear_bit(SDATA_STATE_OFFCHANNEL, &sdata->state); if (!ieee80211_sdata_running(sdata)) continue; /* Tell AP we're back */ if (sdata->vif.type == NL80211_IFTYPE_STATION && sdata->u.mgd.associated) ieee80211_offchannel_ps_disable(sdata); if (test_and_clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state)) { sdata->vif.bss_conf.enable_beacon = true; ieee80211_link_info_change_notify( sdata, &sdata->deflink, BSS_CHANGED_BEACON_ENABLED); } } mutex_unlock(&local->iflist_mtx); ieee80211_wake_queues_by_reason(&local->hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_OFFCHANNEL, false); } static void ieee80211_roc_notify_destroy(struct ieee80211_roc_work *roc) { /* was never transmitted */ if (roc->frame) { cfg80211_mgmt_tx_status(&roc->sdata->wdev, roc->mgmt_tx_cookie, roc->frame->data, roc->frame->len, false, GFP_KERNEL); ieee80211_free_txskb(&roc->sdata->local->hw, roc->frame); } if (!roc->mgmt_tx_cookie) cfg80211_remain_on_channel_expired(&roc->sdata->wdev, roc->cookie, roc->chan, GFP_KERNEL); else cfg80211_tx_mgmt_expired(&roc->sdata->wdev, roc->mgmt_tx_cookie, roc->chan, GFP_KERNEL); list_del(&roc->list); kfree(roc); } static unsigned long ieee80211_end_finished_rocs(struct ieee80211_local *local, unsigned long now) { struct ieee80211_roc_work *roc, *tmp; long remaining_dur_min = LONG_MAX; lockdep_assert_held(&local->mtx); list_for_each_entry_safe(roc, tmp, &local->roc_list, list) { long remaining; if (!roc->started) break; remaining = roc->start_time + msecs_to_jiffies(roc->duration) - now; /* In case of HW ROC, it is possible that the HW finished the * ROC session before the actual requested time. In such a case * end the ROC session (disregarding the remaining time). */ if (roc->abort || roc->hw_begun || remaining <= 0) ieee80211_roc_notify_destroy(roc); else remaining_dur_min = min(remaining_dur_min, remaining); } return remaining_dur_min; } static bool ieee80211_recalc_sw_work(struct ieee80211_local *local, unsigned long now) { long dur = ieee80211_end_finished_rocs(local, now); if (dur == LONG_MAX) return false; mod_delayed_work(local->workqueue, &local->roc_work, dur); return true; } static void ieee80211_handle_roc_started(struct ieee80211_roc_work *roc, unsigned long start_time) { if (WARN_ON(roc->notified)) return; roc->start_time = start_time; roc->started = true; if (roc->mgmt_tx_cookie) { if (!WARN_ON(!roc->frame)) { ieee80211_tx_skb_tid_band(roc->sdata, roc->frame, 7, roc->chan->band); roc->frame = NULL; } } else { cfg80211_ready_on_channel(&roc->sdata->wdev, roc->cookie, roc->chan, roc->req_duration, GFP_KERNEL); } roc->notified = true; } static void ieee80211_hw_roc_start(struct work_struct *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, hw_roc_start); struct ieee80211_roc_work *roc; mutex_lock(&local->mtx); list_for_each_entry(roc, &local->roc_list, list) { if (!roc->started) break; roc->hw_begun = true; ieee80211_handle_roc_started(roc, local->hw_roc_start_time); } mutex_unlock(&local->mtx); } void ieee80211_ready_on_channel(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); local->hw_roc_start_time = jiffies; trace_api_ready_on_channel(local); ieee80211_queue_work(hw, &local->hw_roc_start); } EXPORT_SYMBOL_GPL(ieee80211_ready_on_channel); static void _ieee80211_start_next_roc(struct ieee80211_local *local) { struct ieee80211_roc_work *roc, *tmp; enum ieee80211_roc_type type; u32 min_dur, max_dur; lockdep_assert_held(&local->mtx); if (WARN_ON(list_empty(&local->roc_list))) return; roc = list_first_entry(&local->roc_list, struct ieee80211_roc_work, list); if (WARN_ON(roc->started)) return; min_dur = roc->duration; max_dur = roc->duration; type = roc->type; list_for_each_entry(tmp, &local->roc_list, list) { if (tmp == roc) continue; if (tmp->sdata != roc->sdata || tmp->chan != roc->chan) break; max_dur = max(tmp->duration, max_dur); min_dur = min(tmp->duration, min_dur); type = max(tmp->type, type); } if (local->ops->remain_on_channel) { int ret = drv_remain_on_channel(local, roc->sdata, roc->chan, max_dur, type); if (ret) { wiphy_warn(local->hw.wiphy, "failed to start next HW ROC (%d)\n", ret); /* * queue the work struct again to avoid recursion * when multiple failures occur */ list_for_each_entry(tmp, &local->roc_list, list) { if (tmp->sdata != roc->sdata || tmp->chan != roc->chan) break; tmp->started = true; tmp->abort = true; } ieee80211_queue_work(&local->hw, &local->hw_roc_done); return; } /* we'll notify about the start once the HW calls back */ list_for_each_entry(tmp, &local->roc_list, list) { if (tmp->sdata != roc->sdata || tmp->chan != roc->chan) break; tmp->started = true; } } else { /* If actually operating on the desired channel (with at least * 20 MHz channel width) don't stop all the operations but still * treat it as though the ROC operation started properly, so * other ROC operations won't interfere with this one. */ roc->on_channel = roc->chan == local->_oper_chandef.chan && local->_oper_chandef.width != NL80211_CHAN_WIDTH_5 && local->_oper_chandef.width != NL80211_CHAN_WIDTH_10; /* start this ROC */ ieee80211_recalc_idle(local); if (!roc->on_channel) { ieee80211_offchannel_stop_vifs(local); local->tmp_channel = roc->chan; ieee80211_hw_config(local, 0); } ieee80211_queue_delayed_work(&local->hw, &local->roc_work, msecs_to_jiffies(min_dur)); /* tell userspace or send frame(s) */ list_for_each_entry(tmp, &local->roc_list, list) { if (tmp->sdata != roc->sdata || tmp->chan != roc->chan) break; tmp->on_channel = roc->on_channel; ieee80211_handle_roc_started(tmp, jiffies); } } } void ieee80211_start_next_roc(struct ieee80211_local *local) { struct ieee80211_roc_work *roc; lockdep_assert_held(&local->mtx); if (list_empty(&local->roc_list)) { ieee80211_run_deferred_scan(local); return; } /* defer roc if driver is not started (i.e. during reconfig) */ if (local->in_reconfig) return; roc = list_first_entry(&local->roc_list, struct ieee80211_roc_work, list); if (WARN_ON_ONCE(roc->started)) return; if (local->ops->remain_on_channel) { _ieee80211_start_next_roc(local); } else { /* delay it a bit */ ieee80211_queue_delayed_work(&local->hw, &local->roc_work, round_jiffies_relative(HZ/2)); } } static void __ieee80211_roc_work(struct ieee80211_local *local) { struct ieee80211_roc_work *roc; bool on_channel; lockdep_assert_held(&local->mtx); if (WARN_ON(local->ops->remain_on_channel)) return; roc = list_first_entry_or_null(&local->roc_list, struct ieee80211_roc_work, list); if (!roc) return; if (!roc->started) { WARN_ON(local->use_chanctx); _ieee80211_start_next_roc(local); } else { on_channel = roc->on_channel; if (ieee80211_recalc_sw_work(local, jiffies)) return; /* careful - roc pointer became invalid during recalc */ if (!on_channel) { ieee80211_flush_queues(local, NULL, false); local->tmp_channel = NULL; ieee80211_hw_config(local, 0); ieee80211_offchannel_return(local); } ieee80211_recalc_idle(local); ieee80211_start_next_roc(local); } } static void ieee80211_roc_work(struct work_struct *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, roc_work.work); mutex_lock(&local->mtx); __ieee80211_roc_work(local); mutex_unlock(&local->mtx); } static void ieee80211_hw_roc_done(struct work_struct *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, hw_roc_done); mutex_lock(&local->mtx); ieee80211_end_finished_rocs(local, jiffies); /* if there's another roc, start it now */ ieee80211_start_next_roc(local); mutex_unlock(&local->mtx); } void ieee80211_remain_on_channel_expired(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); trace_api_remain_on_channel_expired(local); ieee80211_queue_work(hw, &local->hw_roc_done); } EXPORT_SYMBOL_GPL(ieee80211_remain_on_channel_expired); static bool ieee80211_coalesce_hw_started_roc(struct ieee80211_local *local, struct ieee80211_roc_work *new_roc, struct ieee80211_roc_work *cur_roc) { unsigned long now = jiffies; unsigned long remaining; if (WARN_ON(!cur_roc->started)) return false; /* if it was scheduled in the hardware, but not started yet, * we can only combine if the older one had a longer duration */ if (!cur_roc->hw_begun && new_roc->duration > cur_roc->duration) return false; remaining = cur_roc->start_time + msecs_to_jiffies(cur_roc->duration) - now; /* if it doesn't fit entirely, schedule a new one */ if (new_roc->duration > jiffies_to_msecs(remaining)) return false; /* add just after the current one so we combine their finish later */ list_add(&new_roc->list, &cur_roc->list); /* if the existing one has already begun then let this one also * begin, otherwise they'll both be marked properly by the work * struct that runs once the driver notifies us of the beginning */ if (cur_roc->hw_begun) { new_roc->hw_begun = true; ieee80211_handle_roc_started(new_roc, now); } return true; } static int ieee80211_start_roc_work(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel, unsigned int duration, u64 *cookie, struct sk_buff *txskb, enum ieee80211_roc_type type) { struct ieee80211_roc_work *roc, *tmp; bool queued = false, combine_started = true; int ret; lockdep_assert_held(&local->mtx); if (channel->freq_offset) /* this may work, but is untested */ return -EOPNOTSUPP; if (local->use_chanctx && !local->ops->remain_on_channel) return -EOPNOTSUPP; roc = kzalloc(sizeof(*roc), GFP_KERNEL); if (!roc) return -ENOMEM; /* * If the duration is zero, then the driver * wouldn't actually do anything. Set it to * 10 for now. * * TODO: cancel the off-channel operation * when we get the SKB's TX status and * the wait time was zero before. */ if (!duration) duration = 10; roc->chan = channel; roc->duration = duration; roc->req_duration = duration; roc->frame = txskb; roc->type = type; roc->sdata = sdata; /* * cookie is either the roc cookie (for normal roc) * or the SKB (for mgmt TX) */ if (!txskb) { roc->cookie = ieee80211_mgmt_tx_cookie(local); *cookie = roc->cookie; } else { roc->mgmt_tx_cookie = *cookie; } /* if there's no need to queue, handle it immediately */ if (list_empty(&local->roc_list) && !local->scanning && !ieee80211_is_radar_required(local)) { /* if not HW assist, just queue & schedule work */ if (!local->ops->remain_on_channel) { list_add_tail(&roc->list, &local->roc_list); ieee80211_queue_delayed_work(&local->hw, &local->roc_work, 0); } else { /* otherwise actually kick it off here * (for error handling) */ ret = drv_remain_on_channel(local, sdata, channel, duration, type); if (ret) { kfree(roc); return ret; } roc->started = true; list_add_tail(&roc->list, &local->roc_list); } return 0; } /* otherwise handle queueing */ list_for_each_entry(tmp, &local->roc_list, list) { if (tmp->chan != channel || tmp->sdata != sdata) continue; /* * Extend this ROC if possible: If it hasn't started, add * just after the new one to combine. */ if (!tmp->started) { list_add(&roc->list, &tmp->list); queued = true; break; } if (!combine_started) continue; if (!local->ops->remain_on_channel) { /* If there's no hardware remain-on-channel, and * doing so won't push us over the maximum r-o-c * we allow, then we can just add the new one to * the list and mark it as having started now. * If it would push over the limit, don't try to * combine with other started ones (that haven't * been running as long) but potentially sort it * with others that had the same fate. */ unsigned long now = jiffies; u32 elapsed = jiffies_to_msecs(now - tmp->start_time); struct wiphy *wiphy = local->hw.wiphy; u32 max_roc = wiphy->max_remain_on_channel_duration; if (elapsed + roc->duration > max_roc) { combine_started = false; continue; } list_add(&roc->list, &tmp->list); queued = true; roc->on_channel = tmp->on_channel; ieee80211_handle_roc_started(roc, now); ieee80211_recalc_sw_work(local, now); break; } queued = ieee80211_coalesce_hw_started_roc(local, roc, tmp); if (queued) break; /* if it wasn't queued, perhaps it can be combined with * another that also couldn't get combined previously, * but no need to check for already started ones, since * that can't work. */ combine_started = false; } if (!queued) list_add_tail(&roc->list, &local->roc_list); return 0; } int ieee80211_remain_on_channel(struct wiphy *wiphy, struct wireless_dev *wdev, struct ieee80211_channel *chan, unsigned int duration, u64 *cookie) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); struct ieee80211_local *local = sdata->local; int ret; mutex_lock(&local->mtx); ret = ieee80211_start_roc_work(local, sdata, chan, duration, cookie, NULL, IEEE80211_ROC_TYPE_NORMAL); mutex_unlock(&local->mtx); return ret; } static int ieee80211_cancel_roc(struct ieee80211_local *local, u64 cookie, bool mgmt_tx) { struct ieee80211_roc_work *roc, *tmp, *found = NULL; int ret; if (!cookie) return -ENOENT; flush_work(&local->hw_roc_start); mutex_lock(&local->mtx); list_for_each_entry_safe(roc, tmp, &local->roc_list, list) { if (!mgmt_tx && roc->cookie != cookie) continue; else if (mgmt_tx && roc->mgmt_tx_cookie != cookie) continue; found = roc; break; } if (!found) { mutex_unlock(&local->mtx); return -ENOENT; } if (!found->started) { ieee80211_roc_notify_destroy(found); goto out_unlock; } if (local->ops->remain_on_channel) { ret = drv_cancel_remain_on_channel(local, roc->sdata); if (WARN_ON_ONCE(ret)) { mutex_unlock(&local->mtx); return ret; } /* TODO: * if multiple items were combined here then we really shouldn't * cancel them all - we should wait for as much time as needed * for the longest remaining one, and only then cancel ... */ list_for_each_entry_safe(roc, tmp, &local->roc_list, list) { if (!roc->started) break; if (roc == found) found = NULL; ieee80211_roc_notify_destroy(roc); } /* that really must not happen - it was started */ WARN_ON(found); ieee80211_start_next_roc(local); } else { /* go through work struct to return to the operating channel */ found->abort = true; mod_delayed_work(local->workqueue, &local->roc_work, 0); } out_unlock: mutex_unlock(&local->mtx); return 0; } int ieee80211_cancel_remain_on_channel(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); struct ieee80211_local *local = sdata->local; return ieee80211_cancel_roc(local, cookie, false); } int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_mgmt_tx_params *params, u64 *cookie) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct sta_info *sta = NULL; const struct ieee80211_mgmt *mgmt = (void *)params->buf; bool need_offchan = false; bool mlo_sta = false; int link_id = -1; u32 flags; int ret; u8 *data; if (params->dont_wait_for_ack) flags = IEEE80211_TX_CTL_NO_ACK; else flags = IEEE80211_TX_INTFL_NL80211_FRAME_TX | IEEE80211_TX_CTL_REQ_TX_STATUS; if (params->no_cck) flags |= IEEE80211_TX_CTL_NO_CCK_RATE; switch (sdata->vif.type) { case NL80211_IFTYPE_ADHOC: if (!sdata->vif.cfg.ibss_joined) need_offchan = true; #ifdef CONFIG_MAC80211_MESH fallthrough; case NL80211_IFTYPE_MESH_POINT: if (ieee80211_vif_is_mesh(&sdata->vif) && !sdata->u.mesh.mesh_id_len) need_offchan = true; #endif fallthrough; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_P2P_GO: if (sdata->vif.type != NL80211_IFTYPE_ADHOC && !ieee80211_vif_is_mesh(&sdata->vif) && !sdata->bss->active) need_offchan = true; rcu_read_lock(); sta = sta_info_get_bss(sdata, mgmt->da); mlo_sta = sta && sta->sta.mlo; if (!ieee80211_is_action(mgmt->frame_control) || mgmt->u.action.category == WLAN_CATEGORY_PUBLIC || mgmt->u.action.category == WLAN_CATEGORY_SELF_PROTECTED || mgmt->u.action.category == WLAN_CATEGORY_SPECTRUM_MGMT) { rcu_read_unlock(); break; } if (!sta) { rcu_read_unlock(); return -ENOLINK; } if (params->link_id >= 0 && !(sta->sta.valid_links & BIT(params->link_id))) { rcu_read_unlock(); return -ENOLINK; } link_id = params->link_id; rcu_read_unlock(); break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: sdata_lock(sdata); if (!sdata->u.mgd.associated || (params->offchan && params->wait && local->ops->remain_on_channel && memcmp(sdata->vif.cfg.ap_addr, mgmt->bssid, ETH_ALEN))) need_offchan = true; sdata_unlock(sdata); break; case NL80211_IFTYPE_P2P_DEVICE: need_offchan = true; break; case NL80211_IFTYPE_NAN: default: return -EOPNOTSUPP; } /* configurations requiring offchan cannot work if no channel has been * specified */ if (need_offchan && !params->chan) return -EINVAL; mutex_lock(&local->mtx); /* Check if the operating channel is the requested channel */ if (!params->chan && mlo_sta) { need_offchan = false; } else if (!need_offchan) { struct ieee80211_chanctx_conf *chanctx_conf = NULL; int i; rcu_read_lock(); /* Check all the links first */ for (i = 0; i < ARRAY_SIZE(sdata->vif.link_conf); i++) { struct ieee80211_bss_conf *conf; conf = rcu_dereference(sdata->vif.link_conf[i]); if (!conf) continue; chanctx_conf = rcu_dereference(conf->chanctx_conf); if (!chanctx_conf) continue; if (mlo_sta && params->chan == chanctx_conf->def.chan && ether_addr_equal(sdata->vif.addr, mgmt->sa)) { link_id = i; break; } if (ether_addr_equal(conf->addr, mgmt->sa)) break; chanctx_conf = NULL; } if (chanctx_conf) { need_offchan = params->chan && (params->chan != chanctx_conf->def.chan); } else { need_offchan = true; } rcu_read_unlock(); } if (need_offchan && !params->offchan) { ret = -EBUSY; goto out_unlock; } skb = dev_alloc_skb(local->hw.extra_tx_headroom + params->len); if (!skb) { ret = -ENOMEM; goto out_unlock; } skb_reserve(skb, local->hw.extra_tx_headroom); data = skb_put_data(skb, params->buf, params->len); /* Update CSA counters */ if (sdata->vif.bss_conf.csa_active && (sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_MESH_POINT || sdata->vif.type == NL80211_IFTYPE_ADHOC) && params->n_csa_offsets) { int i; struct beacon_data *beacon = NULL; rcu_read_lock(); if (sdata->vif.type == NL80211_IFTYPE_AP) beacon = rcu_dereference(sdata->deflink.u.ap.beacon); else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) beacon = rcu_dereference(sdata->u.ibss.presp); else if (ieee80211_vif_is_mesh(&sdata->vif)) beacon = rcu_dereference(sdata->u.mesh.beacon); if (beacon) for (i = 0; i < params->n_csa_offsets; i++) data[params->csa_offsets[i]] = beacon->cntdwn_current_counter; rcu_read_unlock(); } IEEE80211_SKB_CB(skb)->flags = flags; skb->dev = sdata->dev; if (!params->dont_wait_for_ack) { /* make a copy to preserve the frame contents * in case of encryption. */ ret = ieee80211_attach_ack_skb(local, skb, cookie, GFP_KERNEL); if (ret) { kfree_skb(skb); goto out_unlock; } } else { /* Assign a dummy non-zero cookie, it's not sent to * userspace in this case but we rely on its value * internally in the need_offchan case to distinguish * mgmt-tx from remain-on-channel. */ *cookie = 0xffffffff; } if (!need_offchan) { ieee80211_tx_skb_tid(sdata, skb, 7, link_id); ret = 0; goto out_unlock; } IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_TX_OFFCHAN | IEEE80211_TX_INTFL_OFFCHAN_TX_OK; if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) IEEE80211_SKB_CB(skb)->hw_queue = local->hw.offchannel_tx_hw_queue; /* This will handle all kinds of coalescing and immediate TX */ ret = ieee80211_start_roc_work(local, sdata, params->chan, params->wait, cookie, skb, IEEE80211_ROC_TYPE_MGMT_TX); if (ret) ieee80211_free_txskb(&local->hw, skb); out_unlock: mutex_unlock(&local->mtx); return ret; } int ieee80211_mgmt_tx_cancel_wait(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie) { struct ieee80211_local *local = wiphy_priv(wiphy); return ieee80211_cancel_roc(local, cookie, true); } void ieee80211_roc_setup(struct ieee80211_local *local) { INIT_WORK(&local->hw_roc_start, ieee80211_hw_roc_start); INIT_WORK(&local->hw_roc_done, ieee80211_hw_roc_done); INIT_DELAYED_WORK(&local->roc_work, ieee80211_roc_work); INIT_LIST_HEAD(&local->roc_list); } void ieee80211_roc_purge(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { struct ieee80211_roc_work *roc, *tmp; bool work_to_do = false; mutex_lock(&local->mtx); list_for_each_entry_safe(roc, tmp, &local->roc_list, list) { if (sdata && roc->sdata != sdata) continue; if (roc->started) { if (local->ops->remain_on_channel) { /* can race, so ignore return value */ drv_cancel_remain_on_channel(local, roc->sdata); ieee80211_roc_notify_destroy(roc); } else { roc->abort = true; work_to_do = true; } } else { ieee80211_roc_notify_destroy(roc); } } if (work_to_do) __ieee80211_roc_work(local); mutex_unlock(&local->mtx); }
3142 3142 3142 577 3142 3142 3142 1295 27 27 3025 3025 3024 3024 3024 3024 3024 3024 3114 577 3024 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef INT_BLK_MQ_H #define INT_BLK_MQ_H #include <linux/blk-mq.h> #include "blk-stat.h" struct blk_mq_tag_set; struct blk_mq_ctxs { struct kobject kobj; struct blk_mq_ctx __percpu *queue_ctx; }; /** * struct blk_mq_ctx - State for a software queue facing the submitting CPUs */ struct blk_mq_ctx { struct { spinlock_t lock; struct list_head rq_lists[HCTX_MAX_TYPES]; } ____cacheline_aligned_in_smp; unsigned int cpu; unsigned short index_hw[HCTX_MAX_TYPES]; struct blk_mq_hw_ctx *hctxs[HCTX_MAX_TYPES]; struct request_queue *queue; struct blk_mq_ctxs *ctxs; struct kobject kobj; } ____cacheline_aligned_in_smp; enum { BLK_MQ_NO_TAG = -1U, BLK_MQ_TAG_MIN = 1, BLK_MQ_TAG_MAX = BLK_MQ_NO_TAG - 1, }; typedef unsigned int __bitwise blk_insert_t; #define BLK_MQ_INSERT_AT_HEAD ((__force blk_insert_t)0x01) void blk_mq_submit_bio(struct bio *bio); int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, unsigned int flags); void blk_mq_exit_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *, unsigned int); void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *start); void blk_mq_put_rq_ref(struct request *rq); /* * Internal helpers for allocating/freeing the request map */ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx); void blk_mq_free_rq_map(struct blk_mq_tags *tags); struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int depth); void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx); /* * CPU -> queue mappings */ extern int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int); /* * blk_mq_map_queue_type() - map (hctx_type,cpu) to hardware queue * @q: request queue * @type: the hctx type index * @cpu: CPU */ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *q, enum hctx_type type, unsigned int cpu) { return xa_load(&q->hctx_table, q->tag_set->map[type].mq_map[cpu]); } static inline enum hctx_type blk_mq_get_hctx_type(blk_opf_t opf) { enum hctx_type type = HCTX_TYPE_DEFAULT; /* * The caller ensure that if REQ_POLLED, poll must be enabled. */ if (opf & REQ_POLLED) type = HCTX_TYPE_POLL; else if ((opf & REQ_OP_MASK) == REQ_OP_READ) type = HCTX_TYPE_READ; return type; } /* * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue * @q: request queue * @opf: operation type (REQ_OP_*) and flags (e.g. REQ_POLLED). * @ctx: software queue cpu ctx */ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, blk_opf_t opf, struct blk_mq_ctx *ctx) { return ctx->hctxs[blk_mq_get_hctx_type(opf)]; } /* * sysfs helpers */ extern void blk_mq_sysfs_init(struct request_queue *q); extern void blk_mq_sysfs_deinit(struct request_queue *q); int blk_mq_sysfs_register(struct gendisk *disk); void blk_mq_sysfs_unregister(struct gendisk *disk); int blk_mq_sysfs_register_hctxs(struct request_queue *q); void blk_mq_sysfs_unregister_hctxs(struct request_queue *q); extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); void blk_mq_free_plug_rqs(struct blk_plug *plug); void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); void blk_mq_cancel_work_sync(struct request_queue *q); void blk_mq_release(struct request_queue *q); static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, unsigned int cpu) { return per_cpu_ptr(q->queue_ctx, cpu); } /* * This assumes per-cpu software queueing queues. They could be per-node * as well, for instance. For now this is hardcoded as-is. Note that we don't * care about preemption, since we know the ctx's are persistent. This does * mean that we can't rely on ctx always matching the currently running CPU. */ static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q) { return __blk_mq_get_ctx(q, raw_smp_processor_id()); } struct blk_mq_alloc_data { /* input parameter */ struct request_queue *q; blk_mq_req_flags_t flags; unsigned int shallow_depth; blk_opf_t cmd_flags; req_flags_t rq_flags; /* allocate multiple requests/tags in one go */ unsigned int nr_tags; struct request **cached_rq; /* input & output parameter */ struct blk_mq_ctx *ctx; struct blk_mq_hw_ctx *hctx; }; struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node, int alloc_policy); void blk_mq_free_tags(struct blk_mq_tags *tags); int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags, struct sbitmap_queue *breserved_tags, unsigned int queue_depth, unsigned int reserved, int node, int alloc_policy); unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags, unsigned int *offset); void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag); void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags); int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags **tags, unsigned int depth, bool can_grow); void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size); void blk_mq_tag_update_sched_shared_tags(struct request_queue *q); void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, void *priv); void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void *priv); static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt, struct blk_mq_hw_ctx *hctx) { if (!hctx) return &bt->ws[0]; return sbq_wait_ptr(bt, &hctx->wait_index); } void __blk_mq_tag_busy(struct blk_mq_hw_ctx *); void __blk_mq_tag_idle(struct blk_mq_hw_ctx *); static inline void blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_tag_busy(hctx); } static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_tag_idle(hctx); } static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags, unsigned int tag) { return tag < tags->nr_reserved_tags; } static inline bool blk_mq_is_shared_tags(unsigned int flags) { return flags & BLK_MQ_F_TAG_HCTX_SHARED; } static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data) { if (data->rq_flags & RQF_SCHED_TAGS) return data->hctx->sched_tags; return data->hctx->tags; } static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx) { return test_bit(BLK_MQ_S_STOPPED, &hctx->state); } static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) { return hctx->nr_ctx && hctx->tags; } unsigned int blk_mq_in_flight(struct request_queue *q, struct block_device *part); void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part, unsigned int inflight[2]); static inline void blk_mq_put_dispatch_budget(struct request_queue *q, int budget_token) { if (q->mq_ops->put_budget) q->mq_ops->put_budget(q, budget_token); } static inline int blk_mq_get_dispatch_budget(struct request_queue *q) { if (q->mq_ops->get_budget) return q->mq_ops->get_budget(q); return 0; } static inline void blk_mq_set_rq_budget_token(struct request *rq, int token) { if (token < 0) return; if (rq->q->mq_ops->set_rq_budget_token) rq->q->mq_ops->set_rq_budget_token(rq, token); } static inline int blk_mq_get_rq_budget_token(struct request *rq) { if (rq->q->mq_ops->get_rq_budget_token) return rq->q->mq_ops->get_rq_budget_token(rq); return -1; } static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx) { if (blk_mq_is_shared_tags(hctx->flags)) atomic_inc(&hctx->queue->nr_active_requests_shared_tags); else atomic_inc(&hctx->nr_active); } static inline void __blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx, int val) { if (blk_mq_is_shared_tags(hctx->flags)) atomic_sub(val, &hctx->queue->nr_active_requests_shared_tags); else atomic_sub(val, &hctx->nr_active); } static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx) { __blk_mq_sub_active_requests(hctx, 1); } static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx) { if (blk_mq_is_shared_tags(hctx->flags)) return atomic_read(&hctx->queue->nr_active_requests_shared_tags); return atomic_read(&hctx->nr_active); } static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq) { blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag); rq->tag = BLK_MQ_NO_TAG; if (rq->rq_flags & RQF_MQ_INFLIGHT) { rq->rq_flags &= ~RQF_MQ_INFLIGHT; __blk_mq_dec_active_requests(hctx); } } static inline void blk_mq_put_driver_tag(struct request *rq) { if (rq->tag == BLK_MQ_NO_TAG || rq->internal_tag == BLK_MQ_NO_TAG) return; __blk_mq_put_driver_tag(rq->mq_hctx, rq); } bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq); static inline bool blk_mq_get_driver_tag(struct request *rq) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; if (rq->tag != BLK_MQ_NO_TAG && !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { hctx->tags->rqs[rq->tag] = rq; return true; } return __blk_mq_get_driver_tag(hctx, rq); } static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) { int cpu; for_each_possible_cpu(cpu) qmap->mq_map[cpu] = 0; } /* * blk_mq_plug() - Get caller context plug * @bio : the bio being submitted by the caller context * * Plugging, by design, may delay the insertion of BIOs into the elevator in * order to increase BIO merging opportunities. This however can cause BIO * insertion order to change from the order in which submit_bio() is being * executed in the case of multiple contexts concurrently issuing BIOs to a * device, even if these context are synchronized to tightly control BIO issuing * order. While this is not a problem with regular block devices, this ordering * change can cause write BIO failures with zoned block devices as these * require sequential write patterns to zones. Prevent this from happening by * ignoring the plug state of a BIO issuing context if it is for a zoned block * device and the BIO to plug is a write operation. * * Return current->plug if the bio can be plugged and NULL otherwise */ static inline struct blk_plug *blk_mq_plug( struct bio *bio) { /* Zoned block device write operation case: do not plug the BIO */ if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && bdev_op_is_zoned_write(bio->bi_bdev, bio_op(bio))) return NULL; /* * For regular block devices or read operations, use the context plug * which may be NULL if blk_start_plug() was not executed. */ return current->plug; } /* Free all requests on the list */ static inline void blk_mq_free_requests(struct list_head *list) { while (!list_empty(list)) { struct request *rq = list_entry_rq(list->next); list_del_init(&rq->queuelist); blk_mq_free_request(rq); } } /* * For shared tag users, we track the number of currently active users * and attempt to provide a fair share of the tag depth for each of them. */ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt) { unsigned int depth, users; if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) return true; /* * Don't try dividing an ant */ if (bt->sb.depth == 1) return true; if (blk_mq_is_shared_tags(hctx->flags)) { struct request_queue *q = hctx->queue; if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) return true; } else { if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) return true; } users = READ_ONCE(hctx->tags->active_queues); if (!users) return true; /* * Allow at least some tags */ depth = max((bt->sb.depth + users - 1) / users, 4U); return __blk_mq_active_requests(hctx) < depth; } /* run the code block in @dispatch_ops with rcu/srcu read lock held */ #define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops) \ do { \ if ((q)->tag_set->flags & BLK_MQ_F_BLOCKING) { \ struct blk_mq_tag_set *__tag_set = (q)->tag_set; \ int srcu_idx; \ \ might_sleep_if(check_sleep); \ srcu_idx = srcu_read_lock(__tag_set->srcu); \ (dispatch_ops); \ srcu_read_unlock(__tag_set->srcu, srcu_idx); \ } else { \ rcu_read_lock(); \ (dispatch_ops); \ rcu_read_unlock(); \ } \ } while (0) #define blk_mq_run_dispatch_ops(q, dispatch_ops) \ __blk_mq_run_dispatch_ops(q, true, dispatch_ops) \ #endif
3727 3728 3317 3319 3729 607 607 607 606 604 606 3717 3718 2748 2213 632 3730 3728 3727 2765 3729 3728 14 3714 3320 3731 3319 428 2231 2926 3758 2527 2527 2527 2527 2527 2527 2527 2533 2527 2527 2527 2532 2527 2406 4 4 3 4 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/read_write.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/slab.h> #include <linux/stat.h> #include <linux/sched/xacct.h> #include <linux/fcntl.h> #include <linux/file.h> #include <linux/uio.h> #include <linux/fsnotify.h> #include <linux/security.h> #include <linux/export.h> #include <linux/syscalls.h> #include <linux/pagemap.h> #include <linux/splice.h> #include <linux/compat.h> #include <linux/mount.h> #include <linux/fs.h> #include "internal.h" #include <linux/uaccess.h> #include <asm/unistd.h> const struct file_operations generic_ro_fops = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .mmap = generic_file_readonly_mmap, .splice_read = filemap_splice_read, }; EXPORT_SYMBOL(generic_ro_fops); static inline bool unsigned_offsets(struct file *file) { return file->f_mode & FMODE_UNSIGNED_OFFSET; } /** * vfs_setpos - update the file offset for lseek * @file: file structure in question * @offset: file offset to seek to * @maxsize: maximum file size * * This is a low-level filesystem helper for updating the file offset to * the value specified by @offset if the given offset is valid and it is * not equal to the current file offset. * * Return the specified offset on success and -EINVAL on invalid offset. */ loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize) { if (offset < 0 && !unsigned_offsets(file)) return -EINVAL; if (offset > maxsize) return -EINVAL; if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; } return offset; } EXPORT_SYMBOL(vfs_setpos); /** * generic_file_llseek_size - generic llseek implementation for regular files * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @maxsize: max size of this file in file system * @eof: offset used for SEEK_END position * * This is a variant of generic_file_llseek that allows passing in a custom * maximum file size and a custom EOF position, for e.g. hashed directories * * Synchronization: * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms) * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes. * read/writes behave like SEEK_SET against seeks. */ loff_t generic_file_llseek_size(struct file *file, loff_t offset, int whence, loff_t maxsize, loff_t eof) { switch (whence) { case SEEK_END: offset += eof; break; case SEEK_CUR: /* * Here we special-case the lseek(fd, 0, SEEK_CUR) * position-querying operation. Avoid rewriting the "same" * f_pos value back to the file because a concurrent read(), * write() or lseek() might have altered it */ if (offset == 0) return file->f_pos; /* * f_lock protects against read/modify/write race with other * SEEK_CURs. Note that parallel writes and reads behave * like SEEK_SET. */ spin_lock(&file->f_lock); offset = vfs_setpos(file, file->f_pos + offset, maxsize); spin_unlock(&file->f_lock); return offset; case SEEK_DATA: /* * In the generic case the entire file is data, so as long as * offset isn't at the end of the file then the offset is data. */ if ((unsigned long long)offset >= eof) return -ENXIO; break; case SEEK_HOLE: /* * There is a virtual hole at the end of the file, so as long as * offset isn't i_size or larger, return i_size. */ if ((unsigned long long)offset >= eof) return -ENXIO; offset = eof; break; } return vfs_setpos(file, offset, maxsize); } EXPORT_SYMBOL(generic_file_llseek_size); /** * generic_file_llseek - generic llseek implementation for regular files * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * * This is a generic implemenation of ->llseek useable for all normal local * filesystems. It just updates the file offset to the value specified by * @offset and @whence. */ loff_t generic_file_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; return generic_file_llseek_size(file, offset, whence, inode->i_sb->s_maxbytes, i_size_read(inode)); } EXPORT_SYMBOL(generic_file_llseek); /** * fixed_size_llseek - llseek implementation for fixed-sized devices * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @size: size of the file * */ loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size) { switch (whence) { case SEEK_SET: case SEEK_CUR: case SEEK_END: return generic_file_llseek_size(file, offset, whence, size, size); default: return -EINVAL; } } EXPORT_SYMBOL(fixed_size_llseek); /** * no_seek_end_llseek - llseek implementation for fixed-sized devices * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * */ loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence) { switch (whence) { case SEEK_SET: case SEEK_CUR: return generic_file_llseek_size(file, offset, whence, OFFSET_MAX, 0); default: return -EINVAL; } } EXPORT_SYMBOL(no_seek_end_llseek); /** * no_seek_end_llseek_size - llseek implementation for fixed-sized devices * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @size: maximal offset allowed * */ loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size) { switch (whence) { case SEEK_SET: case SEEK_CUR: return generic_file_llseek_size(file, offset, whence, size, 0); default: return -EINVAL; } } EXPORT_SYMBOL(no_seek_end_llseek_size); /** * noop_llseek - No Operation Performed llseek implementation * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * * This is an implementation of ->llseek useable for the rare special case when * userspace expects the seek to succeed but the (device) file is actually not * able to perform the seek. In this case you use noop_llseek() instead of * falling back to the default implementation of ->llseek. */ loff_t noop_llseek(struct file *file, loff_t offset, int whence) { return file->f_pos; } EXPORT_SYMBOL(noop_llseek); loff_t default_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file_inode(file); loff_t retval; inode_lock(inode); switch (whence) { case SEEK_END: offset += i_size_read(inode); break; case SEEK_CUR: if (offset == 0) { retval = file->f_pos; goto out; } offset += file->f_pos; break; case SEEK_DATA: /* * In the generic case the entire file is data, so as * long as offset isn't at the end of the file then the * offset is data. */ if (offset >= inode->i_size) { retval = -ENXIO; goto out; } break; case SEEK_HOLE: /* * There is a virtual hole at the end of the file, so * as long as offset isn't i_size or larger, return * i_size. */ if (offset >= inode->i_size) { retval = -ENXIO; goto out; } offset = inode->i_size; break; } retval = -EINVAL; if (offset >= 0 || unsigned_offsets(file)) { if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; } retval = offset; } out: inode_unlock(inode); return retval; } EXPORT_SYMBOL(default_llseek); loff_t vfs_llseek(struct file *file, loff_t offset, int whence) { if (!(file->f_mode & FMODE_LSEEK)) return -ESPIPE; return file->f_op->llseek(file, offset, whence); } EXPORT_SYMBOL(vfs_llseek); static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence) { off_t retval; struct fd f = fdget_pos(fd); if (!f.file) return -EBADF; retval = -EINVAL; if (whence <= SEEK_MAX) { loff_t res = vfs_llseek(f.file, offset, whence); retval = res; if (res != (loff_t)retval) retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */ } fdput_pos(f); return retval; } SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) { return ksys_lseek(fd, offset, whence); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence) { return ksys_lseek(fd, offset, whence); } #endif #if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \ defined(__ARCH_WANT_SYS_LLSEEK) SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, unsigned long, offset_low, loff_t __user *, result, unsigned int, whence) { int retval; struct fd f = fdget_pos(fd); loff_t offset; if (!f.file) return -EBADF; retval = -EINVAL; if (whence > SEEK_MAX) goto out_putf; offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low, whence); retval = (int)offset; if (offset >= 0) { retval = -EFAULT; if (!copy_to_user(result, &offset, sizeof(offset))) retval = 0; } out_putf: fdput_pos(f); return retval; } #endif int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count) { if (unlikely((ssize_t) count < 0)) return -EINVAL; if (ppos) { loff_t pos = *ppos; if (unlikely(pos < 0)) { if (!unsigned_offsets(file)) return -EINVAL; if (count >= -pos) /* both values are in 0..LLONG_MAX */ return -EOVERFLOW; } else if (unlikely((loff_t) (pos + count) < 0)) { if (!unsigned_offsets(file)) return -EINVAL; } } return security_file_permission(file, read_write == READ ? MAY_READ : MAY_WRITE); } EXPORT_SYMBOL(rw_verify_area); static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) { struct kiocb kiocb; struct iov_iter iter; ssize_t ret; init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = (ppos ? *ppos : 0); iov_iter_ubuf(&iter, ITER_DEST, buf, len); ret = call_read_iter(filp, &kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); if (ppos) *ppos = kiocb.ki_pos; return ret; } static int warn_unsupported(struct file *file, const char *op) { pr_warn_ratelimited( "kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n", op, file, current->pid, current->comm); return -EINVAL; } ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) { struct kvec iov = { .iov_base = buf, .iov_len = min_t(size_t, count, MAX_RW_COUNT), }; struct kiocb kiocb; struct iov_iter iter; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ))) return -EINVAL; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; /* * Also fail if ->read_iter and ->read are both wired up as that * implies very convoluted semantics. */ if (unlikely(!file->f_op->read_iter || file->f_op->read)) return warn_unsupported(file, "read"); init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos ? *pos : 0; iov_iter_kvec(&iter, ITER_DEST, &iov, 1, iov.iov_len); ret = file->f_op->read_iter(&kiocb, &iter); if (ret > 0) { if (pos) *pos = kiocb.ki_pos; fsnotify_access(file); add_rchar(current, ret); } inc_syscr(current); return ret; } ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) { ssize_t ret; ret = rw_verify_area(READ, file, pos, count); if (ret) return ret; return __kernel_read(file, buf, count, pos); } EXPORT_SYMBOL(kernel_read); ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { ssize_t ret; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; if (unlikely(!access_ok(buf, count))) return -EFAULT; ret = rw_verify_area(READ, file, pos, count); if (ret) return ret; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; if (file->f_op->read) ret = file->f_op->read(file, buf, count, pos); else if (file->f_op->read_iter) ret = new_sync_read(file, buf, count, pos); else ret = -EINVAL; if (ret > 0) { fsnotify_access(file); add_rchar(current, ret); } inc_syscr(current); return ret; } static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) { struct kiocb kiocb; struct iov_iter iter; ssize_t ret; init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = (ppos ? *ppos : 0); iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)buf, len); ret = call_write_iter(filp, &kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); if (ret > 0 && ppos) *ppos = kiocb.ki_pos; return ret; } /* caller is responsible for file_start_write/file_end_write */ ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos) { struct kiocb kiocb; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE))) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; /* * Also fail if ->write_iter and ->write are both wired up as that * implies very convoluted semantics. */ if (unlikely(!file->f_op->write_iter || file->f_op->write)) return warn_unsupported(file, "write"); init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos ? *pos : 0; ret = file->f_op->write_iter(&kiocb, from); if (ret > 0) { if (pos) *pos = kiocb.ki_pos; fsnotify_modify(file); add_wchar(current, ret); } inc_syscw(current); return ret; } /* caller is responsible for file_start_write/file_end_write */ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) { struct kvec iov = { .iov_base = (void *)buf, .iov_len = min_t(size_t, count, MAX_RW_COUNT), }; struct iov_iter iter; iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, iov.iov_len); return __kernel_write_iter(file, &iter, pos); } /* * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()", * but autofs is one of the few internal kernel users that actually * wants this _and_ can be built as a module. So we need to export * this symbol for autofs, even though it really isn't appropriate * for any other kernel modules. */ EXPORT_SYMBOL_GPL(__kernel_write); ssize_t kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) { ssize_t ret; ret = rw_verify_area(WRITE, file, pos, count); if (ret) return ret; file_start_write(file); ret = __kernel_write(file, buf, count, pos); file_end_write(file); return ret; } EXPORT_SYMBOL(kernel_write); ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { ssize_t ret; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; if (unlikely(!access_ok(buf, count))) return -EFAULT; ret = rw_verify_area(WRITE, file, pos, count); if (ret) return ret; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; file_start_write(file); if (file->f_op->write) ret = file->f_op->write(file, buf, count, pos); else if (file->f_op->write_iter) ret = new_sync_write(file, buf, count, pos); else ret = -EINVAL; if (ret > 0) { fsnotify_modify(file); add_wchar(current, ret); } inc_syscw(current); file_end_write(file); return ret; } /* file_ppos returns &file->f_pos or NULL if file is stream */ static inline loff_t *file_ppos(struct file *file) { return file->f_mode & FMODE_STREAM ? NULL : &file->f_pos; } ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { loff_t pos, *ppos = file_ppos(f.file); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_read(f.file, buf, count, ppos); if (ret >= 0 && ppos) f.file->f_pos = pos; fdput_pos(f); } return ret; } SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) { return ksys_read(fd, buf, count); } ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { loff_t pos, *ppos = file_ppos(f.file); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_write(f.file, buf, count, ppos); if (ret >= 0 && ppos) f.file->f_pos = pos; fdput_pos(f); } return ret; } SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, size_t, count) { return ksys_write(fd, buf, count); } ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count, loff_t pos) { struct fd f; ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; f = fdget(fd); if (f.file) { ret = -ESPIPE; if (f.file->f_mode & FMODE_PREAD) ret = vfs_read(f.file, buf, count, &pos); fdput(f); } return ret; } SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf, size_t, count, loff_t, pos) { return ksys_pread64(fd, buf, count, pos); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PREAD64) COMPAT_SYSCALL_DEFINE5(pread64, unsigned int, fd, char __user *, buf, size_t, count, compat_arg_u64_dual(pos)) { return ksys_pread64(fd, buf, count, compat_arg_u64_glue(pos)); } #endif ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf, size_t count, loff_t pos) { struct fd f; ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; f = fdget(fd); if (f.file) { ret = -ESPIPE; if (f.file->f_mode & FMODE_PWRITE) ret = vfs_write(f.file, buf, count, &pos); fdput(f); } return ret; } SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf, size_t, count, loff_t, pos) { return ksys_pwrite64(fd, buf, count, pos); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PWRITE64) COMPAT_SYSCALL_DEFINE5(pwrite64, unsigned int, fd, const char __user *, buf, size_t, count, compat_arg_u64_dual(pos)) { return ksys_pwrite64(fd, buf, count, compat_arg_u64_glue(pos)); } #endif static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, loff_t *ppos, int type, rwf_t flags) { struct kiocb kiocb; ssize_t ret; init_sync_kiocb(&kiocb, filp); ret = kiocb_set_rw_flags(&kiocb, flags); if (ret) return ret; kiocb.ki_pos = (ppos ? *ppos : 0); if (type == READ) ret = call_read_iter(filp, &kiocb, iter); else ret = call_write_iter(filp, &kiocb, iter); BUG_ON(ret == -EIOCBQUEUED); if (ppos) *ppos = kiocb.ki_pos; return ret; } /* Do it by hand, with file-ops */ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, loff_t *ppos, int type, rwf_t flags) { ssize_t ret = 0; if (flags & ~RWF_HIPRI) return -EOPNOTSUPP; while (iov_iter_count(iter)) { ssize_t nr; if (type == READ) { nr = filp->f_op->read(filp, iter_iov_addr(iter), iter_iov_len(iter), ppos); } else { nr = filp->f_op->write(filp, iter_iov_addr(iter), iter_iov_len(iter), ppos); } if (nr < 0) { if (!ret) ret = nr; break; } ret += nr; if (nr != iter_iov_len(iter)) break; iov_iter_advance(iter, nr); } return ret; } static ssize_t do_iter_read(struct file *file, struct iov_iter *iter, loff_t *pos, rwf_t flags) { size_t tot_len; ssize_t ret = 0; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) goto out; ret = rw_verify_area(READ, file, pos, tot_len); if (ret < 0) return ret; if (file->f_op->read_iter) ret = do_iter_readv_writev(file, iter, pos, READ, flags); else ret = do_loop_readv_writev(file, iter, pos, READ, flags); out: if (ret >= 0) fsnotify_access(file); return ret; } ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, struct iov_iter *iter) { size_t tot_len; ssize_t ret = 0; if (!file->f_op->read_iter) return -EINVAL; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) goto out; ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len); if (ret < 0) return ret; ret = call_read_iter(file, iocb, iter); out: if (ret >= 0) fsnotify_access(file); return ret; } EXPORT_SYMBOL(vfs_iocb_iter_read); ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags) { if (!file->f_op->read_iter) return -EINVAL; return do_iter_read(file, iter, ppos, flags); } EXPORT_SYMBOL(vfs_iter_read); static ssize_t do_iter_write(struct file *file, struct iov_iter *iter, loff_t *pos, rwf_t flags) { size_t tot_len; ssize_t ret = 0; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) return 0; ret = rw_verify_area(WRITE, file, pos, tot_len); if (ret < 0) return ret; if (file->f_op->write_iter) ret = do_iter_readv_writev(file, iter, pos, WRITE, flags); else ret = do_loop_readv_writev(file, iter, pos, WRITE, flags); if (ret > 0) fsnotify_modify(file); return ret; } ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, struct iov_iter *iter) { size_t tot_len; ssize_t ret = 0; if (!file->f_op->write_iter) return -EINVAL; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) return 0; ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len); if (ret < 0) return ret; ret = call_write_iter(file, iocb, iter); if (ret > 0) fsnotify_modify(file); return ret; } EXPORT_SYMBOL(vfs_iocb_iter_write); ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags) { if (!file->f_op->write_iter) return -EINVAL; return do_iter_write(file, iter, ppos, flags); } EXPORT_SYMBOL(vfs_iter_write); static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; ssize_t ret; ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret >= 0) { ret = do_iter_read(file, &iter, pos, flags); kfree(iov); } return ret; } static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; ssize_t ret; ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret >= 0) { file_start_write(file); ret = do_iter_write(file, &iter, pos, flags); file_end_write(file); kfree(iov); } return ret; } static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, rwf_t flags) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { loff_t pos, *ppos = file_ppos(f.file); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_readv(f.file, vec, vlen, ppos, flags); if (ret >= 0 && ppos) f.file->f_pos = pos; fdput_pos(f); } if (ret > 0) add_rchar(current, ret); inc_syscr(current); return ret; } static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, rwf_t flags) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { loff_t pos, *ppos = file_ppos(f.file); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_writev(f.file, vec, vlen, ppos, flags); if (ret >= 0 && ppos) f.file->f_pos = pos; fdput_pos(f); } if (ret > 0) add_wchar(current, ret); inc_syscw(current); return ret; } static inline loff_t pos_from_hilo(unsigned long high, unsigned long low) { #define HALF_LONG_BITS (BITS_PER_LONG / 2) return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low; } static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, loff_t pos, rwf_t flags) { struct fd f; ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; f = fdget(fd); if (f.file) { ret = -ESPIPE; if (f.file->f_mode & FMODE_PREAD) ret = vfs_readv(f.file, vec, vlen, &pos, flags); fdput(f); } if (ret > 0) add_rchar(current, ret); inc_syscr(current); return ret; } static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, loff_t pos, rwf_t flags) { struct fd f; ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; f = fdget(fd); if (f.file) { ret = -ESPIPE; if (f.file->f_mode & FMODE_PWRITE) ret = vfs_writev(f.file, vec, vlen, &pos, flags); fdput(f); } if (ret > 0) add_wchar(current, ret); inc_syscw(current); return ret; } SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen) { return do_readv(fd, vec, vlen, 0); } SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen) { return do_writev(fd, vec, vlen, 0); } SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) { loff_t pos = pos_from_hilo(pos_h, pos_l); return do_preadv(fd, vec, vlen, pos, 0); } SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, rwf_t, flags) { loff_t pos = pos_from_hilo(pos_h, pos_l); if (pos == -1) return do_readv(fd, vec, vlen, flags); return do_preadv(fd, vec, vlen, pos, flags); } SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) { loff_t pos = pos_from_hilo(pos_h, pos_l); return do_pwritev(fd, vec, vlen, pos, 0); } SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, rwf_t, flags) { loff_t pos = pos_from_hilo(pos_h, pos_l); if (pos == -1) return do_writev(fd, vec, vlen, flags); return do_pwritev(fd, vec, vlen, pos, flags); } /* * Various compat syscalls. Note that they all pretend to take a native * iovec - import_iovec will properly treat those as compat_iovecs based on * in_compat_syscall(). */ #ifdef CONFIG_COMPAT #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64 COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos) { return do_preadv(fd, vec, vlen, pos, 0); } #endif COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, const struct iovec __user *, vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; return do_preadv(fd, vec, vlen, pos, 0); } #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2 COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { if (pos == -1) return do_readv(fd, vec, vlen, flags); return do_preadv(fd, vec, vlen, pos, flags); } #endif COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd, const struct iovec __user *, vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; if (pos == -1) return do_readv(fd, vec, vlen, flags); return do_preadv(fd, vec, vlen, pos, flags); } #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64 COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos) { return do_pwritev(fd, vec, vlen, pos, 0); } #endif COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, const struct iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; return do_pwritev(fd, vec, vlen, pos, 0); } #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2 COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { if (pos == -1) return do_writev(fd, vec, vlen, flags); return do_pwritev(fd, vec, vlen, pos, flags); } #endif COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd, const struct iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; if (pos == -1) return do_writev(fd, vec, vlen, flags); return do_pwritev(fd, vec, vlen, pos, flags); } #endif /* CONFIG_COMPAT */ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, loff_t max) { struct fd in, out; struct inode *in_inode, *out_inode; struct pipe_inode_info *opipe; loff_t pos; loff_t out_pos; ssize_t retval; int fl; /* * Get input file, and verify that it is ok.. */ retval = -EBADF; in = fdget(in_fd); if (!in.file) goto out; if (!(in.file->f_mode & FMODE_READ)) goto fput_in; retval = -ESPIPE; if (!ppos) { pos = in.file->f_pos; } else { pos = *ppos; if (!(in.file->f_mode & FMODE_PREAD)) goto fput_in; } retval = rw_verify_area(READ, in.file, &pos, count); if (retval < 0) goto fput_in; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; /* * Get output file, and verify that it is ok.. */ retval = -EBADF; out = fdget(out_fd); if (!out.file) goto fput_in; if (!(out.file->f_mode & FMODE_WRITE)) goto fput_out; in_inode = file_inode(in.file); out_inode = file_inode(out.file); out_pos = out.file->f_pos; if (!max) max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); if (unlikely(pos + count > max)) { retval = -EOVERFLOW; if (pos >= max) goto fput_out; count = max - pos; } fl = 0; #if 0 /* * We need to debate whether we can enable this or not. The * man page documents EAGAIN return for the output at least, * and the application is arguably buggy if it doesn't expect * EAGAIN on a non-blocking file descriptor. */ if (in.file->f_flags & O_NONBLOCK) fl = SPLICE_F_NONBLOCK; #endif opipe = get_pipe_info(out.file, true); if (!opipe) { retval = rw_verify_area(WRITE, out.file, &out_pos, count); if (retval < 0) goto fput_out; file_start_write(out.file); retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl); file_end_write(out.file); } else { if (out.file->f_flags & O_NONBLOCK) fl |= SPLICE_F_NONBLOCK; retval = splice_file_to_pipe(in.file, opipe, &pos, count, fl); } if (retval > 0) { add_rchar(current, retval); add_wchar(current, retval); fsnotify_access(in.file); fsnotify_modify(out.file); out.file->f_pos = out_pos; if (ppos) *ppos = pos; else in.file->f_pos = pos; } inc_syscr(current); inc_syscw(current); if (pos > max) retval = -EOVERFLOW; fput_out: fdput(out); fput_in: fdput(in); out: return retval; } SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count) { loff_t pos; off_t off; ssize_t ret; if (offset) { if (unlikely(get_user(off, offset))) return -EFAULT; pos = off; ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count) { loff_t pos; ssize_t ret; if (offset) { if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) return -EFAULT; ret = do_sendfile(out_fd, in_fd, &pos, count, 0); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, compat_off_t __user *, offset, compat_size_t, count) { loff_t pos; off_t off; ssize_t ret; if (offset) { if (unlikely(get_user(off, offset))) return -EFAULT; pos = off; ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, compat_loff_t __user *, offset, compat_size_t, count) { loff_t pos; ssize_t ret; if (offset) { if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) return -EFAULT; ret = do_sendfile(out_fd, in_fd, &pos, count, 0); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } #endif /** * generic_copy_file_range - copy data between two files * @file_in: file structure to read from * @pos_in: file offset to read from * @file_out: file structure to write data to * @pos_out: file offset to write data to * @len: amount of data to copy * @flags: copy flags * * This is a generic filesystem helper to copy data from one file to another. * It has no constraints on the source or destination file owners - the files * can belong to different superblocks and different filesystem types. Short * copies are allowed. * * This should be called from the @file_out filesystem, as per the * ->copy_file_range() method. * * Returns the number of bytes copied or a negative error indicating the * failure. */ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t len, unsigned int flags) { lockdep_assert(sb_write_started(file_inode(file_out)->i_sb)); return do_splice_direct(file_in, &pos_in, file_out, &pos_out, len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0); } EXPORT_SYMBOL(generic_copy_file_range); /* * Performs necessary checks before doing a file copy * * Can adjust amount of bytes to copy via @req_count argument. * Returns appropriate error code that caller should return or * zero in case the copy should be allowed. */ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t *req_count, unsigned int flags) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); uint64_t count = *req_count; loff_t size_in; int ret; ret = generic_file_rw_checks(file_in, file_out); if (ret) return ret; /* * We allow some filesystems to handle cross sb copy, but passing * a file of the wrong filesystem type to filesystem driver can result * in an attempt to dereference the wrong type of ->private_data, so * avoid doing that until we really have a good reason. * * nfs and cifs define several different file_system_type structures * and several different sets of file_operations, but they all end up * using the same ->copy_file_range() function pointer. */ if (flags & COPY_FILE_SPLICE) { /* cross sb splice is allowed */ } else if (file_out->f_op->copy_file_range) { if (file_in->f_op->copy_file_range != file_out->f_op->copy_file_range) return -EXDEV; } else if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) { return -EXDEV; } /* Don't touch certain kinds of inodes */ if (IS_IMMUTABLE(inode_out)) return -EPERM; if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) return -ETXTBSY; /* Ensure offsets don't wrap. */ if (pos_in + count < pos_in || pos_out + count < pos_out) return -EOVERFLOW; /* Shorten the copy to EOF */ size_in = i_size_read(inode_in); if (pos_in >= size_in) count = 0; else count = min(count, size_in - (uint64_t)pos_in); ret = generic_write_check_limits(file_out, pos_out, &count); if (ret) return ret; /* Don't allow overlapped copying within the same file. */ if (inode_in == inode_out && pos_out + count > pos_in && pos_out < pos_in + count) return -EINVAL; *req_count = count; return 0; } /* * copy_file_range() differs from regular file read and write in that it * specifically allows return partial success. When it does so is up to * the copy_file_range method. */ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t len, unsigned int flags) { ssize_t ret; bool splice = flags & COPY_FILE_SPLICE; if (flags & ~COPY_FILE_SPLICE) return -EINVAL; ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len, flags); if (unlikely(ret)) return ret; ret = rw_verify_area(READ, file_in, &pos_in, len); if (unlikely(ret)) return ret; ret = rw_verify_area(WRITE, file_out, &pos_out, len); if (unlikely(ret)) return ret; if (len == 0) return 0; file_start_write(file_out); /* * Cloning is supported by more file systems, so we implement copy on * same sb using clone, but for filesystems where both clone and copy * are supported (e.g. nfs,cifs), we only call the copy method. */ if (!splice && file_out->f_op->copy_file_range) { ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out, pos_out, len, flags); goto done; } if (!splice && file_in->f_op->remap_file_range && file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) { ret = file_in->f_op->remap_file_range(file_in, pos_in, file_out, pos_out, min_t(loff_t, MAX_RW_COUNT, len), REMAP_FILE_CAN_SHORTEN); if (ret > 0) goto done; } /* * We can get here for same sb copy of filesystems that do not implement * ->copy_file_range() in case filesystem does not support clone or in * case filesystem supports clone but rejected the clone request (e.g. * because it was not block aligned). * * In both cases, fall back to kernel copy so we are able to maintain a * consistent story about which filesystems support copy_file_range() * and which filesystems do not, that will allow userspace tools to * make consistent desicions w.r.t using copy_file_range(). * * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE. */ ret = generic_copy_file_range(file_in, pos_in, file_out, pos_out, len, flags); done: if (ret > 0) { fsnotify_access(file_in); add_rchar(current, ret); fsnotify_modify(file_out); add_wchar(current, ret); } inc_syscr(current); inc_syscw(current); file_end_write(file_out); return ret; } EXPORT_SYMBOL(vfs_copy_file_range); SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, size_t, len, unsigned int, flags) { loff_t pos_in; loff_t pos_out; struct fd f_in; struct fd f_out; ssize_t ret = -EBADF; f_in = fdget(fd_in); if (!f_in.file) goto out2; f_out = fdget(fd_out); if (!f_out.file) goto out1; ret = -EFAULT; if (off_in) { if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) goto out; } else { pos_in = f_in.file->f_pos; } if (off_out) { if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) goto out; } else { pos_out = f_out.file->f_pos; } ret = -EINVAL; if (flags != 0) goto out; ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len, flags); if (ret > 0) { pos_in += ret; pos_out += ret; if (off_in) { if (copy_to_user(off_in, &pos_in, sizeof(loff_t))) ret = -EFAULT; } else { f_in.file->f_pos = pos_in; } if (off_out) { if (copy_to_user(off_out, &pos_out, sizeof(loff_t))) ret = -EFAULT; } else { f_out.file->f_pos = pos_out; } } out: fdput(f_out); out1: fdput(f_in); out2: return ret; } /* * Don't operate on ranges the page cache doesn't support, and don't exceed the * LFS limits. If pos is under the limit it becomes a short access. If it * exceeds the limit we return -EFBIG. */ int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count) { struct inode *inode = file->f_mapping->host; loff_t max_size = inode->i_sb->s_maxbytes; loff_t limit = rlimit(RLIMIT_FSIZE); if (limit != RLIM_INFINITY) { if (pos >= limit) { send_sig(SIGXFSZ, current, 0); return -EFBIG; } *count = min(*count, limit - pos); } if (!(file->f_flags & O_LARGEFILE)) max_size = MAX_NON_LFS; if (unlikely(pos >= max_size)) return -EFBIG; *count = min(*count, max_size - pos); return 0; } /* Like generic_write_checks(), but takes size of write instead of iter. */ int generic_write_checks_count(struct kiocb *iocb, loff_t *count) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; if (IS_SWAPFILE(inode)) return -ETXTBSY; if (!*count) return 0; if (iocb->ki_flags & IOCB_APPEND) iocb->ki_pos = i_size_read(inode); if ((iocb->ki_flags & IOCB_NOWAIT) && !((iocb->ki_flags & IOCB_DIRECT) || (file->f_mode & FMODE_BUF_WASYNC))) return -EINVAL; return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count); } EXPORT_SYMBOL(generic_write_checks_count); /* * Performs necessary checks before doing a write * * Can adjust writing position or amount of bytes to write. * Returns appropriate error code that caller should return or * zero in case that write should be allowed. */ ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) { loff_t count = iov_iter_count(from); int ret; ret = generic_write_checks_count(iocb, &count); if (ret) return ret; iov_iter_truncate(from, count); return iov_iter_count(from); } EXPORT_SYMBOL(generic_write_checks); /* * Performs common checks before doing a file copy/clone * from @file_in to @file_out. */ int generic_file_rw_checks(struct file *file_in, struct file *file_out) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); /* Don't copy dirs, pipes, sockets... */ if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) return -EISDIR; if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) return -EINVAL; if (!(file_in->f_mode & FMODE_READ) || !(file_out->f_mode & FMODE_WRITE) || (file_out->f_flags & O_APPEND)) return -EBADF; return 0; }
1344 1344 1344 1130 1130 1130 1130 1130 1130 1130 1344 1344 55 1130 743 743 502 476 728 54 743 743 743 743 743 1241 354 1130 1130 1130 1130 1130 1130 1130 1130 1130 1130 1290 1290 1290 1290 815 1290 1290 1290 1290 961 893 952 956 1241 1241 1241 1241 1062 1062 743 1494 1495 1495 155 155 155 1155 1155 1155 1129 1135 1135 1129 1135 1135 1135 1135 1135 1135 1135 805 806 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 // SPDX-License-Identifier: GPL-2.0-only /* * mm/page-writeback.c * * Copyright (C) 2002, Linus Torvalds. * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * Contains functions related to writing back dirty pages at the * address_space level. * * 10Apr2002 Andrew Morton * Initial version */ #include <linux/kernel.h> #include <linux/math64.h> #include <linux/export.h> #include <linux/spinlock.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/init.h> #include <linux/backing-dev.h> #include <linux/task_io_accounting_ops.h> #include <linux/blkdev.h> #include <linux/mpage.h> #include <linux/rmap.h> #include <linux/percpu.h> #include <linux/smp.h> #include <linux/sysctl.h> #include <linux/cpu.h> #include <linux/syscalls.h> #include <linux/pagevec.h> #include <linux/timer.h> #include <linux/sched/rt.h> #include <linux/sched/signal.h> #include <linux/mm_inline.h> #include <trace/events/writeback.h> #include "internal.h" /* * Sleep at most 200ms at a time in balance_dirty_pages(). */ #define MAX_PAUSE max(HZ/5, 1) /* * Try to keep balance_dirty_pages() call intervals higher than this many pages * by raising pause time to max_pause when falls below it. */ #define DIRTY_POLL_THRESH (128 >> (PAGE_SHIFT - 10)) /* * Estimate write bandwidth at 200ms intervals. */ #define BANDWIDTH_INTERVAL max(HZ/5, 1) #define RATELIMIT_CALC_SHIFT 10 /* * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited * will look to see if it needs to force writeback or throttling. */ static long ratelimit_pages = 32; /* The following parameters are exported via /proc/sys/vm */ /* * Start background writeback (via writeback threads) at this percentage */ static int dirty_background_ratio = 10; /* * dirty_background_bytes starts at 0 (disabled) so that it is a function of * dirty_background_ratio * the amount of dirtyable memory */ static unsigned long dirty_background_bytes; /* * free highmem will not be subtracted from the total free memory * for calculating free ratios if vm_highmem_is_dirtyable is true */ static int vm_highmem_is_dirtyable; /* * The generator of dirty data starts writeback at this percentage */ static int vm_dirty_ratio = 20; /* * vm_dirty_bytes starts at 0 (disabled) so that it is a function of * vm_dirty_ratio * the amount of dirtyable memory */ static unsigned long vm_dirty_bytes; /* * The interval between `kupdate'-style writebacks */ unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */ EXPORT_SYMBOL_GPL(dirty_writeback_interval); /* * The longest time for which data is allowed to remain dirty */ unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */ /* * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies: * a full sync is triggered after this time elapses without any disk activity. */ int laptop_mode; EXPORT_SYMBOL(laptop_mode); /* End of sysctl-exported parameters */ struct wb_domain global_wb_domain; /* consolidated parameters for balance_dirty_pages() and its subroutines */ struct dirty_throttle_control { #ifdef CONFIG_CGROUP_WRITEBACK struct wb_domain *dom; struct dirty_throttle_control *gdtc; /* only set in memcg dtc's */ #endif struct bdi_writeback *wb; struct fprop_local_percpu *wb_completions; unsigned long avail; /* dirtyable */ unsigned long dirty; /* file_dirty + write + nfs */ unsigned long thresh; /* dirty threshold */ unsigned long bg_thresh; /* dirty background threshold */ unsigned long wb_dirty; /* per-wb counterparts */ unsigned long wb_thresh; unsigned long wb_bg_thresh; unsigned long pos_ratio; }; /* * Length of period for aging writeout fractions of bdis. This is an * arbitrarily chosen number. The longer the period, the slower fractions will * reflect changes in current writeout rate. */ #define VM_COMPLETIONS_PERIOD_LEN (3*HZ) #ifdef CONFIG_CGROUP_WRITEBACK #define GDTC_INIT(__wb) .wb = (__wb), \ .dom = &global_wb_domain, \ .wb_completions = &(__wb)->completions #define GDTC_INIT_NO_WB .dom = &global_wb_domain #define MDTC_INIT(__wb, __gdtc) .wb = (__wb), \ .dom = mem_cgroup_wb_domain(__wb), \ .wb_completions = &(__wb)->memcg_completions, \ .gdtc = __gdtc static bool mdtc_valid(struct dirty_throttle_control *dtc) { return dtc->dom; } static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc) { return dtc->dom; } static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc) { return mdtc->gdtc; } static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb) { return &wb->memcg_completions; } static void wb_min_max_ratio(struct bdi_writeback *wb, unsigned long *minp, unsigned long *maxp) { unsigned long this_bw = READ_ONCE(wb->avg_write_bandwidth); unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth); unsigned long long min = wb->bdi->min_ratio; unsigned long long max = wb->bdi->max_ratio; /* * @wb may already be clean by the time control reaches here and * the total may not include its bw. */ if (this_bw < tot_bw) { if (min) { min *= this_bw; min = div64_ul(min, tot_bw); } if (max < 100 * BDI_RATIO_SCALE) { max *= this_bw; max = div64_ul(max, tot_bw); } } *minp = min; *maxp = max; } #else /* CONFIG_CGROUP_WRITEBACK */ #define GDTC_INIT(__wb) .wb = (__wb), \ .wb_completions = &(__wb)->completions #define GDTC_INIT_NO_WB #define MDTC_INIT(__wb, __gdtc) static bool mdtc_valid(struct dirty_throttle_control *dtc) { return false; } static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc) { return &global_wb_domain; } static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc) { return NULL; } static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb) { return NULL; } static void wb_min_max_ratio(struct bdi_writeback *wb, unsigned long *minp, unsigned long *maxp) { *minp = wb->bdi->min_ratio; *maxp = wb->bdi->max_ratio; } #endif /* CONFIG_CGROUP_WRITEBACK */ /* * In a memory zone, there is a certain amount of pages we consider * available for the page cache, which is essentially the number of * free and reclaimable pages, minus some zone reserves to protect * lowmem and the ability to uphold the zone's watermarks without * requiring writeback. * * This number of dirtyable pages is the base value of which the * user-configurable dirty ratio is the effective number of pages that * are allowed to be actually dirtied. Per individual zone, or * globally by using the sum of dirtyable pages over all zones. * * Because the user is allowed to specify the dirty limit globally as * absolute number of bytes, calculating the per-zone dirty limit can * require translating the configured limit into a percentage of * global dirtyable memory first. */ /** * node_dirtyable_memory - number of dirtyable pages in a node * @pgdat: the node * * Return: the node's number of pages potentially available for dirty * page cache. This is the base value for the per-node dirty limits. */ static unsigned long node_dirtyable_memory(struct pglist_data *pgdat) { unsigned long nr_pages = 0; int z; for (z = 0; z < MAX_NR_ZONES; z++) { struct zone *zone = pgdat->node_zones + z; if (!populated_zone(zone)) continue; nr_pages += zone_page_state(zone, NR_FREE_PAGES); } /* * Pages reserved for the kernel should not be considered * dirtyable, to prevent a situation where reclaim has to * clean pages in order to balance the zones. */ nr_pages -= min(nr_pages, pgdat->totalreserve_pages); nr_pages += node_page_state(pgdat, NR_INACTIVE_FILE); nr_pages += node_page_state(pgdat, NR_ACTIVE_FILE); return nr_pages; } static unsigned long highmem_dirtyable_memory(unsigned long total) { #ifdef CONFIG_HIGHMEM int node; unsigned long x = 0; int i; for_each_node_state(node, N_HIGH_MEMORY) { for (i = ZONE_NORMAL + 1; i < MAX_NR_ZONES; i++) { struct zone *z; unsigned long nr_pages; if (!is_highmem_idx(i)) continue; z = &NODE_DATA(node)->node_zones[i]; if (!populated_zone(z)) continue; nr_pages = zone_page_state(z, NR_FREE_PAGES); /* watch for underflows */ nr_pages -= min(nr_pages, high_wmark_pages(z)); nr_pages += zone_page_state(z, NR_ZONE_INACTIVE_FILE); nr_pages += zone_page_state(z, NR_ZONE_ACTIVE_FILE); x += nr_pages; } } /* * Make sure that the number of highmem pages is never larger * than the number of the total dirtyable memory. This can only * occur in very strange VM situations but we want to make sure * that this does not occur. */ return min(x, total); #else return 0; #endif } /** * global_dirtyable_memory - number of globally dirtyable pages * * Return: the global number of pages potentially available for dirty * page cache. This is the base value for the global dirty limits. */ static unsigned long global_dirtyable_memory(void) { unsigned long x; x = global_zone_page_state(NR_FREE_PAGES); /* * Pages reserved for the kernel should not be considered * dirtyable, to prevent a situation where reclaim has to * clean pages in order to balance the zones. */ x -= min(x, totalreserve_pages); x += global_node_page_state(NR_INACTIVE_FILE); x += global_node_page_state(NR_ACTIVE_FILE); if (!vm_highmem_is_dirtyable) x -= highmem_dirtyable_memory(x); return x + 1; /* Ensure that we never return 0 */ } /** * domain_dirty_limits - calculate thresh and bg_thresh for a wb_domain * @dtc: dirty_throttle_control of interest * * Calculate @dtc->thresh and ->bg_thresh considering * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller * must ensure that @dtc->avail is set before calling this function. The * dirty limits will be lifted by 1/4 for real-time tasks. */ static void domain_dirty_limits(struct dirty_throttle_control *dtc) { const unsigned long available_memory = dtc->avail; struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc); unsigned long bytes = vm_dirty_bytes; unsigned long bg_bytes = dirty_background_bytes; /* convert ratios to per-PAGE_SIZE for higher precision */ unsigned long ratio = (vm_dirty_ratio * PAGE_SIZE) / 100; unsigned long bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100; unsigned long thresh; unsigned long bg_thresh; struct task_struct *tsk; /* gdtc is !NULL iff @dtc is for memcg domain */ if (gdtc) { unsigned long global_avail = gdtc->avail; /* * The byte settings can't be applied directly to memcg * domains. Convert them to ratios by scaling against * globally available memory. As the ratios are in * per-PAGE_SIZE, they can be obtained by dividing bytes by * number of pages. */ if (bytes) ratio = min(DIV_ROUND_UP(bytes, global_avail), PAGE_SIZE); if (bg_bytes) bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail), PAGE_SIZE); bytes = bg_bytes = 0; } if (bytes) thresh = DIV_ROUND_UP(bytes, PAGE_SIZE); else thresh = (ratio * available_memory) / PAGE_SIZE; if (bg_bytes) bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE); else bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE; if (bg_thresh >= thresh) bg_thresh = thresh / 2; tsk = current; if (rt_task(tsk)) { bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32; thresh += thresh / 4 + global_wb_domain.dirty_limit / 32; } dtc->thresh = thresh; dtc->bg_thresh = bg_thresh; /* we should eventually report the domain in the TP */ if (!gdtc) trace_global_dirty_state(bg_thresh, thresh); } /** * global_dirty_limits - background-writeback and dirty-throttling thresholds * @pbackground: out parameter for bg_thresh * @pdirty: out parameter for thresh * * Calculate bg_thresh and thresh for global_wb_domain. See * domain_dirty_limits() for details. */ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) { struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB }; gdtc.avail = global_dirtyable_memory(); domain_dirty_limits(&gdtc); *pbackground = gdtc.bg_thresh; *pdirty = gdtc.thresh; } /** * node_dirty_limit - maximum number of dirty pages allowed in a node * @pgdat: the node * * Return: the maximum number of dirty pages allowed in a node, based * on the node's dirtyable memory. */ static unsigned long node_dirty_limit(struct pglist_data *pgdat) { unsigned long node_memory = node_dirtyable_memory(pgdat); struct task_struct *tsk = current; unsigned long dirty; if (vm_dirty_bytes) dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) * node_memory / global_dirtyable_memory(); else dirty = vm_dirty_ratio * node_memory / 100; if (rt_task(tsk)) dirty += dirty / 4; return dirty; } /** * node_dirty_ok - tells whether a node is within its dirty limits * @pgdat: the node to check * * Return: %true when the dirty pages in @pgdat are within the node's * dirty limit, %false if the limit is exceeded. */ bool node_dirty_ok(struct pglist_data *pgdat) { unsigned long limit = node_dirty_limit(pgdat); unsigned long nr_pages = 0; nr_pages += node_page_state(pgdat, NR_FILE_DIRTY); nr_pages += node_page_state(pgdat, NR_WRITEBACK); return nr_pages <= limit; } #ifdef CONFIG_SYSCTL static int dirty_background_ratio_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) dirty_background_bytes = 0; return ret; } static int dirty_background_bytes_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) dirty_background_ratio = 0; return ret; } static int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int old_ratio = vm_dirty_ratio; int ret; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_ratio != old_ratio) { writeback_set_ratelimit(); vm_dirty_bytes = 0; } return ret; } static int dirty_bytes_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { unsigned long old_bytes = vm_dirty_bytes; int ret; ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_bytes != old_bytes) { writeback_set_ratelimit(); vm_dirty_ratio = 0; } return ret; } #endif static unsigned long wp_next_time(unsigned long cur_time) { cur_time += VM_COMPLETIONS_PERIOD_LEN; /* 0 has a special meaning... */ if (!cur_time) return 1; return cur_time; } static void wb_domain_writeout_add(struct wb_domain *dom, struct fprop_local_percpu *completions, unsigned int max_prop_frac, long nr) { __fprop_add_percpu_max(&dom->completions, completions, max_prop_frac, nr); /* First event after period switching was turned off? */ if (unlikely(!dom->period_time)) { /* * We can race with other __bdi_writeout_inc calls here but * it does not cause any harm since the resulting time when * timer will fire and what is in writeout_period_time will be * roughly the same. */ dom->period_time = wp_next_time(jiffies); mod_timer(&dom->period_timer, dom->period_time); } } /* * Increment @wb's writeout completion count and the global writeout * completion count. Called from __folio_end_writeback(). */ static inline void __wb_writeout_add(struct bdi_writeback *wb, long nr) { struct wb_domain *cgdom; wb_stat_mod(wb, WB_WRITTEN, nr); wb_domain_writeout_add(&global_wb_domain, &wb->completions, wb->bdi->max_prop_frac, nr); cgdom = mem_cgroup_wb_domain(wb); if (cgdom) wb_domain_writeout_add(cgdom, wb_memcg_completions(wb), wb->bdi->max_prop_frac, nr); } void wb_writeout_inc(struct bdi_writeback *wb) { unsigned long flags; local_irq_save(flags); __wb_writeout_add(wb, 1); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(wb_writeout_inc); /* * On idle system, we can be called long after we scheduled because we use * deferred timers so count with missed periods. */ static void writeout_period(struct timer_list *t) { struct wb_domain *dom = from_timer(dom, t, period_timer); int miss_periods = (jiffies - dom->period_time) / VM_COMPLETIONS_PERIOD_LEN; if (fprop_new_period(&dom->completions, miss_periods + 1)) { dom->period_time = wp_next_time(dom->period_time + miss_periods * VM_COMPLETIONS_PERIOD_LEN); mod_timer(&dom->period_timer, dom->period_time); } else { /* * Aging has zeroed all fractions. Stop wasting CPU on period * updates. */ dom->period_time = 0; } } int wb_domain_init(struct wb_domain *dom, gfp_t gfp) { memset(dom, 0, sizeof(*dom)); spin_lock_init(&dom->lock); timer_setup(&dom->period_timer, writeout_period, TIMER_DEFERRABLE); dom->dirty_limit_tstamp = jiffies; return fprop_global_init(&dom->completions, gfp); } #ifdef CONFIG_CGROUP_WRITEBACK void wb_domain_exit(struct wb_domain *dom) { del_timer_sync(&dom->period_timer); fprop_global_destroy(&dom->completions); } #endif /* * bdi_min_ratio keeps the sum of the minimum dirty shares of all * registered backing devices, which, for obvious reasons, can not * exceed 100%. */ static unsigned int bdi_min_ratio; static int bdi_check_pages_limit(unsigned long pages) { unsigned long max_dirty_pages = global_dirtyable_memory(); if (pages > max_dirty_pages) return -EINVAL; return 0; } static unsigned long bdi_ratio_from_pages(unsigned long pages) { unsigned long background_thresh; unsigned long dirty_thresh; unsigned long ratio; global_dirty_limits(&background_thresh, &dirty_thresh); ratio = div64_u64(pages * 100ULL * BDI_RATIO_SCALE, dirty_thresh); return ratio; } static u64 bdi_get_bytes(unsigned int ratio) { unsigned long background_thresh; unsigned long dirty_thresh; u64 bytes; global_dirty_limits(&background_thresh, &dirty_thresh); bytes = (dirty_thresh * PAGE_SIZE * ratio) / BDI_RATIO_SCALE / 100; return bytes; } static int __bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) { unsigned int delta; int ret = 0; if (min_ratio > 100 * BDI_RATIO_SCALE) return -EINVAL; min_ratio *= BDI_RATIO_SCALE; spin_lock_bh(&bdi_lock); if (min_ratio > bdi->max_ratio) { ret = -EINVAL; } else { if (min_ratio < bdi->min_ratio) { delta = bdi->min_ratio - min_ratio; bdi_min_ratio -= delta; bdi->min_ratio = min_ratio; } else { delta = min_ratio - bdi->min_ratio; if (bdi_min_ratio + delta < 100 * BDI_RATIO_SCALE) { bdi_min_ratio += delta; bdi->min_ratio = min_ratio; } else { ret = -EINVAL; } } } spin_unlock_bh(&bdi_lock); return ret; } static int __bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio) { int ret = 0; if (max_ratio > 100 * BDI_RATIO_SCALE) return -EINVAL; spin_lock_bh(&bdi_lock); if (bdi->min_ratio > max_ratio) { ret = -EINVAL; } else { bdi->max_ratio = max_ratio; bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100; } spin_unlock_bh(&bdi_lock); return ret; } int bdi_set_min_ratio_no_scale(struct backing_dev_info *bdi, unsigned int min_ratio) { return __bdi_set_min_ratio(bdi, min_ratio); } int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ratio) { return __bdi_set_max_ratio(bdi, max_ratio); } int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) { return __bdi_set_min_ratio(bdi, min_ratio * BDI_RATIO_SCALE); } int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio) { return __bdi_set_max_ratio(bdi, max_ratio * BDI_RATIO_SCALE); } EXPORT_SYMBOL(bdi_set_max_ratio); u64 bdi_get_min_bytes(struct backing_dev_info *bdi) { return bdi_get_bytes(bdi->min_ratio); } int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes) { int ret; unsigned long pages = min_bytes >> PAGE_SHIFT; unsigned long min_ratio; ret = bdi_check_pages_limit(pages); if (ret) return ret; min_ratio = bdi_ratio_from_pages(pages); return __bdi_set_min_ratio(bdi, min_ratio); } u64 bdi_get_max_bytes(struct backing_dev_info *bdi) { return bdi_get_bytes(bdi->max_ratio); } int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes) { int ret; unsigned long pages = max_bytes >> PAGE_SHIFT; unsigned long max_ratio; ret = bdi_check_pages_limit(pages); if (ret) return ret; max_ratio = bdi_ratio_from_pages(pages); return __bdi_set_max_ratio(bdi, max_ratio); } int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit) { if (strict_limit > 1) return -EINVAL; spin_lock_bh(&bdi_lock); if (strict_limit) bdi->capabilities |= BDI_CAP_STRICTLIMIT; else bdi->capabilities &= ~BDI_CAP_STRICTLIMIT; spin_unlock_bh(&bdi_lock); return 0; } static unsigned long dirty_freerun_ceiling(unsigned long thresh, unsigned long bg_thresh) { return (thresh + bg_thresh) / 2; } static unsigned long hard_dirty_limit(struct wb_domain *dom, unsigned long thresh) { return max(thresh, dom->dirty_limit); } /* * Memory which can be further allocated to a memcg domain is capped by * system-wide clean memory excluding the amount being used in the domain. */ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc, unsigned long filepages, unsigned long headroom) { struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc); unsigned long clean = filepages - min(filepages, mdtc->dirty); unsigned long global_clean = gdtc->avail - min(gdtc->avail, gdtc->dirty); unsigned long other_clean = global_clean - min(global_clean, clean); mdtc->avail = filepages + min(headroom, other_clean); } /** * __wb_calc_thresh - @wb's share of dirty throttling threshold * @dtc: dirty_throttle_context of interest * * Note that balance_dirty_pages() will only seriously take it as a hard limit * when sleeping max_pause per page is not enough to keep the dirty pages under * control. For example, when the device is completely stalled due to some error * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key. * In the other normal situations, it acts more gently by throttling the tasks * more (rather than completely block them) when the wb dirty pages go high. * * It allocates high/low dirty limits to fast/slow devices, in order to prevent * - starving fast devices * - piling up dirty pages (that will take long time to sync) on slow devices * * The wb's share of dirty limit will be adapting to its throughput and * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. * * Return: @wb's dirty limit in pages. The term "dirty" in the context of * dirty balancing includes all PG_dirty and PG_writeback pages. */ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) { struct wb_domain *dom = dtc_dom(dtc); unsigned long thresh = dtc->thresh; u64 wb_thresh; unsigned long numerator, denominator; unsigned long wb_min_ratio, wb_max_ratio; /* * Calculate this BDI's share of the thresh ratio. */ fprop_fraction_percpu(&dom->completions, dtc->wb_completions, &numerator, &denominator); wb_thresh = (thresh * (100 * BDI_RATIO_SCALE - bdi_min_ratio)) / (100 * BDI_RATIO_SCALE); wb_thresh *= numerator; wb_thresh = div64_ul(wb_thresh, denominator); wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio); wb_thresh += (thresh * wb_min_ratio) / (100 * BDI_RATIO_SCALE); if (wb_thresh > (thresh * wb_max_ratio) / (100 * BDI_RATIO_SCALE)) wb_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE); return wb_thresh; } unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh) { struct dirty_throttle_control gdtc = { GDTC_INIT(wb), .thresh = thresh }; return __wb_calc_thresh(&gdtc); } /* * setpoint - dirty 3 * f(dirty) := 1.0 + (----------------) * limit - setpoint * * it's a 3rd order polynomial that subjects to * * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast * (2) f(setpoint) = 1.0 => the balance point * (3) f(limit) = 0 => the hard limit * (4) df/dx <= 0 => negative feedback control * (5) the closer to setpoint, the smaller |df/dx| (and the reverse) * => fast response on large errors; small oscillation near setpoint */ static long long pos_ratio_polynom(unsigned long setpoint, unsigned long dirty, unsigned long limit) { long long pos_ratio; long x; x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, (limit - setpoint) | 1); pos_ratio = x; pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; pos_ratio += 1 << RATELIMIT_CALC_SHIFT; return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT); } /* * Dirty position control. * * (o) global/bdi setpoints * * We want the dirty pages be balanced around the global/wb setpoints. * When the number of dirty pages is higher/lower than the setpoint, the * dirty position control ratio (and hence task dirty ratelimit) will be * decreased/increased to bring the dirty pages back to the setpoint. * * pos_ratio = 1 << RATELIMIT_CALC_SHIFT * * if (dirty < setpoint) scale up pos_ratio * if (dirty > setpoint) scale down pos_ratio * * if (wb_dirty < wb_setpoint) scale up pos_ratio * if (wb_dirty > wb_setpoint) scale down pos_ratio * * task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT * * (o) global control line * * ^ pos_ratio * | * | |<===== global dirty control scope ======>| * 2.0 * * * * * * * * | .* * | . * * | . * * | . * * | . * * | . * * 1.0 ................................* * | . . * * | . . * * | . . * * | . . * * | . . * * 0 +------------.------------------.----------------------*-------------> * freerun^ setpoint^ limit^ dirty pages * * (o) wb control line * * ^ pos_ratio * | * | * * | * * | * * | * * | * |<=========== span ============>| * 1.0 .......................* * | . * * | . * * | . * * | . * * | . * * | . * * | . * * | . * * | . * * | . * * | . * * 1/4 ...............................................* * * * * * * * * * * * * | . . * | . . * | . . * 0 +----------------------.-------------------------------.-------------> * wb_setpoint^ x_intercept^ * * The wb control line won't drop below pos_ratio=1/4, so that wb_dirty can * be smoothly throttled down to normal if it starts high in situations like * - start writing to a slow SD card and a fast disk at the same time. The SD * card's wb_dirty may rush to many times higher than wb_setpoint. * - the wb dirty thresh drops quickly due to change of JBOD workload */ static void wb_position_ratio(struct dirty_throttle_control *dtc) { struct bdi_writeback *wb = dtc->wb; unsigned long write_bw = READ_ONCE(wb->avg_write_bandwidth); unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh); unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh); unsigned long wb_thresh = dtc->wb_thresh; unsigned long x_intercept; unsigned long setpoint; /* dirty pages' target balance point */ unsigned long wb_setpoint; unsigned long span; long long pos_ratio; /* for scaling up/down the rate limit */ long x; dtc->pos_ratio = 0; if (unlikely(dtc->dirty >= limit)) return; /* * global setpoint * * See comment for pos_ratio_polynom(). */ setpoint = (freerun + limit) / 2; pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit); /* * The strictlimit feature is a tool preventing mistrusted filesystems * from growing a large number of dirty pages before throttling. For * such filesystems balance_dirty_pages always checks wb counters * against wb limits. Even if global "nr_dirty" is under "freerun". * This is especially important for fuse which sets bdi->max_ratio to * 1% by default. Without strictlimit feature, fuse writeback may * consume arbitrary amount of RAM because it is accounted in * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty". * * Here, in wb_position_ratio(), we calculate pos_ratio based on * two values: wb_dirty and wb_thresh. Let's consider an example: * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global * limits are set by default to 10% and 20% (background and throttle). * Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages. * wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is * about ~6K pages (as the average of background and throttle wb * limits). The 3rd order polynomial will provide positive feedback if * wb_dirty is under wb_setpoint and vice versa. * * Note, that we cannot use global counters in these calculations * because we want to throttle process writing to a strictlimit wb * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB * in the example above). */ if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { long long wb_pos_ratio; if (dtc->wb_dirty < 8) { dtc->pos_ratio = min_t(long long, pos_ratio * 2, 2 << RATELIMIT_CALC_SHIFT); return; } if (dtc->wb_dirty >= wb_thresh) return; wb_setpoint = dirty_freerun_ceiling(wb_thresh, dtc->wb_bg_thresh); if (wb_setpoint == 0 || wb_setpoint == wb_thresh) return; wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty, wb_thresh); /* * Typically, for strictlimit case, wb_setpoint << setpoint * and pos_ratio >> wb_pos_ratio. In the other words global * state ("dirty") is not limiting factor and we have to * make decision based on wb counters. But there is an * important case when global pos_ratio should get precedence: * global limits are exceeded (e.g. due to activities on other * wb's) while given strictlimit wb is below limit. * * "pos_ratio * wb_pos_ratio" would work for the case above, * but it would look too non-natural for the case of all * activity in the system coming from a single strictlimit wb * with bdi->max_ratio == 100%. * * Note that min() below somewhat changes the dynamics of the * control system. Normally, pos_ratio value can be well over 3 * (when globally we are at freerun and wb is well below wb * setpoint). Now the maximum pos_ratio in the same situation * is 2. We might want to tweak this if we observe the control * system is too slow to adapt. */ dtc->pos_ratio = min(pos_ratio, wb_pos_ratio); return; } /* * We have computed basic pos_ratio above based on global situation. If * the wb is over/under its share of dirty pages, we want to scale * pos_ratio further down/up. That is done by the following mechanism. */ /* * wb setpoint * * f(wb_dirty) := 1.0 + k * (wb_dirty - wb_setpoint) * * x_intercept - wb_dirty * := -------------------------- * x_intercept - wb_setpoint * * The main wb control line is a linear function that subjects to * * (1) f(wb_setpoint) = 1.0 * (2) k = - 1 / (8 * write_bw) (in single wb case) * or equally: x_intercept = wb_setpoint + 8 * write_bw * * For single wb case, the dirty pages are observed to fluctuate * regularly within range * [wb_setpoint - write_bw/2, wb_setpoint + write_bw/2] * for various filesystems, where (2) can yield in a reasonable 12.5% * fluctuation range for pos_ratio. * * For JBOD case, wb_thresh (not wb_dirty!) could fluctuate up to its * own size, so move the slope over accordingly and choose a slope that * yields 100% pos_ratio fluctuation on suddenly doubled wb_thresh. */ if (unlikely(wb_thresh > dtc->thresh)) wb_thresh = dtc->thresh; /* * It's very possible that wb_thresh is close to 0 not because the * device is slow, but that it has remained inactive for long time. * Honour such devices a reasonable good (hopefully IO efficient) * threshold, so that the occasional writes won't be blocked and active * writes can rampup the threshold quickly. */ wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8); /* * scale global setpoint to wb's: * wb_setpoint = setpoint * wb_thresh / thresh */ x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1); wb_setpoint = setpoint * (u64)x >> 16; /* * Use span=(8*write_bw) in single wb case as indicated by * (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case. * * wb_thresh thresh - wb_thresh * span = --------- * (8 * write_bw) + ------------------ * wb_thresh * thresh thresh */ span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16; x_intercept = wb_setpoint + span; if (dtc->wb_dirty < x_intercept - span / 4) { pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty), (x_intercept - wb_setpoint) | 1); } else pos_ratio /= 4; /* * wb reserve area, safeguard against dirty pool underrun and disk idle * It may push the desired control point of global dirty pages higher * than setpoint. */ x_intercept = wb_thresh / 2; if (dtc->wb_dirty < x_intercept) { if (dtc->wb_dirty > x_intercept / 8) pos_ratio = div_u64(pos_ratio * x_intercept, dtc->wb_dirty); else pos_ratio *= 8; } dtc->pos_ratio = pos_ratio; } static void wb_update_write_bandwidth(struct bdi_writeback *wb, unsigned long elapsed, unsigned long written) { const unsigned long period = roundup_pow_of_two(3 * HZ); unsigned long avg = wb->avg_write_bandwidth; unsigned long old = wb->write_bandwidth; u64 bw; /* * bw = written * HZ / elapsed * * bw * elapsed + write_bandwidth * (period - elapsed) * write_bandwidth = --------------------------------------------------- * period * * @written may have decreased due to folio_redirty_for_writepage(). * Avoid underflowing @bw calculation. */ bw = written - min(written, wb->written_stamp); bw *= HZ; if (unlikely(elapsed > period)) { bw = div64_ul(bw, elapsed); avg = bw; goto out; } bw += (u64)wb->write_bandwidth * (period - elapsed); bw >>= ilog2(period); /* * one more level of smoothing, for filtering out sudden spikes */ if (avg > old && old >= (unsigned long)bw) avg -= (avg - old) >> 3; if (avg < old && old <= (unsigned long)bw) avg += (old - avg) >> 3; out: /* keep avg > 0 to guarantee that tot > 0 if there are dirty wbs */ avg = max(avg, 1LU); if (wb_has_dirty_io(wb)) { long delta = avg - wb->avg_write_bandwidth; WARN_ON_ONCE(atomic_long_add_return(delta, &wb->bdi->tot_write_bandwidth) <= 0); } wb->write_bandwidth = bw; WRITE_ONCE(wb->avg_write_bandwidth, avg); } static void update_dirty_limit(struct dirty_throttle_control *dtc) { struct wb_domain *dom = dtc_dom(dtc); unsigned long thresh = dtc->thresh; unsigned long limit = dom->dirty_limit; /* * Follow up in one step. */ if (limit < thresh) { limit = thresh; goto update; } /* * Follow down slowly. Use the higher one as the target, because thresh * may drop below dirty. This is exactly the reason to introduce * dom->dirty_limit which is guaranteed to lie above the dirty pages. */ thresh = max(thresh, dtc->dirty); if (limit > thresh) { limit -= (limit - thresh) >> 5; goto update; } return; update: dom->dirty_limit = limit; } static void domain_update_dirty_limit(struct dirty_throttle_control *dtc, unsigned long now) { struct wb_domain *dom = dtc_dom(dtc); /* * check locklessly first to optimize away locking for the most time */ if (time_before(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) return; spin_lock(&dom->lock); if (time_after_eq(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) { update_dirty_limit(dtc); dom->dirty_limit_tstamp = now; } spin_unlock(&dom->lock); } /* * Maintain wb->dirty_ratelimit, the base dirty throttle rate. * * Normal wb tasks will be curbed at or below it in long term. * Obviously it should be around (write_bw / N) when there are N dd tasks. */ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, unsigned long dirtied, unsigned long elapsed) { struct bdi_writeback *wb = dtc->wb; unsigned long dirty = dtc->dirty; unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh); unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh); unsigned long setpoint = (freerun + limit) / 2; unsigned long write_bw = wb->avg_write_bandwidth; unsigned long dirty_ratelimit = wb->dirty_ratelimit; unsigned long dirty_rate; unsigned long task_ratelimit; unsigned long balanced_dirty_ratelimit; unsigned long step; unsigned long x; unsigned long shift; /* * The dirty rate will match the writeout rate in long term, except * when dirty pages are truncated by userspace or re-dirtied by FS. */ dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed; /* * task_ratelimit reflects each dd's dirty rate for the past 200ms. */ task_ratelimit = (u64)dirty_ratelimit * dtc->pos_ratio >> RATELIMIT_CALC_SHIFT; task_ratelimit++; /* it helps rampup dirty_ratelimit from tiny values */ /* * A linear estimation of the "balanced" throttle rate. The theory is, * if there are N dd tasks, each throttled at task_ratelimit, the wb's * dirty_rate will be measured to be (N * task_ratelimit). So the below * formula will yield the balanced rate limit (write_bw / N). * * Note that the expanded form is not a pure rate feedback: * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) (1) * but also takes pos_ratio into account: * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio (2) * * (1) is not realistic because pos_ratio also takes part in balancing * the dirty rate. Consider the state * pos_ratio = 0.5 (3) * rate = 2 * (write_bw / N) (4) * If (1) is used, it will stuck in that state! Because each dd will * be throttled at * task_ratelimit = pos_ratio * rate = (write_bw / N) (5) * yielding * dirty_rate = N * task_ratelimit = write_bw (6) * put (6) into (1) we get * rate_(i+1) = rate_(i) (7) * * So we end up using (2) to always keep * rate_(i+1) ~= (write_bw / N) (8) * regardless of the value of pos_ratio. As long as (8) is satisfied, * pos_ratio is able to drive itself to 1.0, which is not only where * the dirty count meet the setpoint, but also where the slope of * pos_ratio is most flat and hence task_ratelimit is least fluctuated. */ balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw, dirty_rate | 1); /* * balanced_dirty_ratelimit ~= (write_bw / N) <= write_bw */ if (unlikely(balanced_dirty_ratelimit > write_bw)) balanced_dirty_ratelimit = write_bw; /* * We could safely do this and return immediately: * * wb->dirty_ratelimit = balanced_dirty_ratelimit; * * However to get a more stable dirty_ratelimit, the below elaborated * code makes use of task_ratelimit to filter out singular points and * limit the step size. * * The below code essentially only uses the relative value of * * task_ratelimit - dirty_ratelimit * = (pos_ratio - 1) * dirty_ratelimit * * which reflects the direction and size of dirty position error. */ /* * dirty_ratelimit will follow balanced_dirty_ratelimit iff * task_ratelimit is on the same side of dirty_ratelimit, too. * For example, when * - dirty_ratelimit > balanced_dirty_ratelimit * - dirty_ratelimit > task_ratelimit (dirty pages are above setpoint) * lowering dirty_ratelimit will help meet both the position and rate * control targets. Otherwise, don't update dirty_ratelimit if it will * only help meet the rate target. After all, what the users ultimately * feel and care are stable dirty rate and small position error. * * |task_ratelimit - dirty_ratelimit| is used to limit the step size * and filter out the singular points of balanced_dirty_ratelimit. Which * keeps jumping around randomly and can even leap far away at times * due to the small 200ms estimation period of dirty_rate (we want to * keep that period small to reduce time lags). */ step = 0; /* * For strictlimit case, calculations above were based on wb counters * and limits (starting from pos_ratio = wb_position_ratio() and up to * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate). * Hence, to calculate "step" properly, we have to use wb_dirty as * "dirty" and wb_setpoint as "setpoint". * * We rampup dirty_ratelimit forcibly if wb_dirty is low because * it's possible that wb_thresh is close to zero due to inactivity * of backing device. */ if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { dirty = dtc->wb_dirty; if (dtc->wb_dirty < 8) setpoint = dtc->wb_dirty + 1; else setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2; } if (dirty < setpoint) { x = min3(wb->balanced_dirty_ratelimit, balanced_dirty_ratelimit, task_ratelimit); if (dirty_ratelimit < x) step = x - dirty_ratelimit; } else { x = max3(wb->balanced_dirty_ratelimit, balanced_dirty_ratelimit, task_ratelimit); if (dirty_ratelimit > x) step = dirty_ratelimit - x; } /* * Don't pursue 100% rate matching. It's impossible since the balanced * rate itself is constantly fluctuating. So decrease the track speed * when it gets close to the target. Helps eliminate pointless tremors. */ shift = dirty_ratelimit / (2 * step + 1); if (shift < BITS_PER_LONG) step = DIV_ROUND_UP(step >> shift, 8); else step = 0; if (dirty_ratelimit < balanced_dirty_ratelimit) dirty_ratelimit += step; else dirty_ratelimit -= step; WRITE_ONCE(wb->dirty_ratelimit, max(dirty_ratelimit, 1UL)); wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit; trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit); } static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, struct dirty_throttle_control *mdtc, bool update_ratelimit) { struct bdi_writeback *wb = gdtc->wb; unsigned long now = jiffies; unsigned long elapsed; unsigned long dirtied; unsigned long written; spin_lock(&wb->list_lock); /* * Lockless checks for elapsed time are racy and delayed update after * IO completion doesn't do it at all (to make sure written pages are * accounted reasonably quickly). Make sure elapsed >= 1 to avoid * division errors. */ elapsed = max(now - wb->bw_time_stamp, 1UL); dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]); written = percpu_counter_read(&wb->stat[WB_WRITTEN]); if (update_ratelimit) { domain_update_dirty_limit(gdtc, now); wb_update_dirty_ratelimit(gdtc, dirtied, elapsed); /* * @mdtc is always NULL if !CGROUP_WRITEBACK but the * compiler has no way to figure that out. Help it. */ if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) { domain_update_dirty_limit(mdtc, now); wb_update_dirty_ratelimit(mdtc, dirtied, elapsed); } } wb_update_write_bandwidth(wb, elapsed, written); wb->dirtied_stamp = dirtied; wb->written_stamp = written; WRITE_ONCE(wb->bw_time_stamp, now); spin_unlock(&wb->list_lock); } void wb_update_bandwidth(struct bdi_writeback *wb) { struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; __wb_update_bandwidth(&gdtc, NULL, false); } /* Interval after which we consider wb idle and don't estimate bandwidth */ #define WB_BANDWIDTH_IDLE_JIF (HZ) static void wb_bandwidth_estimate_start(struct bdi_writeback *wb) { unsigned long now = jiffies; unsigned long elapsed = now - READ_ONCE(wb->bw_time_stamp); if (elapsed > WB_BANDWIDTH_IDLE_JIF && !atomic_read(&wb->writeback_inodes)) { spin_lock(&wb->list_lock); wb->dirtied_stamp = wb_stat(wb, WB_DIRTIED); wb->written_stamp = wb_stat(wb, WB_WRITTEN); WRITE_ONCE(wb->bw_time_stamp, now); spin_unlock(&wb->list_lock); } } /* * After a task dirtied this many pages, balance_dirty_pages_ratelimited() * will look to see if it needs to start dirty throttling. * * If dirty_poll_interval is too low, big NUMA machines will call the expensive * global_zone_page_state() too often. So scale it near-sqrt to the safety margin * (the number of pages we may dirty without exceeding the dirty limits). */ static unsigned long dirty_poll_interval(unsigned long dirty, unsigned long thresh) { if (thresh > dirty) return 1UL << (ilog2(thresh - dirty) >> 1); return 1; } static unsigned long wb_max_pause(struct bdi_writeback *wb, unsigned long wb_dirty) { unsigned long bw = READ_ONCE(wb->avg_write_bandwidth); unsigned long t; /* * Limit pause time for small memory systems. If sleeping for too long * time, a small pool of dirty/writeback pages may go empty and disk go * idle. * * 8 serves as the safety ratio. */ t = wb_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); t++; return min_t(unsigned long, t, MAX_PAUSE); } static long wb_min_pause(struct bdi_writeback *wb, long max_pause, unsigned long task_ratelimit, unsigned long dirty_ratelimit, int *nr_dirtied_pause) { long hi = ilog2(READ_ONCE(wb->avg_write_bandwidth)); long lo = ilog2(READ_ONCE(wb->dirty_ratelimit)); long t; /* target pause */ long pause; /* estimated next pause */ int pages; /* target nr_dirtied_pause */ /* target for 10ms pause on 1-dd case */ t = max(1, HZ / 100); /* * Scale up pause time for concurrent dirtiers in order to reduce CPU * overheads. * * (N * 10ms) on 2^N concurrent tasks. */ if (hi > lo) t += (hi - lo) * (10 * HZ) / 1024; /* * This is a bit convoluted. We try to base the next nr_dirtied_pause * on the much more stable dirty_ratelimit. However the next pause time * will be computed based on task_ratelimit and the two rate limits may * depart considerably at some time. Especially if task_ratelimit goes * below dirty_ratelimit/2 and the target pause is max_pause, the next * pause time will be max_pause*2 _trimmed down_ to max_pause. As a * result task_ratelimit won't be executed faithfully, which could * eventually bring down dirty_ratelimit. * * We apply two rules to fix it up: * 1) try to estimate the next pause time and if necessary, use a lower * nr_dirtied_pause so as not to exceed max_pause. When this happens, * nr_dirtied_pause will be "dancing" with task_ratelimit. * 2) limit the target pause time to max_pause/2, so that the normal * small fluctuations of task_ratelimit won't trigger rule (1) and * nr_dirtied_pause will remain as stable as dirty_ratelimit. */ t = min(t, 1 + max_pause / 2); pages = dirty_ratelimit * t / roundup_pow_of_two(HZ); /* * Tiny nr_dirtied_pause is found to hurt I/O performance in the test * case fio-mmap-randwrite-64k, which does 16*{sync read, async write}. * When the 16 consecutive reads are often interrupted by some dirty * throttling pause during the async writes, cfq will go into idles * (deadline is fine). So push nr_dirtied_pause as high as possible * until reaches DIRTY_POLL_THRESH=32 pages. */ if (pages < DIRTY_POLL_THRESH) { t = max_pause; pages = dirty_ratelimit * t / roundup_pow_of_two(HZ); if (pages > DIRTY_POLL_THRESH) { pages = DIRTY_POLL_THRESH; t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit; } } pause = HZ * pages / (task_ratelimit + 1); if (pause > max_pause) { t = max_pause; pages = task_ratelimit * t / roundup_pow_of_two(HZ); } *nr_dirtied_pause = pages; /* * The minimal pause time will normally be half the target pause time. */ return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t; } static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) { struct bdi_writeback *wb = dtc->wb; unsigned long wb_reclaimable; /* * wb_thresh is not treated as some limiting factor as * dirty_thresh, due to reasons * - in JBOD setup, wb_thresh can fluctuate a lot * - in a system with HDD and USB key, the USB key may somehow * go into state (wb_dirty >> wb_thresh) either because * wb_dirty starts high, or because wb_thresh drops low. * In this case we don't want to hard throttle the USB key * dirtiers for 100 seconds until wb_dirty drops under * wb_thresh. Instead the auxiliary wb control line in * wb_position_ratio() will let the dirtier task progress * at some rate <= (write_bw / 2) for bringing down wb_dirty. */ dtc->wb_thresh = __wb_calc_thresh(dtc); dtc->wb_bg_thresh = dtc->thresh ? div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0; /* * In order to avoid the stacked BDI deadlock we need * to ensure we accurately count the 'dirty' pages when * the threshold is low. * * Otherwise it would be possible to get thresh+n pages * reported dirty, even though there are thresh-m pages * actually dirty; with m+n sitting in the percpu * deltas. */ if (dtc->wb_thresh < 2 * wb_stat_error()) { wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK); } else { wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE); dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK); } } /* * balance_dirty_pages() must be called by processes which are generating dirty * data. It looks at the number of dirty pages in the machine and will force * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2. * If we're over `background_thresh' then the writeback threads are woken to * perform some writeout. */ static int balance_dirty_pages(struct bdi_writeback *wb, unsigned long pages_dirtied, unsigned int flags) { struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) }; struct dirty_throttle_control * const gdtc = &gdtc_stor; struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? &mdtc_stor : NULL; struct dirty_throttle_control *sdtc; unsigned long nr_reclaimable; /* = file_dirty */ long period; long pause; long max_pause; long min_pause; int nr_dirtied_pause; bool dirty_exceeded = false; unsigned long task_ratelimit; unsigned long dirty_ratelimit; struct backing_dev_info *bdi = wb->bdi; bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; unsigned long start_time = jiffies; int ret = 0; for (;;) { unsigned long now = jiffies; unsigned long dirty, thresh, bg_thresh; unsigned long m_dirty = 0; /* stop bogus uninit warnings */ unsigned long m_thresh = 0; unsigned long m_bg_thresh = 0; nr_reclaimable = global_node_page_state(NR_FILE_DIRTY); gdtc->avail = global_dirtyable_memory(); gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK); domain_dirty_limits(gdtc); if (unlikely(strictlimit)) { wb_dirty_limits(gdtc); dirty = gdtc->wb_dirty; thresh = gdtc->wb_thresh; bg_thresh = gdtc->wb_bg_thresh; } else { dirty = gdtc->dirty; thresh = gdtc->thresh; bg_thresh = gdtc->bg_thresh; } if (mdtc) { unsigned long filepages, headroom, writeback; /* * If @wb belongs to !root memcg, repeat the same * basic calculations for the memcg domain. */ mem_cgroup_wb_stats(wb, &filepages, &headroom, &mdtc->dirty, &writeback); mdtc->dirty += writeback; mdtc_calc_avail(mdtc, filepages, headroom); domain_dirty_limits(mdtc); if (unlikely(strictlimit)) { wb_dirty_limits(mdtc); m_dirty = mdtc->wb_dirty; m_thresh = mdtc->wb_thresh; m_bg_thresh = mdtc->wb_bg_thresh; } else { m_dirty = mdtc->dirty; m_thresh = mdtc->thresh; m_bg_thresh = mdtc->bg_thresh; } } /* * In laptop mode, we wait until hitting the higher threshold * before starting background writeout, and then write out all * the way down to the lower threshold. So slow writers cause * minimal disk activity. * * In normal mode, we start background writeout at the lower * background_thresh, to keep the amount of dirty memory low. */ if (!laptop_mode && nr_reclaimable > gdtc->bg_thresh && !writeback_in_progress(wb)) wb_start_background_writeback(wb); /* * Throttle it only when the background writeback cannot * catch-up. This avoids (excessively) small writeouts * when the wb limits are ramping up in case of !strictlimit. * * In strictlimit case make decision based on the wb counters * and limits. Small writeouts when the wb limits are ramping * up are the price we consciously pay for strictlimit-ing. * * If memcg domain is in effect, @dirty should be under * both global and memcg freerun ceilings. */ if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) && (!mdtc || m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) { unsigned long intv; unsigned long m_intv; free_running: intv = dirty_poll_interval(dirty, thresh); m_intv = ULONG_MAX; current->dirty_paused_when = now; current->nr_dirtied = 0; if (mdtc) m_intv = dirty_poll_interval(m_dirty, m_thresh); current->nr_dirtied_pause = min(intv, m_intv); break; } /* Start writeback even when in laptop mode */ if (unlikely(!writeback_in_progress(wb))) wb_start_background_writeback(wb); mem_cgroup_flush_foreign(wb); /* * Calculate global domain's pos_ratio and select the * global dtc by default. */ if (!strictlimit) { wb_dirty_limits(gdtc); if ((current->flags & PF_LOCAL_THROTTLE) && gdtc->wb_dirty < dirty_freerun_ceiling(gdtc->wb_thresh, gdtc->wb_bg_thresh)) /* * LOCAL_THROTTLE tasks must not be throttled * when below the per-wb freerun ceiling. */ goto free_running; } dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) && ((gdtc->dirty > gdtc->thresh) || strictlimit); wb_position_ratio(gdtc); sdtc = gdtc; if (mdtc) { /* * If memcg domain is in effect, calculate its * pos_ratio. @wb should satisfy constraints from * both global and memcg domains. Choose the one * w/ lower pos_ratio. */ if (!strictlimit) { wb_dirty_limits(mdtc); if ((current->flags & PF_LOCAL_THROTTLE) && mdtc->wb_dirty < dirty_freerun_ceiling(mdtc->wb_thresh, mdtc->wb_bg_thresh)) /* * LOCAL_THROTTLE tasks must not be * throttled when below the per-wb * freerun ceiling. */ goto free_running; } dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) && ((mdtc->dirty > mdtc->thresh) || strictlimit); wb_position_ratio(mdtc); if (mdtc->pos_ratio < gdtc->pos_ratio) sdtc = mdtc; } if (dirty_exceeded != wb->dirty_exceeded) wb->dirty_exceeded = dirty_exceeded; if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) + BANDWIDTH_INTERVAL)) __wb_update_bandwidth(gdtc, mdtc, true); /* throttle according to the chosen dtc */ dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit); task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >> RATELIMIT_CALC_SHIFT; max_pause = wb_max_pause(wb, sdtc->wb_dirty); min_pause = wb_min_pause(wb, max_pause, task_ratelimit, dirty_ratelimit, &nr_dirtied_pause); if (unlikely(task_ratelimit == 0)) { period = max_pause; pause = max_pause; goto pause; } period = HZ * pages_dirtied / task_ratelimit; pause = period; if (current->dirty_paused_when) pause -= now - current->dirty_paused_when; /* * For less than 1s think time (ext3/4 may block the dirtier * for up to 800ms from time to time on 1-HDD; so does xfs, * however at much less frequency), try to compensate it in * future periods by updating the virtual time; otherwise just * do a reset, as it may be a light dirtier. */ if (pause < min_pause) { trace_balance_dirty_pages(wb, sdtc->thresh, sdtc->bg_thresh, sdtc->dirty, sdtc->wb_thresh, sdtc->wb_dirty, dirty_ratelimit, task_ratelimit, pages_dirtied, period, min(pause, 0L), start_time); if (pause < -HZ) { current->dirty_paused_when = now; current->nr_dirtied = 0; } else if (period) { current->dirty_paused_when += period; current->nr_dirtied = 0; } else if (current->nr_dirtied_pause <= pages_dirtied) current->nr_dirtied_pause += pages_dirtied; break; } if (unlikely(pause > max_pause)) { /* for occasional dropped task_ratelimit */ now += min(pause - max_pause, max_pause); pause = max_pause; } pause: trace_balance_dirty_pages(wb, sdtc->thresh, sdtc->bg_thresh, sdtc->dirty, sdtc->wb_thresh, sdtc->wb_dirty, dirty_ratelimit, task_ratelimit, pages_dirtied, period, pause, start_time); if (flags & BDP_ASYNC) { ret = -EAGAIN; break; } __set_current_state(TASK_KILLABLE); wb->dirty_sleep = now; io_schedule_timeout(pause); current->dirty_paused_when = now + pause; current->nr_dirtied = 0; current->nr_dirtied_pause = nr_dirtied_pause; /* * This is typically equal to (dirty < thresh) and can also * keep "1000+ dd on a slow USB stick" under control. */ if (task_ratelimit) break; /* * In the case of an unresponsive NFS server and the NFS dirty * pages exceeds dirty_thresh, give the other good wb's a pipe * to go through, so that tasks on them still remain responsive. * * In theory 1 page is enough to keep the consumer-producer * pipe going: the flusher cleans 1 page => the task dirties 1 * more page. However wb_dirty has accounting errors. So use * the larger and more IO friendly wb_stat_error. */ if (sdtc->wb_dirty <= wb_stat_error()) break; if (fatal_signal_pending(current)) break; } return ret; } static DEFINE_PER_CPU(int, bdp_ratelimits); /* * Normal tasks are throttled by * loop { * dirty tsk->nr_dirtied_pause pages; * take a snap in balance_dirty_pages(); * } * However there is a worst case. If every task exit immediately when dirtied * (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will never be * called to throttle the page dirties. The solution is to save the not yet * throttled page dirties in dirty_throttle_leaks on task exit and charge them * randomly into the running tasks. This works well for the above worst case, * as the new task will pick up and accumulate the old task's leaked dirty * count and eventually get throttled. */ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; /** * balance_dirty_pages_ratelimited_flags - Balance dirty memory state. * @mapping: address_space which was dirtied. * @flags: BDP flags. * * Processes which are dirtying memory should call in here once for each page * which was newly dirtied. The function will periodically check the system's * dirty state and will initiate writeback if needed. * * See balance_dirty_pages_ratelimited() for details. * * Return: If @flags contains BDP_ASYNC, it may return -EAGAIN to * indicate that memory is out of balance and the caller must wait * for I/O to complete. Otherwise, it will return 0 to indicate * that either memory was already in balance, or it was able to sleep * until the amount of dirty memory returned to balance. */ int balance_dirty_pages_ratelimited_flags(struct address_space *mapping, unsigned int flags) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); struct bdi_writeback *wb = NULL; int ratelimit; int ret = 0; int *p; if (!(bdi->capabilities & BDI_CAP_WRITEBACK)) return ret; if (inode_cgwb_enabled(inode)) wb = wb_get_create_current(bdi, GFP_KERNEL); if (!wb) wb = &bdi->wb; ratelimit = current->nr_dirtied_pause; if (wb->dirty_exceeded) ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10)); preempt_disable(); /* * This prevents one CPU to accumulate too many dirtied pages without * calling into balance_dirty_pages(), which can happen when there are * 1000+ tasks, all of them start dirtying pages at exactly the same * time, hence all honoured too large initial task->nr_dirtied_pause. */ p = this_cpu_ptr(&bdp_ratelimits); if (unlikely(current->nr_dirtied >= ratelimit)) *p = 0; else if (unlikely(*p >= ratelimit_pages)) { *p = 0; ratelimit = 0; } /* * Pick up the dirtied pages by the exited tasks. This avoids lots of * short-lived tasks (eg. gcc invocations in a kernel build) escaping * the dirty throttling and livelock other long-run dirtiers. */ p = this_cpu_ptr(&dirty_throttle_leaks); if (*p > 0 && current->nr_dirtied < ratelimit) { unsigned long nr_pages_dirtied; nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); *p -= nr_pages_dirtied; current->nr_dirtied += nr_pages_dirtied; } preempt_enable(); if (unlikely(current->nr_dirtied >= ratelimit)) ret = balance_dirty_pages(wb, current->nr_dirtied, flags); wb_put(wb); return ret; } EXPORT_SYMBOL_GPL(balance_dirty_pages_ratelimited_flags); /** * balance_dirty_pages_ratelimited - balance dirty memory state. * @mapping: address_space which was dirtied. * * Processes which are dirtying memory should call in here once for each page * which was newly dirtied. The function will periodically check the system's * dirty state and will initiate writeback if needed. * * Once we're over the dirty memory limit we decrease the ratelimiting * by a lot, to prevent individual processes from overshooting the limit * by (ratelimit_pages) each. */ void balance_dirty_pages_ratelimited(struct address_space *mapping) { balance_dirty_pages_ratelimited_flags(mapping, 0); } EXPORT_SYMBOL(balance_dirty_pages_ratelimited); /** * wb_over_bg_thresh - does @wb need to be written back? * @wb: bdi_writeback of interest * * Determines whether background writeback should keep writing @wb or it's * clean enough. * * Return: %true if writeback should continue. */ bool wb_over_bg_thresh(struct bdi_writeback *wb) { struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) }; struct dirty_throttle_control * const gdtc = &gdtc_stor; struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? &mdtc_stor : NULL; unsigned long reclaimable; unsigned long thresh; /* * Similar to balance_dirty_pages() but ignores pages being written * as we're trying to decide whether to put more under writeback. */ gdtc->avail = global_dirtyable_memory(); gdtc->dirty = global_node_page_state(NR_FILE_DIRTY); domain_dirty_limits(gdtc); if (gdtc->dirty > gdtc->bg_thresh) return true; thresh = wb_calc_thresh(gdtc->wb, gdtc->bg_thresh); if (thresh < 2 * wb_stat_error()) reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); else reclaimable = wb_stat(wb, WB_RECLAIMABLE); if (reclaimable > thresh) return true; if (mdtc) { unsigned long filepages, headroom, writeback; mem_cgroup_wb_stats(wb, &filepages, &headroom, &mdtc->dirty, &writeback); mdtc_calc_avail(mdtc, filepages, headroom); domain_dirty_limits(mdtc); /* ditto, ignore writeback */ if (mdtc->dirty > mdtc->bg_thresh) return true; thresh = wb_calc_thresh(mdtc->wb, mdtc->bg_thresh); if (thresh < 2 * wb_stat_error()) reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); else reclaimable = wb_stat(wb, WB_RECLAIMABLE); if (reclaimable > thresh) return true; } return false; } #ifdef CONFIG_SYSCTL /* * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ static int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { unsigned int old_interval = dirty_writeback_interval; int ret; ret = proc_dointvec(table, write, buffer, length, ppos); /* * Writing 0 to dirty_writeback_interval will disable periodic writeback * and a different non-zero value will wakeup the writeback threads. * wb_wakeup_delayed() would be more appropriate, but it's a pain to * iterate over all bdis and wbs. * The reason we do this is to make the change take effect immediately. */ if (!ret && write && dirty_writeback_interval && dirty_writeback_interval != old_interval) wakeup_flusher_threads(WB_REASON_PERIODIC); return ret; } #endif void laptop_mode_timer_fn(struct timer_list *t) { struct backing_dev_info *backing_dev_info = from_timer(backing_dev_info, t, laptop_mode_wb_timer); wakeup_flusher_threads_bdi(backing_dev_info, WB_REASON_LAPTOP_TIMER); } /* * We've spun up the disk and we're in laptop mode: schedule writeback * of all dirty data a few seconds from now. If the flush is already scheduled * then push it back - the user is still using the disk. */ void laptop_io_completion(struct backing_dev_info *info) { mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode); } /* * We're in laptop mode and we've just synced. The sync's writes will have * caused another writeback to be scheduled by laptop_io_completion. * Nothing needs to be written back anymore, so we unschedule the writeback. */ void laptop_sync_completion(void) { struct backing_dev_info *bdi; rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) del_timer(&bdi->laptop_mode_wb_timer); rcu_read_unlock(); } /* * If ratelimit_pages is too high then we can get into dirty-data overload * if a large number of processes all perform writes at the same time. * * Here we set ratelimit_pages to a level which ensures that when all CPUs are * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory * thresholds. */ void writeback_set_ratelimit(void) { struct wb_domain *dom = &global_wb_domain; unsigned long background_thresh; unsigned long dirty_thresh; global_dirty_limits(&background_thresh, &dirty_thresh); dom->dirty_limit = dirty_thresh; ratelimit_pages = dirty_thresh / (num_online_cpus() * 32); if (ratelimit_pages < 16) ratelimit_pages = 16; } static int page_writeback_cpu_online(unsigned int cpu) { writeback_set_ratelimit(); return 0; } #ifdef CONFIG_SYSCTL /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE; static struct ctl_table vm_page_writeback_sysctls[] = { { .procname = "dirty_background_ratio", .data = &dirty_background_ratio, .maxlen = sizeof(dirty_background_ratio), .mode = 0644, .proc_handler = dirty_background_ratio_handler, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "dirty_background_bytes", .data = &dirty_background_bytes, .maxlen = sizeof(dirty_background_bytes), .mode = 0644, .proc_handler = dirty_background_bytes_handler, .extra1 = SYSCTL_LONG_ONE, }, { .procname = "dirty_ratio", .data = &vm_dirty_ratio, .maxlen = sizeof(vm_dirty_ratio), .mode = 0644, .proc_handler = dirty_ratio_handler, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "dirty_bytes", .data = &vm_dirty_bytes, .maxlen = sizeof(vm_dirty_bytes), .mode = 0644, .proc_handler = dirty_bytes_handler, .extra1 = (void *)&dirty_bytes_min, }, { .procname = "dirty_writeback_centisecs", .data = &dirty_writeback_interval, .maxlen = sizeof(dirty_writeback_interval), .mode = 0644, .proc_handler = dirty_writeback_centisecs_handler, }, { .procname = "dirty_expire_centisecs", .data = &dirty_expire_interval, .maxlen = sizeof(dirty_expire_interval), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, #ifdef CONFIG_HIGHMEM { .procname = "highmem_is_dirtyable", .data = &vm_highmem_is_dirtyable, .maxlen = sizeof(vm_highmem_is_dirtyable), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #endif { .procname = "laptop_mode", .data = &laptop_mode, .maxlen = sizeof(laptop_mode), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, {} }; #endif /* * Called early on to tune the page writeback dirty limits. * * We used to scale dirty pages according to how total memory * related to pages that could be allocated for buffers. * * However, that was when we used "dirty_ratio" to scale with * all memory, and we don't do that any more. "dirty_ratio" * is now applied to total non-HIGHPAGE memory, and as such we can't * get into the old insane situation any more where we had * large amounts of dirty pages compared to a small amount of * non-HIGHMEM memory. * * But we might still want to scale the dirty_ratio by how * much memory the box has.. */ void __init page_writeback_init(void) { BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL)); cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mm/writeback:online", page_writeback_cpu_online, NULL); cpuhp_setup_state(CPUHP_MM_WRITEBACK_DEAD, "mm/writeback:dead", NULL, page_writeback_cpu_online); #ifdef CONFIG_SYSCTL register_sysctl_init("vm", vm_page_writeback_sysctls); #endif } /** * tag_pages_for_writeback - tag pages to be written by write_cache_pages * @mapping: address space structure to write * @start: starting page index * @end: ending page index (inclusive) * * This function scans the page range from @start to @end (inclusive) and tags * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is * that write_cache_pages (or whoever calls this function) will then use * TOWRITE tag to identify pages eligible for writeback. This mechanism is * used to avoid livelocking of writeback by a process steadily creating new * dirty pages in the file (thus it is important for this function to be quick * so that it can tag pages faster than a dirtying process can create them). */ void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end) { XA_STATE(xas, &mapping->i_pages, start); unsigned int tagged = 0; void *page; xas_lock_irq(&xas); xas_for_each_marked(&xas, page, end, PAGECACHE_TAG_DIRTY) { xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE); if (++tagged % XA_CHECK_SCHED) continue; xas_pause(&xas); xas_unlock_irq(&xas); cond_resched(); xas_lock_irq(&xas); } xas_unlock_irq(&xas); } EXPORT_SYMBOL(tag_pages_for_writeback); /** * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. * @mapping: address space structure to write * @wbc: subtract the number of written pages from *@wbc->nr_to_write * @writepage: function called for each page * @data: data passed to writepage function * * If a page is already under I/O, write_cache_pages() skips it, even * if it's dirty. This is desirable behaviour for memory-cleaning writeback, * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() * and msync() need to guarantee that all the data which was dirty at the time * the call was made get new I/O started against them. If wbc->sync_mode is * WB_SYNC_ALL then we were called for data integrity and we must wait for * existing IO to complete. * * To avoid livelocks (when other process dirties new pages), we first tag * pages which should be written back with TOWRITE tag and only then start * writing them. For data-integrity sync we have to be careful so that we do * not miss some pages (e.g., because some other process has cleared TOWRITE * tag we set). The rule we follow is that TOWRITE tag can be cleared only * by the process clearing the DIRTY tag (and submitting the page for IO). * * To avoid deadlocks between range_cyclic writeback and callers that hold * pages in PageWriteback to aggregate IO until write_cache_pages() returns, * we do not loop back to the start of the file. Doing so causes a page * lock/page writeback access order inversion - we should only ever lock * multiple pages in ascending page->index order, and looping back to the start * of the file violates that rule and causes deadlocks. * * Return: %0 on success, negative error code otherwise */ int write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, void *data) { int ret = 0; int done = 0; int error; struct folio_batch fbatch; int nr_folios; pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; int range_whole = 0; xa_mark_t tag; folio_batch_init(&fbatch); if (wbc->range_cyclic) { index = mapping->writeback_index; /* prev offset */ end = -1; } else { index = wbc->range_start >> PAGE_SHIFT; end = wbc->range_end >> PAGE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; } if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) { tag_pages_for_writeback(mapping, index, end); tag = PAGECACHE_TAG_TOWRITE; } else { tag = PAGECACHE_TAG_DIRTY; } done_index = index; while (!done && (index <= end)) { int i; nr_folios = filemap_get_folios_tag(mapping, &index, end, tag, &fbatch); if (nr_folios == 0) break; for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; unsigned long nr; done_index = folio->index; folio_lock(folio); /* * Page truncated or invalidated. We can freely skip it * then, even for data integrity operations: the page * has disappeared concurrently, so there could be no * real expectation of this data integrity operation * even if there is now a new, dirty page at the same * pagecache address. */ if (unlikely(folio->mapping != mapping)) { continue_unlock: folio_unlock(folio); continue; } if (!folio_test_dirty(folio)) { /* someone wrote it for us */ goto continue_unlock; } if (folio_test_writeback(folio)) { if (wbc->sync_mode != WB_SYNC_NONE) folio_wait_writeback(folio); else goto continue_unlock; } BUG_ON(folio_test_writeback(folio)); if (!folio_clear_dirty_for_io(folio)) goto continue_unlock; trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); error = writepage(folio, wbc, data); nr = folio_nr_pages(folio); if (unlikely(error)) { /* * Handle errors according to the type of * writeback. There's no need to continue for * background writeback. Just push done_index * past this page so media errors won't choke * writeout for the entire file. For integrity * writeback, we must process the entire dirty * set regardless of errors because the fs may * still have state to clear for each page. In * that case we continue processing and return * the first error. */ if (error == AOP_WRITEPAGE_ACTIVATE) { folio_unlock(folio); error = 0; } else if (wbc->sync_mode != WB_SYNC_ALL) { ret = error; done_index = folio->index + nr; done = 1; break; } if (!ret) ret = error; } /* * We stop writing back only if we are not doing * integrity sync. In case of integrity sync we have to * keep going until we have written all the pages * we tagged for writeback prior to entering this loop. */ wbc->nr_to_write -= nr; if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) { done = 1; break; } } folio_batch_release(&fbatch); cond_resched(); } /* * If we hit the last page and there is more work to be done: wrap * back the index back to the start of the file for the next * time we are called. */ if (wbc->range_cyclic && !done) done_index = 0; if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; return ret; } EXPORT_SYMBOL(write_cache_pages); static int writepage_cb(struct folio *folio, struct writeback_control *wbc, void *data) { struct address_space *mapping = data; int ret = mapping->a_ops->writepage(&folio->page, wbc); mapping_set_error(mapping, ret); return ret; } int do_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret; struct bdi_writeback *wb; if (wbc->nr_to_write <= 0) return 0; wb = inode_to_wb_wbc(mapping->host, wbc); wb_bandwidth_estimate_start(wb); while (1) { if (mapping->a_ops->writepages) { ret = mapping->a_ops->writepages(mapping, wbc); } else if (mapping->a_ops->writepage) { struct blk_plug plug; blk_start_plug(&plug); ret = write_cache_pages(mapping, wbc, writepage_cb, mapping); blk_finish_plug(&plug); } else { /* deal with chardevs and other special files */ ret = 0; } if (ret != -ENOMEM || wbc->sync_mode != WB_SYNC_ALL) break; /* * Lacking an allocation context or the locality or writeback * state of any of the inode's pages, throttle based on * writeback activity on the local node. It's as good a * guess as any. */ reclaim_throttle(NODE_DATA(numa_node_id()), VMSCAN_THROTTLE_WRITEBACK); } /* * Usually few pages are written by now from those we've just submitted * but if there's constant writeback being submitted, this makes sure * writeback bandwidth is updated once in a while. */ if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) + BANDWIDTH_INTERVAL)) wb_update_bandwidth(wb); return ret; } /* * For address_spaces which do not use buffers nor write back. */ bool noop_dirty_folio(struct address_space *mapping, struct folio *folio) { if (!folio_test_dirty(folio)) return !folio_test_set_dirty(folio); return false; } EXPORT_SYMBOL(noop_dirty_folio); /* * Helper function for set_page_dirty family. * * Caller must hold folio_memcg_lock(). * * NOTE: This relies on being atomic wrt interrupts. */ static void folio_account_dirtied(struct folio *folio, struct address_space *mapping) { struct inode *inode = mapping->host; trace_writeback_dirty_folio(folio, mapping); if (mapping_can_writeback(mapping)) { struct bdi_writeback *wb; long nr = folio_nr_pages(folio); inode_attach_wb(inode, folio); wb = inode_to_wb(inode); __lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr); __zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr); __node_stat_mod_folio(folio, NR_DIRTIED, nr); wb_stat_mod(wb, WB_RECLAIMABLE, nr); wb_stat_mod(wb, WB_DIRTIED, nr); task_io_account_write(nr * PAGE_SIZE); current->nr_dirtied += nr; __this_cpu_add(bdp_ratelimits, nr); mem_cgroup_track_foreign_dirty(folio, wb); } } /* * Helper function for deaccounting dirty page without writeback. * * Caller must hold folio_memcg_lock(). */ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb) { long nr = folio_nr_pages(folio); lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr); zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); wb_stat_mod(wb, WB_RECLAIMABLE, -nr); task_io_account_cancelled_write(nr * PAGE_SIZE); } /* * Mark the folio dirty, and set it dirty in the page cache, and mark * the inode dirty. * * If warn is true, then emit a warning if the folio is not uptodate and has * not been truncated. * * The caller must hold folio_memcg_lock(). Most callers have the folio * locked. A few have the folio blocked from truncation through other * means (eg zap_vma_pages() has it mapped and is holding the page table * lock). This can also be called from mark_buffer_dirty(), which I * cannot prove is always protected against truncate. */ void __folio_mark_dirty(struct folio *folio, struct address_space *mapping, int warn) { unsigned long flags; xa_lock_irqsave(&mapping->i_pages, flags); if (folio->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !folio_test_uptodate(folio)); folio_account_dirtied(folio, mapping); __xa_set_mark(&mapping->i_pages, folio_index(folio), PAGECACHE_TAG_DIRTY); } xa_unlock_irqrestore(&mapping->i_pages, flags); } /** * filemap_dirty_folio - Mark a folio dirty for filesystems which do not use buffer_heads. * @mapping: Address space this folio belongs to. * @folio: Folio to be marked as dirty. * * Filesystems which do not use buffer heads should call this function * from their set_page_dirty address space operation. It ignores the * contents of folio_get_private(), so if the filesystem marks individual * blocks as dirty, the filesystem should handle that itself. * * This is also sometimes used by filesystems which use buffer_heads when * a single buffer is being dirtied: we want to set the folio dirty in * that case, but not all the buffers. This is a "bottom-up" dirtying, * whereas block_dirty_folio() is a "top-down" dirtying. * * The caller must ensure this doesn't race with truncation. Most will * simply hold the folio lock, but e.g. zap_pte_range() calls with the * folio mapped and the pte lock held, which also locks out truncation. */ bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio) { folio_memcg_lock(folio); if (folio_test_set_dirty(folio)) { folio_memcg_unlock(folio); return false; } __folio_mark_dirty(folio, mapping, !folio_test_private(folio)); folio_memcg_unlock(folio); if (mapping->host) { /* !PageAnon && !swapper_space */ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } return true; } EXPORT_SYMBOL(filemap_dirty_folio); /** * folio_redirty_for_writepage - Decline to write a dirty folio. * @wbc: The writeback control. * @folio: The folio. * * When a writepage implementation decides that it doesn't want to write * @folio for some reason, it should call this function, unlock @folio and * return 0. * * Return: True if we redirtied the folio. False if someone else dirtied * it first. */ bool folio_redirty_for_writepage(struct writeback_control *wbc, struct folio *folio) { struct address_space *mapping = folio->mapping; long nr = folio_nr_pages(folio); bool ret; wbc->pages_skipped += nr; ret = filemap_dirty_folio(mapping, folio); if (mapping && mapping_can_writeback(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; struct wb_lock_cookie cookie = {}; wb = unlocked_inode_to_wb_begin(inode, &cookie); current->nr_dirtied -= nr; node_stat_mod_folio(folio, NR_DIRTIED, -nr); wb_stat_mod(wb, WB_DIRTIED, -nr); unlocked_inode_to_wb_end(inode, &cookie); } return ret; } EXPORT_SYMBOL(folio_redirty_for_writepage); /** * folio_mark_dirty - Mark a folio as being modified. * @folio: The folio. * * The folio may not be truncated while this function is running. * Holding the folio lock is sufficient to prevent truncation, but some * callers cannot acquire a sleeping lock. These callers instead hold * the page table lock for a page table which contains at least one page * in this folio. Truncation will block on the page table lock as it * unmaps pages before removing the folio from its mapping. * * Return: True if the folio was newly dirtied, false if it was already dirty. */ bool folio_mark_dirty(struct folio *folio) { struct address_space *mapping = folio_mapping(folio); if (likely(mapping)) { /* * readahead/folio_deactivate could remain * PG_readahead/PG_reclaim due to race with folio_end_writeback * About readahead, if the folio is written, the flags would be * reset. So no problem. * About folio_deactivate, if the folio is redirtied, * the flag will be reset. So no problem. but if the * folio is used by readahead it will confuse readahead * and make it restart the size rampup process. But it's * a trivial problem. */ if (folio_test_reclaim(folio)) folio_clear_reclaim(folio); return mapping->a_ops->dirty_folio(mapping, folio); } return noop_dirty_folio(mapping, folio); } EXPORT_SYMBOL(folio_mark_dirty); /* * set_page_dirty() is racy if the caller has no reference against * page->mapping->host, and if the page is unlocked. This is because another * CPU could truncate the page off the mapping and then free the mapping. * * Usually, the page _is_ locked, or the caller is a user-space process which * holds a reference on the inode by having an open file. * * In other cases, the page should be locked before running set_page_dirty(). */ int set_page_dirty_lock(struct page *page) { int ret; lock_page(page); ret = set_page_dirty(page); unlock_page(page); return ret; } EXPORT_SYMBOL(set_page_dirty_lock); /* * This cancels just the dirty bit on the kernel page itself, it does NOT * actually remove dirty bits on any mmap's that may be around. It also * leaves the page tagged dirty, so any sync activity will still find it on * the dirty lists, and in particular, clear_page_dirty_for_io() will still * look at the dirty bits in the VM. * * Doing this should *normally* only ever be done when a page is truncated, * and is not actually mapped anywhere at all. However, fs/buffer.c does * this when it notices that somebody has cleaned out all the buffers on a * page without actually doing it through the VM. Can you say "ext3 is * horribly ugly"? Thought you could. */ void __folio_cancel_dirty(struct folio *folio) { struct address_space *mapping = folio_mapping(folio); if (mapping_can_writeback(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; struct wb_lock_cookie cookie = {}; folio_memcg_lock(folio); wb = unlocked_inode_to_wb_begin(inode, &cookie); if (folio_test_clear_dirty(folio)) folio_account_cleaned(folio, wb); unlocked_inode_to_wb_end(inode, &cookie); folio_memcg_unlock(folio); } else { folio_clear_dirty(folio); } } EXPORT_SYMBOL(__folio_cancel_dirty); /* * Clear a folio's dirty flag, while caring for dirty memory accounting. * Returns true if the folio was previously dirty. * * This is for preparing to put the folio under writeout. We leave * the folio tagged as dirty in the xarray so that a concurrent * write-for-sync can discover it via a PAGECACHE_TAG_DIRTY walk. * The ->writepage implementation will run either folio_start_writeback() * or folio_mark_dirty(), at which stage we bring the folio's dirty flag * and xarray dirty tag back into sync. * * This incoherency between the folio's dirty flag and xarray tag is * unfortunate, but it only exists while the folio is locked. */ bool folio_clear_dirty_for_io(struct folio *folio) { struct address_space *mapping = folio_mapping(folio); bool ret = false; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (mapping && mapping_can_writeback(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; struct wb_lock_cookie cookie = {}; /* * Yes, Virginia, this is indeed insane. * * We use this sequence to make sure that * (a) we account for dirty stats properly * (b) we tell the low-level filesystem to * mark the whole folio dirty if it was * dirty in a pagetable. Only to then * (c) clean the folio again and return 1 to * cause the writeback. * * This way we avoid all nasty races with the * dirty bit in multiple places and clearing * them concurrently from different threads. * * Note! Normally the "folio_mark_dirty(folio)" * has no effect on the actual dirty bit - since * that will already usually be set. But we * need the side effects, and it can help us * avoid races. * * We basically use the folio "master dirty bit" * as a serialization point for all the different * threads doing their things. */ if (folio_mkclean(folio)) folio_mark_dirty(folio); /* * We carefully synchronise fault handlers against * installing a dirty pte and marking the folio dirty * at this point. We do this by having them hold the * page lock while dirtying the folio, and folios are * always locked coming in here, so we get the desired * exclusion. */ wb = unlocked_inode_to_wb_begin(inode, &cookie); if (folio_test_clear_dirty(folio)) { long nr = folio_nr_pages(folio); lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr); zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); wb_stat_mod(wb, WB_RECLAIMABLE, -nr); ret = true; } unlocked_inode_to_wb_end(inode, &cookie); return ret; } return folio_test_clear_dirty(folio); } EXPORT_SYMBOL(folio_clear_dirty_for_io); static void wb_inode_writeback_start(struct bdi_writeback *wb) { atomic_inc(&wb->writeback_inodes); } static void wb_inode_writeback_end(struct bdi_writeback *wb) { unsigned long flags; atomic_dec(&wb->writeback_inodes); /* * Make sure estimate of writeback throughput gets updated after * writeback completed. We delay the update by BANDWIDTH_INTERVAL * (which is the interval other bandwidth updates use for batching) so * that if multiple inodes end writeback at a similar time, they get * batched into one bandwidth update. */ spin_lock_irqsave(&wb->work_lock, flags); if (test_bit(WB_registered, &wb->state)) queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL); spin_unlock_irqrestore(&wb->work_lock, flags); } bool __folio_end_writeback(struct folio *folio) { long nr = folio_nr_pages(folio); struct address_space *mapping = folio_mapping(folio); bool ret; folio_memcg_lock(folio); if (mapping && mapping_use_writeback_tags(mapping)) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; xa_lock_irqsave(&mapping->i_pages, flags); ret = folio_test_clear_writeback(folio); if (ret) { __xa_clear_mark(&mapping->i_pages, folio_index(folio), PAGECACHE_TAG_WRITEBACK); if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) { struct bdi_writeback *wb = inode_to_wb(inode); wb_stat_mod(wb, WB_WRITEBACK, -nr); __wb_writeout_add(wb, nr); if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) wb_inode_writeback_end(wb); } } if (mapping->host && !mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) sb_clear_inode_writeback(mapping->host); xa_unlock_irqrestore(&mapping->i_pages, flags); } else { ret = folio_test_clear_writeback(folio); } if (ret) { lruvec_stat_mod_folio(folio, NR_WRITEBACK, -nr); zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); node_stat_mod_folio(folio, NR_WRITTEN, nr); } folio_memcg_unlock(folio); return ret; } bool __folio_start_writeback(struct folio *folio, bool keep_write) { long nr = folio_nr_pages(folio); struct address_space *mapping = folio_mapping(folio); bool ret; int access_ret; folio_memcg_lock(folio); if (mapping && mapping_use_writeback_tags(mapping)) { XA_STATE(xas, &mapping->i_pages, folio_index(folio)); struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; xas_lock_irqsave(&xas, flags); xas_load(&xas); ret = folio_test_set_writeback(folio); if (!ret) { bool on_wblist; on_wblist = mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) { struct bdi_writeback *wb = inode_to_wb(inode); wb_stat_mod(wb, WB_WRITEBACK, nr); if (!on_wblist) wb_inode_writeback_start(wb); } /* * We can come through here when swapping * anonymous folios, so we don't necessarily * have an inode to track for sync. */ if (mapping->host && !on_wblist) sb_mark_inode_writeback(mapping->host); } if (!folio_test_dirty(folio)) xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); if (!keep_write) xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); xas_unlock_irqrestore(&xas, flags); } else { ret = folio_test_set_writeback(folio); } if (!ret) { lruvec_stat_mod_folio(folio, NR_WRITEBACK, nr); zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr); } folio_memcg_unlock(folio); access_ret = arch_make_folio_accessible(folio); /* * If writeback has been triggered on a page that cannot be made * accessible, it is too late to recover here. */ VM_BUG_ON_FOLIO(access_ret != 0, folio); return ret; } EXPORT_SYMBOL(__folio_start_writeback); /** * folio_wait_writeback - Wait for a folio to finish writeback. * @folio: The folio to wait for. * * If the folio is currently being written back to storage, wait for the * I/O to complete. * * Context: Sleeps. Must be called in process context and with * no spinlocks held. Caller should hold a reference on the folio. * If the folio is not locked, writeback may start again after writeback * has finished. */ void folio_wait_writeback(struct folio *folio) { while (folio_test_writeback(folio)) { trace_folio_wait_writeback(folio, folio_mapping(folio)); folio_wait_bit(folio, PG_writeback); } } EXPORT_SYMBOL_GPL(folio_wait_writeback); /** * folio_wait_writeback_killable - Wait for a folio to finish writeback. * @folio: The folio to wait for. * * If the folio is currently being written back to storage, wait for the * I/O to complete or a fatal signal to arrive. * * Context: Sleeps. Must be called in process context and with * no spinlocks held. Caller should hold a reference on the folio. * If the folio is not locked, writeback may start again after writeback * has finished. * Return: 0 on success, -EINTR if we get a fatal signal while waiting. */ int folio_wait_writeback_killable(struct folio *folio) { while (folio_test_writeback(folio)) { trace_folio_wait_writeback(folio, folio_mapping(folio)); if (folio_wait_bit_killable(folio, PG_writeback)) return -EINTR; } return 0; } EXPORT_SYMBOL_GPL(folio_wait_writeback_killable); /** * folio_wait_stable() - wait for writeback to finish, if necessary. * @folio: The folio to wait on. * * This function determines if the given folio is related to a backing * device that requires folio contents to be held stable during writeback. * If so, then it will wait for any pending writeback to complete. * * Context: Sleeps. Must be called in process context and with * no spinlocks held. Caller should hold a reference on the folio. * If the folio is not locked, writeback may start again after writeback * has finished. */ void folio_wait_stable(struct folio *folio) { if (folio_inode(folio)->i_sb->s_iflags & SB_I_STABLE_WRITES) folio_wait_writeback(folio); } EXPORT_SYMBOL_GPL(folio_wait_stable);
8 2757 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HUGE_MM_H #define _LINUX_HUGE_MM_H #include <linux/sched/coredump.h> #include <linux/mm_types.h> #include <linux/fs.h> /* only for vma_is_dax() */ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf); int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); void huge_pmd_set_accessed(struct vm_fault *vmf); int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, pud_t *dst_pud, pud_t *src_pud, unsigned long addr, struct vm_area_struct *vma); #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud); #else static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud) { } #endif vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf); bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long next); int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr); int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr); bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd); int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, pgprot_t newprot, unsigned long cp_flags); vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write); vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_UNSUPPORTED, TRANSPARENT_HUGEPAGE_FLAG, TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG, }; struct kobject; struct kobj_attribute; ssize_t single_hugepage_flag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count, enum transparent_hugepage_flag flag); ssize_t single_hugepage_flag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf, enum transparent_hugepage_flag flag); extern struct kobj_attribute shmem_enabled_attr; #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER) #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define HPAGE_PMD_SHIFT PMD_SHIFT #define HPAGE_PMD_SIZE ((1UL) << HPAGE_PMD_SHIFT) #define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1)) #define HPAGE_PUD_SHIFT PUD_SHIFT #define HPAGE_PUD_SIZE ((1UL) << HPAGE_PUD_SHIFT) #define HPAGE_PUD_MASK (~(HPAGE_PUD_SIZE - 1)) extern unsigned long transparent_hugepage_flags; #define hugepage_flags_enabled() \ (transparent_hugepage_flags & \ ((1<<TRANSPARENT_HUGEPAGE_FLAG) | \ (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG))) #define hugepage_flags_always() \ (transparent_hugepage_flags & \ (1<<TRANSPARENT_HUGEPAGE_FLAG)) /* * Do the below checks: * - For file vma, check if the linear page offset of vma is * HPAGE_PMD_NR aligned within the file. The hugepage is * guaranteed to be hugepage-aligned within the file, but we must * check that the PMD-aligned addresses in the VMA map to * PMD-aligned offsets within the file, else the hugepage will * not be PMD-mappable. * - For all vmas, check if the haddr is in an aligned HPAGE_PMD_SIZE * area. */ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, unsigned long addr) { unsigned long haddr; /* Don't have to check pgoff for anonymous vma */ if (!vma_is_anonymous(vma)) { if (!IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, HPAGE_PMD_NR)) return false; } haddr = addr & HPAGE_PMD_MASK; if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) return false; return true; } static inline bool file_thp_enabled(struct vm_area_struct *vma) { struct inode *inode; if (!vma->vm_file) return false; inode = vma->vm_file->f_inode; return (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS)) && (vma->vm_flags & VM_EXEC) && !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode); } bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, bool smaps, bool in_pf, bool enforce_sysfs); #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG)) unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); void folio_prep_large_rmappable(struct folio *folio); bool can_split_folio(struct folio *folio, int *pextra_pins); int split_huge_page_to_list(struct page *page, struct list_head *list); static inline int split_huge_page(struct page *page) { return split_huge_page_to_list(page, NULL); } void deferred_split_folio(struct folio *folio); void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze, struct folio *folio); #define split_huge_pmd(__vma, __pmd, __address) \ do { \ pmd_t *____pmd = (__pmd); \ if (is_swap_pmd(*____pmd) || pmd_trans_huge(*____pmd) \ || pmd_devmap(*____pmd)) \ __split_huge_pmd(__vma, __pmd, __address, \ false, NULL); \ } while (0) void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze, struct folio *folio); void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, unsigned long address); #define split_huge_pud(__vma, __pud, __address) \ do { \ pud_t *____pud = (__pud); \ if (pud_trans_huge(*____pud) \ || pud_devmap(*____pud)) \ __split_huge_pud(__vma, __pud, __address); \ } while (0) int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice); int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end); void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next); spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma); spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma); static inline int is_swap_pmd(pmd_t pmd) { return !pmd_none(pmd) && !pmd_present(pmd); } /* mmap_lock must be held on entry */ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) return __pmd_trans_huge_lock(pmd, vma); else return NULL; } static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) { if (pud_trans_huge(*pud) || pud_devmap(*pud)) return __pud_trans_huge_lock(pud, vma); else return NULL; } /** * folio_test_pmd_mappable - Can we map this folio with a PMD? * @folio: The folio to test */ static inline bool folio_test_pmd_mappable(struct folio *folio) { return folio_order(folio) >= HPAGE_PMD_ORDER; } struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap); struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud, int flags, struct dev_pagemap **pgmap); vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf); extern struct page *huge_zero_page; extern unsigned long huge_zero_pfn; static inline bool is_huge_zero_page(struct page *page) { return READ_ONCE(huge_zero_page) == page; } static inline bool is_huge_zero_pmd(pmd_t pmd) { return pmd_present(pmd) && READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd); } static inline bool is_huge_zero_pud(pud_t pud) { return false; } struct page *mm_get_huge_zero_page(struct mm_struct *mm); void mm_put_huge_zero_page(struct mm_struct *mm); #define mk_huge_pmd(page, prot) pmd_mkhuge(mk_pmd(page, prot)) static inline bool thp_migration_supported(void) { return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION); } #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_SIZE ({ BUILD_BUG(); 0; }) #define HPAGE_PUD_SHIFT ({ BUILD_BUG(); 0; }) #define HPAGE_PUD_MASK ({ BUILD_BUG(); 0; }) #define HPAGE_PUD_SIZE ({ BUILD_BUG(); 0; }) static inline bool folio_test_pmd_mappable(struct folio *folio) { return false; } static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, unsigned long addr) { return false; } static inline bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, bool smaps, bool in_pf, bool enforce_sysfs) { return false; } static inline void folio_prep_large_rmappable(struct folio *folio) {} #define transparent_hugepage_flags 0UL #define thp_get_unmapped_area NULL static inline bool can_split_folio(struct folio *folio, int *pextra_pins) { return false; } static inline int split_huge_page_to_list(struct page *page, struct list_head *list) { return 0; } static inline int split_huge_page(struct page *page) { return 0; } static inline void deferred_split_folio(struct folio *folio) {} #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) static inline void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze, struct folio *folio) {} static inline void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze, struct folio *folio) {} #define split_huge_pud(__vma, __pmd, __address) \ do { } while (0) static inline int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { return -EINVAL; } static inline int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end) { return -EINVAL; } static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next) { } static inline int is_swap_pmd(pmd_t pmd) { return 0; } static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { return NULL; } static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) { return NULL; } static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) { return 0; } static inline bool is_huge_zero_page(struct page *page) { return false; } static inline bool is_huge_zero_pmd(pmd_t pmd) { return false; } static inline bool is_huge_zero_pud(pud_t pud) { return false; } static inline void mm_put_huge_zero_page(struct mm_struct *mm) { return; } static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap) { return NULL; } static inline struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud, int flags, struct dev_pagemap **pgmap) { return NULL; } static inline bool thp_migration_supported(void) { return false; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline int split_folio_to_list(struct folio *folio, struct list_head *list) { return split_huge_page_to_list(&folio->page, list); } static inline int split_folio(struct folio *folio) { return split_folio_to_list(folio, NULL); } /* * archs that select ARCH_WANTS_THP_SWAP but don't support THP_SWP due to * limitations in the implementation like arm64 MTE can override this to * false */ #ifndef arch_thp_swp_supported static inline bool arch_thp_swp_supported(void) { return true; } #endif #endif /* _LINUX_HUGE_MM_H */
4825 2431 1638 2312 2296 2617 4302 174 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 // SPDX-License-Identifier: GPL-2.0 /* * fs/ext4/extents_status.h * * Written by Yongqiang Yang <xiaoqiangnk@gmail.com> * Modified by * Allison Henderson <achender@linux.vnet.ibm.com> * Zheng Liu <wenqing.lz@taobao.com> * */ #ifndef _EXT4_EXTENTS_STATUS_H #define _EXT4_EXTENTS_STATUS_H /* * Turn on ES_DEBUG__ to get lots of info about extent status operations. */ #ifdef ES_DEBUG__ #define es_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) #else #define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif /* * With ES_AGGRESSIVE_TEST defined, the result of es caching will be * checked with old map_block's result. */ #define ES_AGGRESSIVE_TEST__ /* * These flags live in the high bits of extent_status.es_pblk */ enum { ES_WRITTEN_B, ES_UNWRITTEN_B, ES_DELAYED_B, ES_HOLE_B, ES_REFERENCED_B, ES_FLAGS }; #define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS) #define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT) #define EXTENT_STATUS_WRITTEN (1 << ES_WRITTEN_B) #define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B) #define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B) #define EXTENT_STATUS_HOLE (1 << ES_HOLE_B) #define EXTENT_STATUS_REFERENCED (1 << ES_REFERENCED_B) #define ES_TYPE_MASK ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \ EXTENT_STATUS_UNWRITTEN | \ EXTENT_STATUS_DELAYED | \ EXTENT_STATUS_HOLE) << ES_SHIFT) struct ext4_sb_info; struct ext4_extent; struct extent_status { struct rb_node rb_node; ext4_lblk_t es_lblk; /* first logical block extent covers */ ext4_lblk_t es_len; /* length of extent in block */ ext4_fsblk_t es_pblk; /* first physical block */ }; struct ext4_es_tree { struct rb_root root; struct extent_status *cache_es; /* recently accessed extent */ }; struct ext4_es_stats { unsigned long es_stats_shrunk; struct percpu_counter es_stats_cache_hits; struct percpu_counter es_stats_cache_misses; u64 es_stats_scan_time; u64 es_stats_max_scan_time; struct percpu_counter es_stats_all_cnt; struct percpu_counter es_stats_shk_cnt; }; /* * Pending cluster reservations for bigalloc file systems * * A cluster with a pending reservation is a logical cluster shared by at * least one extent in the extents status tree with delayed and unwritten * status and at least one other written or unwritten extent. The * reservation is said to be pending because a cluster reservation would * have to be taken in the event all blocks in the cluster shared with * written or unwritten extents were deleted while the delayed and * unwritten blocks remained. * * The set of pending cluster reservations is an auxiliary data structure * used with the extents status tree to implement reserved cluster/block * accounting for bigalloc file systems. The set is kept in memory and * records all pending cluster reservations. * * Its primary function is to avoid the need to read extents from the * disk when invalidating pages as a result of a truncate, punch hole, or * collapse range operation. Page invalidation requires a decrease in the * reserved cluster count if it results in the removal of all delayed * and unwritten extents (blocks) from a cluster that is not shared with a * written or unwritten extent, and no decrease otherwise. Determining * whether the cluster is shared can be done by searching for a pending * reservation on it. * * Secondarily, it provides a potentially faster method for determining * whether the reserved cluster count should be increased when a physical * cluster is deallocated as a result of a truncate, punch hole, or * collapse range operation. The necessary information is also present * in the extents status tree, but might be more rapidly accessed in * the pending reservation set in many cases due to smaller size. * * The pending cluster reservation set is implemented as a red-black tree * with the goal of minimizing per page search time overhead. */ struct pending_reservation { struct rb_node rb_node; ext4_lblk_t lclu; }; struct ext4_pending_tree { struct rb_root root; }; extern int __init ext4_init_es(void); extern void ext4_exit_es(void); extern void ext4_es_init_tree(struct ext4_es_tree *tree); extern void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, unsigned int status); extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, unsigned int status); extern void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); extern void ext4_es_find_extent_range(struct inode *inode, int (*match_fn)(struct extent_status *es), ext4_lblk_t lblk, ext4_lblk_t end, struct extent_status *es); extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t *next_lblk, struct extent_status *es); extern bool ext4_es_scan_range(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t lblk, ext4_lblk_t end); extern bool ext4_es_scan_clu(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t lblk); static inline unsigned int ext4_es_status(struct extent_status *es) { return es->es_pblk >> ES_SHIFT; } static inline unsigned int ext4_es_type(struct extent_status *es) { return (es->es_pblk & ES_TYPE_MASK) >> ES_SHIFT; } static inline int ext4_es_is_written(struct extent_status *es) { return (ext4_es_type(es) & EXTENT_STATUS_WRITTEN) != 0; } static inline int ext4_es_is_unwritten(struct extent_status *es) { return (ext4_es_type(es) & EXTENT_STATUS_UNWRITTEN) != 0; } static inline int ext4_es_is_delayed(struct extent_status *es) { return (ext4_es_type(es) & EXTENT_STATUS_DELAYED) != 0; } static inline int ext4_es_is_hole(struct extent_status *es) { return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0; } static inline int ext4_es_is_mapped(struct extent_status *es) { return (ext4_es_is_written(es) || ext4_es_is_unwritten(es)); } static inline int ext4_es_is_delonly(struct extent_status *es) { return (ext4_es_is_delayed(es) && !ext4_es_is_unwritten(es)); } static inline void ext4_es_set_referenced(struct extent_status *es) { es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT; } static inline void ext4_es_clear_referenced(struct extent_status *es) { es->es_pblk &= ~(((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT); } static inline int ext4_es_is_referenced(struct extent_status *es) { return (ext4_es_status(es) & EXTENT_STATUS_REFERENCED) != 0; } static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) { return es->es_pblk & ~ES_MASK; } static inline ext4_fsblk_t ext4_es_show_pblock(struct extent_status *es) { ext4_fsblk_t pblock = ext4_es_pblock(es); return pblock == ~ES_MASK ? 0 : pblock; } static inline void ext4_es_store_pblock(struct extent_status *es, ext4_fsblk_t pb) { ext4_fsblk_t block; block = (pb & ~ES_MASK) | (es->es_pblk & ES_MASK); es->es_pblk = block; } static inline void ext4_es_store_status(struct extent_status *es, unsigned int status) { es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | (es->es_pblk & ~ES_MASK); } static inline void ext4_es_store_pblock_status(struct extent_status *es, ext4_fsblk_t pb, unsigned int status) { es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | (pb & ~ES_MASK); } extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v); extern int __init ext4_init_pending(void); extern void ext4_exit_pending(void); extern void ext4_init_pending_tree(struct ext4_pending_tree *tree); extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk); extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk); extern void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, bool allocated); extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); extern void ext4_clear_inode_es(struct inode *inode); #endif /* _EXT4_EXTENTS_STATUS_H */
5402 2319 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MNT_IDMAPPING_H #define _LINUX_MNT_IDMAPPING_H #include <linux/types.h> #include <linux/uidgid.h> struct mnt_idmap; struct user_namespace; extern struct mnt_idmap nop_mnt_idmap; extern struct user_namespace init_user_ns; typedef struct { uid_t val; } vfsuid_t; typedef struct { gid_t val; } vfsgid_t; static_assert(sizeof(vfsuid_t) == sizeof(kuid_t)); static_assert(sizeof(vfsgid_t) == sizeof(kgid_t)); static_assert(offsetof(vfsuid_t, val) == offsetof(kuid_t, val)); static_assert(offsetof(vfsgid_t, val) == offsetof(kgid_t, val)); #ifdef CONFIG_MULTIUSER static inline uid_t __vfsuid_val(vfsuid_t uid) { return uid.val; } static inline gid_t __vfsgid_val(vfsgid_t gid) { return gid.val; } #else static inline uid_t __vfsuid_val(vfsuid_t uid) { return 0; } static inline gid_t __vfsgid_val(vfsgid_t gid) { return 0; } #endif static inline bool vfsuid_valid(vfsuid_t uid) { return __vfsuid_val(uid) != (uid_t)-1; } static inline bool vfsgid_valid(vfsgid_t gid) { return __vfsgid_val(gid) != (gid_t)-1; } static inline bool vfsuid_eq(vfsuid_t left, vfsuid_t right) { return vfsuid_valid(left) && __vfsuid_val(left) == __vfsuid_val(right); } static inline bool vfsgid_eq(vfsgid_t left, vfsgid_t right) { return vfsgid_valid(left) && __vfsgid_val(left) == __vfsgid_val(right); } /** * vfsuid_eq_kuid - check whether kuid and vfsuid have the same value * @vfsuid: the vfsuid to compare * @kuid: the kuid to compare * * Check whether @vfsuid and @kuid have the same values. * * Return: true if @vfsuid and @kuid have the same value, false if not. * Comparison between two invalid uids returns false. */ static inline bool vfsuid_eq_kuid(vfsuid_t vfsuid, kuid_t kuid) { return vfsuid_valid(vfsuid) && __vfsuid_val(vfsuid) == __kuid_val(kuid); } /** * vfsgid_eq_kgid - check whether kgid and vfsgid have the same value * @vfsgid: the vfsgid to compare * @kgid: the kgid to compare * * Check whether @vfsgid and @kgid have the same values. * * Return: true if @vfsgid and @kgid have the same value, false if not. * Comparison between two invalid gids returns false. */ static inline bool vfsgid_eq_kgid(vfsgid_t vfsgid, kgid_t kgid) { return vfsgid_valid(vfsgid) && __vfsgid_val(vfsgid) == __kgid_val(kgid); } /* * vfs{g,u}ids are created from k{g,u}ids. * We don't allow them to be created from regular {u,g}id. */ #define VFSUIDT_INIT(val) (vfsuid_t){ __kuid_val(val) } #define VFSGIDT_INIT(val) (vfsgid_t){ __kgid_val(val) } #define INVALID_VFSUID VFSUIDT_INIT(INVALID_UID) #define INVALID_VFSGID VFSGIDT_INIT(INVALID_GID) /* * Allow a vfs{g,u}id to be used as a k{g,u}id where we want to compare * whether the mapped value is identical to value of a k{g,u}id. */ #define AS_KUIDT(val) (kuid_t){ __vfsuid_val(val) } #define AS_KGIDT(val) (kgid_t){ __vfsgid_val(val) } int vfsgid_in_group_p(vfsgid_t vfsgid); vfsuid_t make_vfsuid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, kuid_t kuid); vfsgid_t make_vfsgid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, kgid_t kgid); kuid_t from_vfsuid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, vfsuid_t vfsuid); kgid_t from_vfsgid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, vfsgid_t vfsgid); /** * vfsuid_has_fsmapping - check whether a vfsuid maps into the filesystem * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * @vfsuid: vfsuid to be mapped * * Check whether @vfsuid has a mapping in the filesystem idmapping. Use this * function to check whether the filesystem idmapping has a mapping for * @vfsuid. * * Return: true if @vfsuid has a mapping in the filesystem, false if not. */ static inline bool vfsuid_has_fsmapping(struct mnt_idmap *idmap, struct user_namespace *fs_userns, vfsuid_t vfsuid) { return uid_valid(from_vfsuid(idmap, fs_userns, vfsuid)); } static inline bool vfsuid_has_mapping(struct user_namespace *userns, vfsuid_t vfsuid) { return from_kuid(userns, AS_KUIDT(vfsuid)) != (uid_t)-1; } /** * vfsuid_into_kuid - convert vfsuid into kuid * @vfsuid: the vfsuid to convert * * This can be used when a vfsuid is committed as a kuid. * * Return: a kuid with the value of @vfsuid */ static inline kuid_t vfsuid_into_kuid(vfsuid_t vfsuid) { return AS_KUIDT(vfsuid); } /** * vfsgid_has_fsmapping - check whether a vfsgid maps into the filesystem * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * @vfsgid: vfsgid to be mapped * * Check whether @vfsgid has a mapping in the filesystem idmapping. Use this * function to check whether the filesystem idmapping has a mapping for * @vfsgid. * * Return: true if @vfsgid has a mapping in the filesystem, false if not. */ static inline bool vfsgid_has_fsmapping(struct mnt_idmap *idmap, struct user_namespace *fs_userns, vfsgid_t vfsgid) { return gid_valid(from_vfsgid(idmap, fs_userns, vfsgid)); } static inline bool vfsgid_has_mapping(struct user_namespace *userns, vfsgid_t vfsgid) { return from_kgid(userns, AS_KGIDT(vfsgid)) != (gid_t)-1; } /** * vfsgid_into_kgid - convert vfsgid into kgid * @vfsgid: the vfsgid to convert * * This can be used when a vfsgid is committed as a kgid. * * Return: a kgid with the value of @vfsgid */ static inline kgid_t vfsgid_into_kgid(vfsgid_t vfsgid) { return AS_KGIDT(vfsgid); } /** * mapped_fsuid - return caller's fsuid mapped according to an idmapping * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * * Use this helper to initialize a new vfs or filesystem object based on * the caller's fsuid. A common example is initializing the i_uid field of * a newly allocated inode triggered by a creation event such as mkdir or * O_CREAT. Other examples include the allocation of quotas for a specific * user. * * Return: the caller's current fsuid mapped up according to @idmap. */ static inline kuid_t mapped_fsuid(struct mnt_idmap *idmap, struct user_namespace *fs_userns) { return from_vfsuid(idmap, fs_userns, VFSUIDT_INIT(current_fsuid())); } /** * mapped_fsgid - return caller's fsgid mapped according to an idmapping * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * * Use this helper to initialize a new vfs or filesystem object based on * the caller's fsgid. A common example is initializing the i_gid field of * a newly allocated inode triggered by a creation event such as mkdir or * O_CREAT. Other examples include the allocation of quotas for a specific * user. * * Return: the caller's current fsgid mapped up according to @idmap. */ static inline kgid_t mapped_fsgid(struct mnt_idmap *idmap, struct user_namespace *fs_userns) { return from_vfsgid(idmap, fs_userns, VFSGIDT_INIT(current_fsgid())); } bool check_fsmapping(const struct mnt_idmap *idmap, const struct super_block *sb); #endif /* _LINUX_MNT_IDMAPPING_H */
329 329 329 5 5 5 329 275 335 335 335 335 335 335 335 329 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 // SPDX-License-Identifier: GPL-2.0-only /* * umh - the kernel usermode helper */ #include <linux/module.h> #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/binfmts.h> #include <linux/syscalls.h> #include <linux/unistd.h> #include <linux/kmod.h> #include <linux/slab.h> #include <linux/completion.h> #include <linux/cred.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/fs_struct.h> #include <linux/workqueue.h> #include <linux/security.h> #include <linux/mount.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/resource.h> #include <linux/notifier.h> #include <linux/suspend.h> #include <linux/rwsem.h> #include <linux/ptrace.h> #include <linux/async.h> #include <linux/uaccess.h> #include <linux/initrd.h> #include <linux/freezer.h> #include <trace/events/module.h> static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; static DEFINE_SPINLOCK(umh_sysctl_lock); static DECLARE_RWSEM(umhelper_sem); static void call_usermodehelper_freeinfo(struct subprocess_info *info) { if (info->cleanup) (*info->cleanup)(info); kfree(info); } static void umh_complete(struct subprocess_info *sub_info) { struct completion *comp = xchg(&sub_info->complete, NULL); /* * See call_usermodehelper_exec(). If xchg() returns NULL * we own sub_info, the UMH_KILLABLE caller has gone away * or the caller used UMH_NO_WAIT. */ if (comp) complete(comp); else call_usermodehelper_freeinfo(sub_info); } /* * This is the task which runs the usermode application */ static int call_usermodehelper_exec_async(void *data) { struct subprocess_info *sub_info = data; struct cred *new; int retval; spin_lock_irq(&current->sighand->siglock); flush_signal_handlers(current, 1); spin_unlock_irq(&current->sighand->siglock); /* * Initial kernel threads share ther FS with init, in order to * get the init root directory. But we've now created a new * thread that is going to execve a user process and has its own * 'struct fs_struct'. Reset umask to the default. */ current->fs->umask = 0022; /* * Our parent (unbound workqueue) runs with elevated scheduling * priority. Avoid propagating that into the userspace child. */ set_user_nice(current, 0); retval = -ENOMEM; new = prepare_kernel_cred(current); if (!new) goto out; spin_lock(&umh_sysctl_lock); new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); new->cap_inheritable = cap_intersect(usermodehelper_inheritable, new->cap_inheritable); spin_unlock(&umh_sysctl_lock); if (sub_info->init) { retval = sub_info->init(sub_info, new); if (retval) { abort_creds(new); goto out; } } commit_creds(new); wait_for_initramfs(); retval = kernel_execve(sub_info->path, (const char *const *)sub_info->argv, (const char *const *)sub_info->envp); out: sub_info->retval = retval; /* * call_usermodehelper_exec_sync() will call umh_complete * if UHM_WAIT_PROC. */ if (!(sub_info->wait & UMH_WAIT_PROC)) umh_complete(sub_info); if (!retval) return 0; do_exit(0); } /* Handles UMH_WAIT_PROC. */ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) { pid_t pid; /* If SIGCLD is ignored do_wait won't populate the status. */ kernel_sigaction(SIGCHLD, SIG_DFL); pid = user_mode_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD); if (pid < 0) sub_info->retval = pid; else kernel_wait(pid, &sub_info->retval); /* Restore default kernel sig handler */ kernel_sigaction(SIGCHLD, SIG_IGN); umh_complete(sub_info); } /* * We need to create the usermodehelper kernel thread from a task that is affine * to an optimized set of CPUs (or nohz housekeeping ones) such that they * inherit a widest affinity irrespective of call_usermodehelper() callers with * possibly reduced affinity (eg: per-cpu workqueues). We don't want * usermodehelper targets to contend a busy CPU. * * Unbound workqueues provide such wide affinity and allow to block on * UMH_WAIT_PROC requests without blocking pending request (up to some limit). * * Besides, workqueues provide the privilege level that caller might not have * to perform the usermodehelper request. * */ static void call_usermodehelper_exec_work(struct work_struct *work) { struct subprocess_info *sub_info = container_of(work, struct subprocess_info, work); if (sub_info->wait & UMH_WAIT_PROC) { call_usermodehelper_exec_sync(sub_info); } else { pid_t pid; /* * Use CLONE_PARENT to reparent it to kthreadd; we do not * want to pollute current->children, and we need a parent * that always ignores SIGCHLD to ensure auto-reaping. */ pid = user_mode_thread(call_usermodehelper_exec_async, sub_info, CLONE_PARENT | SIGCHLD); if (pid < 0) { sub_info->retval = pid; umh_complete(sub_info); } } } /* * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY * (used for preventing user land processes from being created after the user * land has been frozen during a system-wide hibernation or suspend operation). * Should always be manipulated under umhelper_sem acquired for write. */ static enum umh_disable_depth usermodehelper_disabled = UMH_DISABLED; /* Number of helpers running */ static atomic_t running_helpers = ATOMIC_INIT(0); /* * Wait queue head used by usermodehelper_disable() to wait for all running * helpers to finish. */ static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); /* * Used by usermodehelper_read_lock_wait() to wait for usermodehelper_disabled * to become 'false'. */ static DECLARE_WAIT_QUEUE_HEAD(usermodehelper_disabled_waitq); /* * Time to wait for running_helpers to become zero before the setting of * usermodehelper_disabled in usermodehelper_disable() fails */ #define RUNNING_HELPERS_TIMEOUT (5 * HZ) int usermodehelper_read_trylock(void) { DEFINE_WAIT(wait); int ret = 0; down_read(&umhelper_sem); for (;;) { prepare_to_wait(&usermodehelper_disabled_waitq, &wait, TASK_INTERRUPTIBLE); if (!usermodehelper_disabled) break; if (usermodehelper_disabled == UMH_DISABLED) ret = -EAGAIN; up_read(&umhelper_sem); if (ret) break; schedule(); try_to_freeze(); down_read(&umhelper_sem); } finish_wait(&usermodehelper_disabled_waitq, &wait); return ret; } EXPORT_SYMBOL_GPL(usermodehelper_read_trylock); long usermodehelper_read_lock_wait(long timeout) { DEFINE_WAIT(wait); if (timeout < 0) return -EINVAL; down_read(&umhelper_sem); for (;;) { prepare_to_wait(&usermodehelper_disabled_waitq, &wait, TASK_UNINTERRUPTIBLE); if (!usermodehelper_disabled) break; up_read(&umhelper_sem); timeout = schedule_timeout(timeout); if (!timeout) break; down_read(&umhelper_sem); } finish_wait(&usermodehelper_disabled_waitq, &wait); return timeout; } EXPORT_SYMBOL_GPL(usermodehelper_read_lock_wait); void usermodehelper_read_unlock(void) { up_read(&umhelper_sem); } EXPORT_SYMBOL_GPL(usermodehelper_read_unlock); /** * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled. * @depth: New value to assign to usermodehelper_disabled. * * Change the value of usermodehelper_disabled (under umhelper_sem locked for * writing) and wakeup tasks waiting for it to change. */ void __usermodehelper_set_disable_depth(enum umh_disable_depth depth) { down_write(&umhelper_sem); usermodehelper_disabled = depth; wake_up(&usermodehelper_disabled_waitq); up_write(&umhelper_sem); } /** * __usermodehelper_disable - Prevent new helpers from being started. * @depth: New value to assign to usermodehelper_disabled. * * Set usermodehelper_disabled to @depth and wait for running helpers to exit. */ int __usermodehelper_disable(enum umh_disable_depth depth) { long retval; if (!depth) return -EINVAL; down_write(&umhelper_sem); usermodehelper_disabled = depth; up_write(&umhelper_sem); /* * From now on call_usermodehelper_exec() won't start any new * helpers, so it is sufficient if running_helpers turns out to * be zero at one point (it may be increased later, but that * doesn't matter). */ retval = wait_event_timeout(running_helpers_waitq, atomic_read(&running_helpers) == 0, RUNNING_HELPERS_TIMEOUT); if (retval) return 0; __usermodehelper_set_disable_depth(UMH_ENABLED); return -EAGAIN; } static void helper_lock(void) { atomic_inc(&running_helpers); smp_mb__after_atomic(); } static void helper_unlock(void) { if (atomic_dec_and_test(&running_helpers)) wake_up(&running_helpers_waitq); } /** * call_usermodehelper_setup - prepare to call a usermode helper * @path: path to usermode executable * @argv: arg vector for process * @envp: environment for process * @gfp_mask: gfp mask for memory allocation * @init: an init function * @cleanup: a cleanup function * @data: arbitrary context sensitive data * * Returns either %NULL on allocation failure, or a subprocess_info * structure. This should be passed to call_usermodehelper_exec to * exec the process and free the structure. * * The init function is used to customize the helper process prior to * exec. A non-zero return code causes the process to error out, exit, * and return the failure to the calling process * * The cleanup function is just before the subprocess_info is about to * be freed. This can be used for freeing the argv and envp. The * Function must be runnable in either a process context or the * context in which call_usermodehelper_exec is called. */ struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv, char **envp, gfp_t gfp_mask, int (*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *info), void *data) { struct subprocess_info *sub_info; sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask); if (!sub_info) goto out; INIT_WORK(&sub_info->work, call_usermodehelper_exec_work); #ifdef CONFIG_STATIC_USERMODEHELPER sub_info->path = CONFIG_STATIC_USERMODEHELPER_PATH; #else sub_info->path = path; #endif sub_info->argv = argv; sub_info->envp = envp; sub_info->cleanup = cleanup; sub_info->init = init; sub_info->data = data; out: return sub_info; } EXPORT_SYMBOL(call_usermodehelper_setup); /** * call_usermodehelper_exec - start a usermode application * @sub_info: information about the subprocess * @wait: wait for the application to finish and return status. * when UMH_NO_WAIT don't wait at all, but you get no useful error back * when the program couldn't be exec'ed. This makes it safe to call * from interrupt context. * * Runs a user-space application. The application is started * asynchronously if wait is not set, and runs as a child of system workqueues. * (ie. it runs with full root capabilities and optimized affinity). * * Note: successful return value does not guarantee the helper was called at * all. You can't rely on sub_info->{init,cleanup} being called even for * UMH_WAIT_* wait modes as STATIC_USERMODEHELPER_PATH="" turns all helpers * into a successful no-op. */ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) { unsigned int state = TASK_UNINTERRUPTIBLE; DECLARE_COMPLETION_ONSTACK(done); int retval = 0; if (!sub_info->path) { call_usermodehelper_freeinfo(sub_info); return -EINVAL; } helper_lock(); if (usermodehelper_disabled) { retval = -EBUSY; goto out; } /* * If there is no binary for us to call, then just return and get out of * here. This allows us to set STATIC_USERMODEHELPER_PATH to "" and * disable all call_usermodehelper() calls. */ if (strlen(sub_info->path) == 0) goto out; /* * Set the completion pointer only if there is a waiter. * This makes it possible to use umh_complete to free * the data structure in case of UMH_NO_WAIT. */ sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done; sub_info->wait = wait; queue_work(system_unbound_wq, &sub_info->work); if (wait == UMH_NO_WAIT) /* task has freed sub_info */ goto unlock; if (wait & UMH_FREEZABLE) state |= TASK_FREEZABLE; if (wait & UMH_KILLABLE) { retval = wait_for_completion_state(&done, state | TASK_KILLABLE); if (!retval) goto wait_done; /* umh_complete() will see NULL and free sub_info */ if (xchg(&sub_info->complete, NULL)) goto unlock; /* * fallthrough; in case of -ERESTARTSYS now do uninterruptible * wait_for_completion_state(). Since umh_complete() shall call * complete() in a moment if xchg() above returned NULL, this * uninterruptible wait_for_completion_state() will not block * SIGKILL'ed processes for long. */ } wait_for_completion_state(&done, state); wait_done: retval = sub_info->retval; out: call_usermodehelper_freeinfo(sub_info); unlock: helper_unlock(); return retval; } EXPORT_SYMBOL(call_usermodehelper_exec); /** * call_usermodehelper() - prepare and start a usermode application * @path: path to usermode executable * @argv: arg vector for process * @envp: environment for process * @wait: wait for the application to finish and return status. * when UMH_NO_WAIT don't wait at all, but you get no useful error back * when the program couldn't be exec'ed. This makes it safe to call * from interrupt context. * * This function is the equivalent to use call_usermodehelper_setup() and * call_usermodehelper_exec(). */ int call_usermodehelper(const char *path, char **argv, char **envp, int wait) { struct subprocess_info *info; gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; info = call_usermodehelper_setup(path, argv, envp, gfp_mask, NULL, NULL, NULL); if (info == NULL) return -ENOMEM; return call_usermodehelper_exec(info, wait); } EXPORT_SYMBOL(call_usermodehelper); #if defined(CONFIG_SYSCTL) static int proc_cap_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; unsigned long cap_array[2]; kernel_cap_t new_cap, *cap; int err; if (write && (!capable(CAP_SETPCAP) || !capable(CAP_SYS_MODULE))) return -EPERM; /* * convert from the global kernel_cap_t to the ulong array to print to * userspace if this is a read. * * Legacy format: capabilities are exposed as two 32-bit values */ cap = table->data; spin_lock(&umh_sysctl_lock); cap_array[0] = (u32) cap->val; cap_array[1] = cap->val >> 32; spin_unlock(&umh_sysctl_lock); t = *table; t.data = &cap_array; /* * actually read or write and array of ulongs from userspace. Remember * these are least significant 32 bits first */ err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); if (err < 0) return err; new_cap.val = (u32)cap_array[0]; new_cap.val += (u64)cap_array[1] << 32; /* * Drop everything not in the new_cap (but don't add things) */ if (write) { spin_lock(&umh_sysctl_lock); *cap = cap_intersect(*cap, new_cap); spin_unlock(&umh_sysctl_lock); } return 0; } static struct ctl_table usermodehelper_table[] = { { .procname = "bset", .data = &usermodehelper_bset, .maxlen = 2 * sizeof(unsigned long), .mode = 0600, .proc_handler = proc_cap_handler, }, { .procname = "inheritable", .data = &usermodehelper_inheritable, .maxlen = 2 * sizeof(unsigned long), .mode = 0600, .proc_handler = proc_cap_handler, }, { } }; static int __init init_umh_sysctls(void) { register_sysctl_init("kernel/usermodehelper", usermodehelper_table); return 0; } early_initcall(init_umh_sysctls); #endif /* CONFIG_SYSCTL */
28 32 9 9 3 3 3 32 1 1 1 1 1 1 28 3 3 3 2 1 9 2 2 257 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 // SPDX-License-Identifier: GPL-2.0-only /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Implementation of the Transmission Control Protocol(TCP). * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * Corey Minyard <wf-rch!minyard@relay.EU.net> * Florian La Roche, <flla@stud.uni-sb.de> * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> * Linus Torvalds, <torvalds@cs.helsinki.fi> * Alan Cox, <gw4pts@gw4pts.ampr.org> * Matthew Dillon, <dillon@apollo.west.oic.com> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Jorge Cwik, <jorge@laser.satlink.net> */ #include <linux/module.h> #include <linux/gfp.h> #include <net/tcp.h> static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); u32 elapsed, start_ts, user_timeout; s32 remaining; start_ts = tcp_sk(sk)->retrans_stamp; user_timeout = READ_ONCE(icsk->icsk_user_timeout); if (!user_timeout) return icsk->icsk_rto; elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts; remaining = user_timeout - elapsed; if (remaining <= 0) return 1; /* user timeout has passed; fire ASAP */ return min_t(u32, icsk->icsk_rto, msecs_to_jiffies(remaining)); } u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when) { struct inet_connection_sock *icsk = inet_csk(sk); u32 remaining, user_timeout; s32 elapsed; user_timeout = READ_ONCE(icsk->icsk_user_timeout); if (!user_timeout || !icsk->icsk_probes_tstamp) return when; elapsed = tcp_jiffies32 - icsk->icsk_probes_tstamp; if (unlikely(elapsed < 0)) elapsed = 0; remaining = msecs_to_jiffies(user_timeout) - elapsed; remaining = max_t(u32, remaining, TCP_TIMEOUT_MIN); return min_t(u32, remaining, when); } /** * tcp_write_err() - close socket and save error info * @sk: The socket the error has appeared on. * * Returns: Nothing (void) */ static void tcp_write_err(struct sock *sk) { WRITE_ONCE(sk->sk_err, READ_ONCE(sk->sk_err_soft) ? : ETIMEDOUT); sk_error_report(sk); tcp_write_queue_purge(sk); tcp_done(sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT); } /** * tcp_out_of_resources() - Close socket if out of resources * @sk: pointer to current socket * @do_reset: send a last packet with reset flag * * Do not allow orphaned sockets to eat all our resources. * This is direct violation of TCP specs, but it is required * to prevent DoS attacks. It is called when a retransmission timeout * or zero probe timeout occurs on orphaned socket. * * Also close if our net namespace is exiting; in that case there is no * hope of ever communicating again since all netns interfaces are already * down (or about to be down), and we need to release our dst references, * which have been moved to the netns loopback interface, so the namespace * can finish exiting. This condition is only possible if we are a kernel * socket, as those do not hold references to the namespace. * * Criteria is still not confirmed experimentally and may change. * We kill the socket, if: * 1. If number of orphaned sockets exceeds an administratively configured * limit. * 2. If we have strong memory pressure. * 3. If our net namespace is exiting. */ static int tcp_out_of_resources(struct sock *sk, bool do_reset) { struct tcp_sock *tp = tcp_sk(sk); int shift = 0; /* If peer does not open window for long time, or did not transmit * anything for long time, penalize it. */ if ((s32)(tcp_jiffies32 - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset) shift++; /* If some dubious ICMP arrived, penalize even more. */ if (READ_ONCE(sk->sk_err_soft)) shift++; if (tcp_check_oom(sk, shift)) { /* Catch exceptional cases, when connection requires reset. * 1. Last segment was sent recently. */ if ((s32)(tcp_jiffies32 - tp->lsndtime) <= TCP_TIMEWAIT_LEN || /* 2. Window is closed. */ (!tp->snd_wnd && !tp->packets_out)) do_reset = true; if (do_reset) tcp_send_active_reset(sk, GFP_ATOMIC); tcp_done(sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); return 1; } if (!check_net(sock_net(sk))) { /* Not possible to send reset; just close */ tcp_done(sk); return 1; } return 0; } /** * tcp_orphan_retries() - Returns maximal number of retries on an orphaned socket * @sk: Pointer to the current socket. * @alive: bool, socket alive state */ static int tcp_orphan_retries(struct sock *sk, bool alive) { int retries = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_orphan_retries); /* May be zero. */ /* We know from an ICMP that something is wrong. */ if (READ_ONCE(sk->sk_err_soft) && !alive) retries = 0; /* However, if socket sent something recently, select some safe * number of retries. 8 corresponds to >100 seconds with minimal * RTO of 200msec. */ if (retries == 0 && alive) retries = 8; return retries; } static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) { const struct net *net = sock_net(sk); int mss; /* Black hole detection */ if (!READ_ONCE(net->ipv4.sysctl_tcp_mtu_probing)) return; if (!icsk->icsk_mtup.enabled) { icsk->icsk_mtup.enabled = 1; icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; } else { mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1; mss = min(READ_ONCE(net->ipv4.sysctl_tcp_base_mss), mss); mss = max(mss, READ_ONCE(net->ipv4.sysctl_tcp_mtu_probe_floor)); mss = max(mss, READ_ONCE(net->ipv4.sysctl_tcp_min_snd_mss)); icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); } tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } static unsigned int tcp_model_timeout(struct sock *sk, unsigned int boundary, unsigned int rto_base) { unsigned int linear_backoff_thresh, timeout; linear_backoff_thresh = ilog2(TCP_RTO_MAX / rto_base); if (boundary <= linear_backoff_thresh) timeout = ((2 << boundary) - 1) * rto_base; else timeout = ((2 << linear_backoff_thresh) - 1) * rto_base + (boundary - linear_backoff_thresh) * TCP_RTO_MAX; return jiffies_to_msecs(timeout); } /** * retransmits_timed_out() - returns true if this connection has timed out * @sk: The current socket * @boundary: max number of retransmissions * @timeout: A custom timeout value. * If set to 0 the default timeout is calculated and used. * Using TCP_RTO_MIN and the number of unsuccessful retransmits. * * The default "timeout" value this function can calculate and use * is equivalent to the timeout of a TCP Connection * after "boundary" unsuccessful, exponentially backed-off * retransmissions with an initial RTO of TCP_RTO_MIN. */ static bool retransmits_timed_out(struct sock *sk, unsigned int boundary, unsigned int timeout) { unsigned int start_ts; if (!inet_csk(sk)->icsk_retransmits) return false; start_ts = tcp_sk(sk)->retrans_stamp; if (likely(timeout == 0)) { unsigned int rto_base = TCP_RTO_MIN; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) rto_base = tcp_timeout_init(sk); timeout = tcp_model_timeout(sk, boundary, rto_base); } return (s32)(tcp_time_stamp(tcp_sk(sk)) - start_ts - timeout) >= 0; } /* A write timeout has occurred. Process the after effects. */ static int tcp_write_timeout(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); bool expired = false, do_reset; int retry_until, max_retransmits; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { if (icsk->icsk_retransmits) __dst_negative_advice(sk); /* Paired with WRITE_ONCE() in tcp_sock_set_syncnt() */ retry_until = READ_ONCE(icsk->icsk_syn_retries) ? : READ_ONCE(net->ipv4.sysctl_tcp_syn_retries); max_retransmits = retry_until; if (sk->sk_state == TCP_SYN_SENT) max_retransmits += READ_ONCE(net->ipv4.sysctl_tcp_syn_linear_timeouts); expired = icsk->icsk_retransmits >= max_retransmits; } else { if (retransmits_timed_out(sk, READ_ONCE(net->ipv4.sysctl_tcp_retries1), 0)) { /* Black hole detection */ tcp_mtu_probing(icsk, sk); __dst_negative_advice(sk); } retry_until = READ_ONCE(net->ipv4.sysctl_tcp_retries2); if (sock_flag(sk, SOCK_DEAD)) { const bool alive = icsk->icsk_rto < TCP_RTO_MAX; retry_until = tcp_orphan_retries(sk, alive); do_reset = alive || !retransmits_timed_out(sk, retry_until, 0); if (tcp_out_of_resources(sk, do_reset)) return 1; } } if (!expired) expired = retransmits_timed_out(sk, retry_until, READ_ONCE(icsk->icsk_user_timeout)); tcp_fastopen_active_detect_blackhole(sk, expired); if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG)) tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RTO_CB, icsk->icsk_retransmits, icsk->icsk_rto, (int)expired); if (expired) { /* Has it gone just too far? */ tcp_write_err(sk); return 1; } if (sk_rethink_txhash(sk)) { tp->timeout_rehash++; __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTREHASH); } return 0; } /* Called with BH disabled */ void tcp_delack_timer_handler(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) return; /* Handling the sack compression case */ if (tp->compressed_ack) { tcp_mstamp_refresh(tp); tcp_sack_compress_send_ack(sk); return; } if (!(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) return; if (time_after(icsk->icsk_ack.timeout, jiffies)) { sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); return; } icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER; if (inet_csk_ack_scheduled(sk)) { if (!inet_csk_in_pingpong_mode(sk)) { /* Delayed ACK missed: inflate ATO. */ icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto); } else { /* Delayed ACK missed: leave pingpong mode and * deflate ATO. */ inet_csk_exit_pingpong_mode(sk); icsk->icsk_ack.ato = TCP_ATO_MIN; } tcp_mstamp_refresh(tp); tcp_send_ack(sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS); } } /** * tcp_delack_timer() - The TCP delayed ACK timeout handler * @t: Pointer to the timer. (gets casted to struct sock *) * * This function gets (indirectly) called when the kernel timer for a TCP packet * of this socket expires. Calls tcp_delack_timer_handler() to do the actual work. * * Returns: Nothing (void) */ static void tcp_delack_timer(struct timer_list *t) { struct inet_connection_sock *icsk = from_timer(icsk, t, icsk_delack_timer); struct sock *sk = &icsk->icsk_inet.sk; bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { tcp_delack_timer_handler(sk); } else { __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); /* deleguate our work to tcp_release_cb() */ if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags)) sock_hold(sk); } bh_unlock_sock(sk); sock_put(sk); } static void tcp_probe_timer(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb = tcp_send_head(sk); struct tcp_sock *tp = tcp_sk(sk); int max_probes; if (tp->packets_out || !skb) { icsk->icsk_probes_out = 0; icsk->icsk_probes_tstamp = 0; return; } /* RFC 1122 4.2.2.17 requires the sender to stay open indefinitely as * long as the receiver continues to respond probes. We support this by * default and reset icsk_probes_out with incoming ACKs. But if the * socket is orphaned or the user specifies TCP_USER_TIMEOUT, we * kill the socket when the retry count and the time exceeds the * corresponding system limit. We also implement similar policy when * we use RTO to probe window in tcp_retransmit_timer(). */ if (!icsk->icsk_probes_tstamp) { icsk->icsk_probes_tstamp = tcp_jiffies32; } else { u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout); if (user_timeout && (s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp) >= msecs_to_jiffies(user_timeout)) goto abort; } max_probes = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retries2); if (sock_flag(sk, SOCK_DEAD)) { const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX; max_probes = tcp_orphan_retries(sk, alive); if (!alive && icsk->icsk_backoff >= max_probes) goto abort; if (tcp_out_of_resources(sk, true)) return; } if (icsk->icsk_probes_out >= max_probes) { abort: tcp_write_err(sk); } else { /* Only send another probe if we didn't close things up. */ tcp_send_probe0(sk); } } /* * Timer for Fast Open socket to retransmit SYNACK. Note that the * sk here is the child socket, not the parent (listener) socket. */ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int max_retries; req->rsk_ops->syn_ack_timeout(req); /* Add one more retry for fastopen. * Paired with WRITE_ONCE() in tcp_sock_set_syncnt() */ max_retries = READ_ONCE(icsk->icsk_syn_retries) ? : READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_synack_retries) + 1; if (req->num_timeout >= max_retries) { tcp_write_err(sk); return; } /* Lower cwnd after certain SYNACK timeout like tcp_init_transfer() */ if (icsk->icsk_retransmits == 1) tcp_enter_loss(sk); /* XXX (TFO) - Unlike regular SYN-ACK retransmit, we ignore error * returned from rtx_syn_ack() to make it more persistent like * regular retransmit because if the child socket has been accepted * it's not good to give up too easily. */ inet_rtx_syn_ack(sk, req); req->num_timeout++; icsk->icsk_retransmits++; if (!tp->retrans_stamp) tp->retrans_stamp = tcp_time_stamp(tp); inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, req->timeout << req->num_timeout, TCP_RTO_MAX); } static bool tcp_rtx_probe0_timed_out(const struct sock *sk, const struct sk_buff *skb) { const struct tcp_sock *tp = tcp_sk(sk); const int timeout = TCP_RTO_MAX * 2; u32 rcv_delta, rtx_delta; rcv_delta = inet_csk(sk)->icsk_timeout - tp->rcv_tstamp; if (rcv_delta <= timeout) return false; rtx_delta = (u32)msecs_to_jiffies(tcp_time_stamp(tp) - (tp->retrans_stamp ?: tcp_skb_timestamp(skb))); return rtx_delta > timeout; } /** * tcp_retransmit_timer() - The TCP retransmit timeout handler * @sk: Pointer to the current socket. * * This function gets called when the kernel timer for a TCP packet * of this socket expires. * * It handles retransmission, timer adjustment and other necessary measures. * * Returns: Nothing (void) */ void tcp_retransmit_timer(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct inet_connection_sock *icsk = inet_csk(sk); struct request_sock *req; struct sk_buff *skb; req = rcu_dereference_protected(tp->fastopen_rsk, lockdep_sock_is_held(sk)); if (req) { WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && sk->sk_state != TCP_FIN_WAIT1); tcp_fastopen_synack_timer(sk, req); /* Before we receive ACK to our SYN-ACK don't retransmit * anything else (e.g., data or FIN segments). */ return; } if (!tp->packets_out) return; skb = tcp_rtx_queue_head(sk); if (WARN_ON_ONCE(!skb)) return; tp->tlp_high_seq = 0; if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) && !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) { /* Receiver dastardly shrinks window. Our retransmits * become zero probes, but we should not timeout this * connection. If the socket is an orphan, time it out, * we cannot allow such beasts to hang infinitely. */ struct inet_sock *inet = inet_sk(sk); u32 rtx_delta; rtx_delta = tcp_time_stamp(tp) - (tp->retrans_stamp ?: tcp_skb_timestamp(skb)); if (sk->sk_family == AF_INET) { net_dbg_ratelimited("Probing zero-window on %pI4:%u/%u, seq=%u:%u, recv %ums ago, lasting %ums\n", &inet->inet_daddr, ntohs(inet->inet_dport), inet->inet_num, tp->snd_una, tp->snd_nxt, jiffies_to_msecs(jiffies - tp->rcv_tstamp), rtx_delta); } #if IS_ENABLED(CONFIG_IPV6) else if (sk->sk_family == AF_INET6) { net_dbg_ratelimited("Probing zero-window on %pI6:%u/%u, seq=%u:%u, recv %ums ago, lasting %ums\n", &sk->sk_v6_daddr, ntohs(inet->inet_dport), inet->inet_num, tp->snd_una, tp->snd_nxt, jiffies_to_msecs(jiffies - tp->rcv_tstamp), rtx_delta); } #endif if (tcp_rtx_probe0_timed_out(sk, skb)) { tcp_write_err(sk); goto out; } tcp_enter_loss(sk); tcp_retransmit_skb(sk, skb, 1); __sk_dst_reset(sk); goto out_reset_timer; } __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTS); if (tcp_write_timeout(sk)) goto out; if (icsk->icsk_retransmits == 0) { int mib_idx = 0; if (icsk->icsk_ca_state == TCP_CA_Recovery) { if (tcp_is_sack(tp)) mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL; else mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL; } else if (icsk->icsk_ca_state == TCP_CA_Loss) { mib_idx = LINUX_MIB_TCPLOSSFAILURES; } else if ((icsk->icsk_ca_state == TCP_CA_Disorder) || tp->sacked_out) { if (tcp_is_sack(tp)) mib_idx = LINUX_MIB_TCPSACKFAILURES; else mib_idx = LINUX_MIB_TCPRENOFAILURES; } if (mib_idx) __NET_INC_STATS(sock_net(sk), mib_idx); } tcp_enter_loss(sk); icsk->icsk_retransmits++; if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) { /* Retransmission failed because of local congestion, * Let senders fight for local resources conservatively. */ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, TCP_RESOURCE_PROBE_INTERVAL, TCP_RTO_MAX); goto out; } /* Increase the timeout each time we retransmit. Note that * we do not increase the rtt estimate. rto is initialized * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests * that doubling rto each time is the least we can get away with. * In KA9Q, Karn uses this for the first few times, and then * goes to quadratic. netBSD doubles, but only goes up to *64, * and clamps at 1 to 64 sec afterwards. Note that 120 sec is * defined in the protocol as the maximum possible RTT. I guess * we'll have to use something other than TCP to talk to the * University of Mars. * * PAWS allows us longer timeouts and large windows, so once * implemented ftp to mars will work nicely. We will have to fix * the 120 second clamps though! */ icsk->icsk_backoff++; out_reset_timer: /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is * used to reset timer, set to 0. Recalculate 'icsk_rto' as this * might be increased if the stream oscillates between thin and thick, * thus the old value might already be too high compared to the value * set by 'tcp_set_rto' in tcp_input.c which resets the rto without * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating * exponential backoff behaviour to avoid continue hammering * linear-timeout retransmissions into a black hole */ if (sk->sk_state == TCP_ESTABLISHED && (tp->thin_lto || READ_ONCE(net->ipv4.sysctl_tcp_thin_linear_timeouts)) && tcp_stream_is_thin(tp) && icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) { icsk->icsk_backoff = 0; icsk->icsk_rto = clamp(__tcp_set_rto(tp), tcp_rto_min(sk), TCP_RTO_MAX); } else if (sk->sk_state != TCP_SYN_SENT || icsk->icsk_backoff > READ_ONCE(net->ipv4.sysctl_tcp_syn_linear_timeouts)) { /* Use normal (exponential) backoff unless linear timeouts are * activated. */ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); } inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, tcp_clamp_rto_to_user_timeout(sk), TCP_RTO_MAX); if (retransmits_timed_out(sk, READ_ONCE(net->ipv4.sysctl_tcp_retries1) + 1, 0)) __sk_dst_reset(sk); out:; } /* Called with bottom-half processing disabled. Called by tcp_write_timer() */ void tcp_write_timer_handler(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); int event; if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) || !icsk->icsk_pending) return; if (time_after(icsk->icsk_timeout, jiffies)) { sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); return; } tcp_mstamp_refresh(tcp_sk(sk)); event = icsk->icsk_pending; switch (event) { case ICSK_TIME_REO_TIMEOUT: tcp_rack_reo_timeout(sk); break; case ICSK_TIME_LOSS_PROBE: tcp_send_loss_probe(sk); break; case ICSK_TIME_RETRANS: icsk->icsk_pending = 0; tcp_retransmit_timer(sk); break; case ICSK_TIME_PROBE0: icsk->icsk_pending = 0; tcp_probe_timer(sk); break; } } static void tcp_write_timer(struct timer_list *t) { struct inet_connection_sock *icsk = from_timer(icsk, t, icsk_retransmit_timer); struct sock *sk = &icsk->icsk_inet.sk; bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { tcp_write_timer_handler(sk); } else { /* delegate our work to tcp_release_cb() */ if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &sk->sk_tsq_flags)) sock_hold(sk); } bh_unlock_sock(sk); sock_put(sk); } void tcp_syn_ack_timeout(const struct request_sock *req) { struct net *net = read_pnet(&inet_rsk(req)->ireq_net); __NET_INC_STATS(net, LINUX_MIB_TCPTIMEOUTS); } EXPORT_SYMBOL(tcp_syn_ack_timeout); void tcp_set_keepalive(struct sock *sk, int val) { if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) return; if (val && !sock_flag(sk, SOCK_KEEPOPEN)) inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk))); else if (!val) inet_csk_delete_keepalive_timer(sk); } EXPORT_SYMBOL_GPL(tcp_set_keepalive); static void tcp_keepalive_timer (struct timer_list *t) { struct sock *sk = from_timer(sk, t, sk_timer); struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); u32 elapsed; /* Only process if socket is not in use. */ bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later. */ inet_csk_reset_keepalive_timer (sk, HZ/20); goto out; } if (sk->sk_state == TCP_LISTEN) { pr_err("Hmm... keepalive on a LISTEN ???\n"); goto out; } tcp_mstamp_refresh(tp); if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) { if (READ_ONCE(tp->linger2) >= 0) { const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN; if (tmo > 0) { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); goto out; } } tcp_send_active_reset(sk, GFP_ATOMIC); goto death; } if (!sock_flag(sk, SOCK_KEEPOPEN) || ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT))) goto out; elapsed = keepalive_time_when(tp); /* It is alive without keepalive 8) */ if (tp->packets_out || !tcp_write_queue_empty(sk)) goto resched; elapsed = keepalive_time_elapsed(tp); if (elapsed >= keepalive_time_when(tp)) { u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout); /* If the TCP_USER_TIMEOUT option is enabled, use that * to determine when to timeout instead. */ if ((user_timeout != 0 && elapsed >= msecs_to_jiffies(user_timeout) && icsk->icsk_probes_out > 0) || (user_timeout == 0 && icsk->icsk_probes_out >= keepalive_probes(tp))) { tcp_send_active_reset(sk, GFP_ATOMIC); tcp_write_err(sk); goto out; } if (tcp_write_wakeup(sk, LINUX_MIB_TCPKEEPALIVE) <= 0) { icsk->icsk_probes_out++; elapsed = keepalive_intvl_when(tp); } else { /* If keepalive was lost due to local congestion, * try harder. */ elapsed = TCP_RESOURCE_PROBE_INTERVAL; } } else { /* It is tp->rcv_tstamp + keepalive_time_when(tp) */ elapsed = keepalive_time_when(tp) - elapsed; } resched: inet_csk_reset_keepalive_timer (sk, elapsed); goto out; death: tcp_done(sk); out: bh_unlock_sock(sk); sock_put(sk); } static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer) { struct tcp_sock *tp = container_of(timer, struct tcp_sock, compressed_ack_timer); struct sock *sk = (struct sock *)tp; bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { if (tp->compressed_ack) { /* Since we have to send one ack finally, * subtract one from tp->compressed_ack to keep * LINUX_MIB_TCPACKCOMPRESSED accurate. */ tp->compressed_ack--; tcp_send_ack(sk); } } else { if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags)) sock_hold(sk); } bh_unlock_sock(sk); sock_put(sk); return HRTIMER_NORESTART; } void tcp_init_xmit_timers(struct sock *sk) { inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, &tcp_keepalive_timer); hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_SOFT); tcp_sk(sk)->pacing_timer.function = tcp_pace_kick; hrtimer_init(&tcp_sk(sk)->compressed_ack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED_SOFT); tcp_sk(sk)->compressed_ack_timer.function = tcp_compressed_ack_kick; }
26 1460 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_WAIT_BIT_H #define _LINUX_WAIT_BIT_H /* * Linux wait-bit related types and methods: */ #include <linux/wait.h> struct wait_bit_key { void *flags; int bit_nr; unsigned long timeout; }; struct wait_bit_queue_entry { struct wait_bit_key key; struct wait_queue_entry wq_entry; }; #define __WAIT_BIT_KEY_INITIALIZER(word, bit) \ { .flags = word, .bit_nr = bit, } typedef int wait_bit_action_f(struct wait_bit_key *key, int mode); void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit); int __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode); int __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode); void wake_up_bit(void *word, int bit); int out_of_line_wait_on_bit(void *word, int, wait_bit_action_f *action, unsigned int mode); int out_of_line_wait_on_bit_timeout(void *word, int, wait_bit_action_f *action, unsigned int mode, unsigned long timeout); int out_of_line_wait_on_bit_lock(void *word, int, wait_bit_action_f *action, unsigned int mode); struct wait_queue_head *bit_waitqueue(void *word, int bit); extern void __init wait_bit_init(void); int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key); #define DEFINE_WAIT_BIT(name, word, bit) \ struct wait_bit_queue_entry name = { \ .key = __WAIT_BIT_KEY_INITIALIZER(word, bit), \ .wq_entry = { \ .private = current, \ .func = wake_bit_function, \ .entry = \ LIST_HEAD_INIT((name).wq_entry.entry), \ }, \ } extern int bit_wait(struct wait_bit_key *key, int mode); extern int bit_wait_io(struct wait_bit_key *key, int mode); extern int bit_wait_timeout(struct wait_bit_key *key, int mode); extern int bit_wait_io_timeout(struct wait_bit_key *key, int mode); /** * wait_on_bit - wait for a bit to be cleared * @word: the word being waited on, a kernel virtual address * @bit: the bit of the word being waited on * @mode: the task state to sleep in * * There is a standard hashed waitqueue table for generic use. This * is the part of the hashtable's accessor API that waits on a bit. * For instance, if one were to have waiters on a bitflag, one would * call wait_on_bit() in threads waiting for the bit to clear. * One uses wait_on_bit() where one is waiting for the bit to clear, * but has no intention of setting it. * Returned value will be zero if the bit was cleared, or non-zero * if the process received a signal and the mode permitted wakeup * on that signal. */ static inline int wait_on_bit(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_bit_acquire(bit, word)) return 0; return out_of_line_wait_on_bit(word, bit, bit_wait, mode); } /** * wait_on_bit_io - wait for a bit to be cleared * @word: the word being waited on, a kernel virtual address * @bit: the bit of the word being waited on * @mode: the task state to sleep in * * Use the standard hashed waitqueue table to wait for a bit * to be cleared. This is similar to wait_on_bit(), but calls * io_schedule() instead of schedule() for the actual waiting. * * Returned value will be zero if the bit was cleared, or non-zero * if the process received a signal and the mode permitted wakeup * on that signal. */ static inline int wait_on_bit_io(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_bit_acquire(bit, word)) return 0; return out_of_line_wait_on_bit(word, bit, bit_wait_io, mode); } /** * wait_on_bit_timeout - wait for a bit to be cleared or a timeout elapses * @word: the word being waited on, a kernel virtual address * @bit: the bit of the word being waited on * @mode: the task state to sleep in * @timeout: timeout, in jiffies * * Use the standard hashed waitqueue table to wait for a bit * to be cleared. This is similar to wait_on_bit(), except also takes a * timeout parameter. * * Returned value will be zero if the bit was cleared before the * @timeout elapsed, or non-zero if the @timeout elapsed or process * received a signal and the mode permitted wakeup on that signal. */ static inline int wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode, unsigned long timeout) { might_sleep(); if (!test_bit_acquire(bit, word)) return 0; return out_of_line_wait_on_bit_timeout(word, bit, bit_wait_timeout, mode, timeout); } /** * wait_on_bit_action - wait for a bit to be cleared * @word: the word being waited on, a kernel virtual address * @bit: the bit of the word being waited on * @action: the function used to sleep, which may take special actions * @mode: the task state to sleep in * * Use the standard hashed waitqueue table to wait for a bit * to be cleared, and allow the waiting action to be specified. * This is like wait_on_bit() but allows fine control of how the waiting * is done. * * Returned value will be zero if the bit was cleared, or non-zero * if the process received a signal and the mode permitted wakeup * on that signal. */ static inline int wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action, unsigned mode) { might_sleep(); if (!test_bit_acquire(bit, word)) return 0; return out_of_line_wait_on_bit(word, bit, action, mode); } /** * wait_on_bit_lock - wait for a bit to be cleared, when wanting to set it * @word: the word being waited on, a kernel virtual address * @bit: the bit of the word being waited on * @mode: the task state to sleep in * * There is a standard hashed waitqueue table for generic use. This * is the part of the hashtable's accessor API that waits on a bit * when one intends to set it, for instance, trying to lock bitflags. * For instance, if one were to have waiters trying to set bitflag * and waiting for it to clear before setting it, one would call * wait_on_bit() in threads waiting to be able to set the bit. * One uses wait_on_bit_lock() where one is waiting for the bit to * clear with the intention of setting it, and when done, clearing it. * * Returns zero if the bit was (eventually) found to be clear and was * set. Returns non-zero if a signal was delivered to the process and * the @mode allows that signal to wake the process. */ static inline int wait_on_bit_lock(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_and_set_bit(bit, word)) return 0; return out_of_line_wait_on_bit_lock(word, bit, bit_wait, mode); } /** * wait_on_bit_lock_io - wait for a bit to be cleared, when wanting to set it * @word: the word being waited on, a kernel virtual address * @bit: the bit of the word being waited on * @mode: the task state to sleep in * * Use the standard hashed waitqueue table to wait for a bit * to be cleared and then to atomically set it. This is similar * to wait_on_bit(), but calls io_schedule() instead of schedule() * for the actual waiting. * * Returns zero if the bit was (eventually) found to be clear and was * set. Returns non-zero if a signal was delivered to the process and * the @mode allows that signal to wake the process. */ static inline int wait_on_bit_lock_io(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_and_set_bit(bit, word)) return 0; return out_of_line_wait_on_bit_lock(word, bit, bit_wait_io, mode); } /** * wait_on_bit_lock_action - wait for a bit to be cleared, when wanting to set it * @word: the word being waited on, a kernel virtual address * @bit: the bit of the word being waited on * @action: the function used to sleep, which may take special actions * @mode: the task state to sleep in * * Use the standard hashed waitqueue table to wait for a bit * to be cleared and then to set it, and allow the waiting action * to be specified. * This is like wait_on_bit() but allows fine control of how the waiting * is done. * * Returns zero if the bit was (eventually) found to be clear and was * set. Returns non-zero if a signal was delivered to the process and * the @mode allows that signal to wake the process. */ static inline int wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action, unsigned mode) { might_sleep(); if (!test_and_set_bit(bit, word)) return 0; return out_of_line_wait_on_bit_lock(word, bit, action, mode); } extern void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int flags); extern void wake_up_var(void *var); extern wait_queue_head_t *__var_waitqueue(void *p); #define ___wait_var_event(var, condition, state, exclusive, ret, cmd) \ ({ \ __label__ __out; \ struct wait_queue_head *__wq_head = __var_waitqueue(var); \ struct wait_bit_queue_entry __wbq_entry; \ long __ret = ret; /* explicit shadow */ \ \ init_wait_var_entry(&__wbq_entry, var, \ exclusive ? WQ_FLAG_EXCLUSIVE : 0); \ for (;;) { \ long __int = prepare_to_wait_event(__wq_head, \ &__wbq_entry.wq_entry, \ state); \ if (condition) \ break; \ \ if (___wait_is_interruptible(state) && __int) { \ __ret = __int; \ goto __out; \ } \ \ cmd; \ } \ finish_wait(__wq_head, &__wbq_entry.wq_entry); \ __out: __ret; \ }) #define __wait_var_event(var, condition) \ ___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ schedule()) #define wait_var_event(var, condition) \ do { \ might_sleep(); \ if (condition) \ break; \ __wait_var_event(var, condition); \ } while (0) #define __wait_var_event_killable(var, condition) \ ___wait_var_event(var, condition, TASK_KILLABLE, 0, 0, \ schedule()) #define wait_var_event_killable(var, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_var_event_killable(var, condition); \ __ret; \ }) #define __wait_var_event_timeout(var, condition, timeout) \ ___wait_var_event(var, ___wait_cond_timeout(condition), \ TASK_UNINTERRUPTIBLE, 0, timeout, \ __ret = schedule_timeout(__ret)) #define wait_var_event_timeout(var, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_var_event_timeout(var, condition, timeout); \ __ret; \ }) #define __wait_var_event_interruptible(var, condition) \ ___wait_var_event(var, condition, TASK_INTERRUPTIBLE, 0, 0, \ schedule()) #define wait_var_event_interruptible(var, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_var_event_interruptible(var, condition); \ __ret; \ }) /** * clear_and_wake_up_bit - clear a bit and wake up anyone waiting on that bit * * @bit: the bit of the word being waited on * @word: the word being waited on, a kernel virtual address * * You can use this helper if bitflags are manipulated atomically rather than * non-atomically under a lock. */ static inline void clear_and_wake_up_bit(int bit, void *word) { clear_bit_unlock(bit, word); /* See wake_up_bit() for which memory barrier you need to use. */ smp_mb__after_atomic(); wake_up_bit(word, bit); } #endif /* _LINUX_WAIT_BIT_H */
484 489 389 29 25 153 188 185 25 26 39 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 /* * net/tipc/msg.h: Include file for TIPC message header routines * * Copyright (c) 2000-2007, 2014-2017 Ericsson AB * Copyright (c) 2005-2008, 2010-2011, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef _TIPC_MSG_H #define _TIPC_MSG_H #include <linux/tipc.h> #include "core.h" /* * Constants and routines used to read and write TIPC payload message headers * * Note: Some items are also used with TIPC internal message headers */ #define TIPC_VERSION 2 struct plist; /* * Payload message users are defined in TIPC's public API: * - TIPC_LOW_IMPORTANCE * - TIPC_MEDIUM_IMPORTANCE * - TIPC_HIGH_IMPORTANCE * - TIPC_CRITICAL_IMPORTANCE */ #define TIPC_SYSTEM_IMPORTANCE 4 /* * Payload message types */ #define TIPC_CONN_MSG 0 #define TIPC_MCAST_MSG 1 #define TIPC_NAMED_MSG 2 #define TIPC_DIRECT_MSG 3 #define TIPC_GRP_MEMBER_EVT 4 #define TIPC_GRP_BCAST_MSG 5 #define TIPC_GRP_MCAST_MSG 6 #define TIPC_GRP_UCAST_MSG 7 /* * Internal message users */ #define BCAST_PROTOCOL 5 #define MSG_BUNDLER 6 #define LINK_PROTOCOL 7 #define CONN_MANAGER 8 #define GROUP_PROTOCOL 9 #define TUNNEL_PROTOCOL 10 #define NAME_DISTRIBUTOR 11 #define MSG_FRAGMENTER 12 #define LINK_CONFIG 13 #define MSG_CRYPTO 14 #define SOCK_WAKEUP 14 /* pseudo user */ #define TOP_SRV 15 /* pseudo user */ /* * Message header sizes */ #define SHORT_H_SIZE 24 /* In-cluster basic payload message */ #define BASIC_H_SIZE 32 /* Basic payload message */ #define NAMED_H_SIZE 40 /* Named payload message */ #define MCAST_H_SIZE 44 /* Multicast payload message */ #define GROUP_H_SIZE 44 /* Group payload message */ #define INT_H_SIZE 40 /* Internal messages */ #define MIN_H_SIZE 24 /* Smallest legal TIPC header size */ #define MAX_H_SIZE 60 /* Largest possible TIPC header size */ #define MAX_MSG_SIZE (MAX_H_SIZE + TIPC_MAX_USER_MSG_SIZE) #define TIPC_MEDIA_INFO_OFFSET 5 extern const int one_page_mtu; struct tipc_skb_cb { union { struct { struct sk_buff *tail; unsigned long nxt_retr; unsigned long retr_stamp; u32 bytes_read; u32 orig_member; u16 chain_imp; u16 ackers; u16 retr_cnt; } __packed; #ifdef CONFIG_TIPC_CRYPTO struct { struct tipc_crypto *rx; struct tipc_aead *last; u8 recurs; } tx_clone_ctx __packed; #endif } __packed; union { struct { u8 validated:1; #ifdef CONFIG_TIPC_CRYPTO u8 encrypted:1; u8 decrypted:1; #define SKB_PROBING 1 #define SKB_GRACING 2 u8 xmit_type:2; u8 tx_clone_deferred:1; #endif }; u8 flags; }; u8 reserved; #ifdef CONFIG_TIPC_CRYPTO void *crypto_ctx; #endif } __packed; #define TIPC_SKB_CB(__skb) ((struct tipc_skb_cb *)&((__skb)->cb[0])) struct tipc_msg { __be32 hdr[15]; }; /* struct tipc_gap_ack - TIPC Gap ACK block * @ack: seqno of the last consecutive packet in link deferdq * @gap: number of gap packets since the last ack * * E.g: * link deferdq: 1 2 3 4 10 11 13 14 15 20 * --> Gap ACK blocks: <4, 5>, <11, 1>, <15, 4>, <20, 0> */ struct tipc_gap_ack { __be16 ack; __be16 gap; }; /* struct tipc_gap_ack_blks * @len: actual length of the record * @ugack_cnt: number of Gap ACK blocks for unicast (following the broadcast * ones) * @start_index: starting index for "valid" broadcast Gap ACK blocks * @bgack_cnt: number of Gap ACK blocks for broadcast in the record * @gacks: array of Gap ACK blocks * * 31 16 15 0 * +-------------+-------------+-------------+-------------+ * | bgack_cnt | ugack_cnt | len | * +-------------+-------------+-------------+-------------+ - * | gap | ack | | * +-------------+-------------+-------------+-------------+ > bc gacks * : : : | * +-------------+-------------+-------------+-------------+ - * | gap | ack | | * +-------------+-------------+-------------+-------------+ > uc gacks * : : : | * +-------------+-------------+-------------+-------------+ - */ struct tipc_gap_ack_blks { __be16 len; union { u8 ugack_cnt; u8 start_index; }; u8 bgack_cnt; struct tipc_gap_ack gacks[]; }; #define MAX_GAP_ACK_BLKS 128 #define MAX_GAP_ACK_BLKS_SZ (sizeof(struct tipc_gap_ack_blks) + \ sizeof(struct tipc_gap_ack) * MAX_GAP_ACK_BLKS) static inline struct tipc_msg *buf_msg(struct sk_buff *skb) { return (struct tipc_msg *)skb->data; } static inline u32 msg_word(struct tipc_msg *m, u32 pos) { return ntohl(m->hdr[pos]); } static inline void msg_set_word(struct tipc_msg *m, u32 w, u32 val) { m->hdr[w] = htonl(val); } static inline u32 msg_bits(struct tipc_msg *m, u32 w, u32 pos, u32 mask) { return (msg_word(m, w) >> pos) & mask; } static inline void msg_set_bits(struct tipc_msg *m, u32 w, u32 pos, u32 mask, u32 val) { val = (val & mask) << pos; mask = mask << pos; m->hdr[w] &= ~htonl(mask); m->hdr[w] |= htonl(val); } /* * Word 0 */ static inline u32 msg_version(struct tipc_msg *m) { return msg_bits(m, 0, 29, 7); } static inline void msg_set_version(struct tipc_msg *m) { msg_set_bits(m, 0, 29, 7, TIPC_VERSION); } static inline u32 msg_user(struct tipc_msg *m) { return msg_bits(m, 0, 25, 0xf); } static inline u32 msg_isdata(struct tipc_msg *m) { return msg_user(m) <= TIPC_CRITICAL_IMPORTANCE; } static inline void msg_set_user(struct tipc_msg *m, u32 n) { msg_set_bits(m, 0, 25, 0xf, n); } static inline u32 msg_hdr_sz(struct tipc_msg *m) { return msg_bits(m, 0, 21, 0xf) << 2; } static inline void msg_set_hdr_sz(struct tipc_msg *m, u32 n) { msg_set_bits(m, 0, 21, 0xf, n>>2); } static inline u32 msg_size(struct tipc_msg *m) { return msg_bits(m, 0, 0, 0x1ffff); } static inline u32 msg_blocks(struct tipc_msg *m) { return (msg_size(m) / 1024) + 1; } static inline u32 msg_data_sz(struct tipc_msg *m) { return msg_size(m) - msg_hdr_sz(m); } static inline int msg_non_seq(struct tipc_msg *m) { return msg_bits(m, 0, 20, 1); } static inline void msg_set_non_seq(struct tipc_msg *m, u32 n) { msg_set_bits(m, 0, 20, 1, n); } static inline int msg_is_syn(struct tipc_msg *m) { return msg_bits(m, 0, 17, 1); } static inline void msg_set_syn(struct tipc_msg *m, u32 d) { msg_set_bits(m, 0, 17, 1, d); } static inline int msg_dest_droppable(struct tipc_msg *m) { return msg_bits(m, 0, 19, 1); } static inline void msg_set_dest_droppable(struct tipc_msg *m, u32 d) { msg_set_bits(m, 0, 19, 1, d); } static inline int msg_is_keepalive(struct tipc_msg *m) { return msg_bits(m, 0, 19, 1); } static inline void msg_set_is_keepalive(struct tipc_msg *m, u32 d) { msg_set_bits(m, 0, 19, 1, d); } static inline int msg_src_droppable(struct tipc_msg *m) { return msg_bits(m, 0, 18, 1); } static inline void msg_set_src_droppable(struct tipc_msg *m, u32 d) { msg_set_bits(m, 0, 18, 1, d); } static inline int msg_ack_required(struct tipc_msg *m) { return msg_bits(m, 0, 18, 1); } static inline void msg_set_ack_required(struct tipc_msg *m) { msg_set_bits(m, 0, 18, 1, 1); } static inline int msg_nagle_ack(struct tipc_msg *m) { return msg_bits(m, 0, 18, 1); } static inline void msg_set_nagle_ack(struct tipc_msg *m) { msg_set_bits(m, 0, 18, 1, 1); } static inline bool msg_is_rcast(struct tipc_msg *m) { return msg_bits(m, 0, 18, 0x1); } static inline void msg_set_is_rcast(struct tipc_msg *m, bool d) { msg_set_bits(m, 0, 18, 0x1, d); } static inline void msg_set_size(struct tipc_msg *m, u32 sz) { m->hdr[0] = htonl((msg_word(m, 0) & ~0x1ffff) | sz); } static inline unchar *msg_data(struct tipc_msg *m) { return ((unchar *)m) + msg_hdr_sz(m); } static inline struct tipc_msg *msg_inner_hdr(struct tipc_msg *m) { return (struct tipc_msg *)msg_data(m); } /* * Word 1 */ static inline u32 msg_type(struct tipc_msg *m) { return msg_bits(m, 1, 29, 0x7); } static inline void msg_set_type(struct tipc_msg *m, u32 n) { msg_set_bits(m, 1, 29, 0x7, n); } static inline int msg_in_group(struct tipc_msg *m) { int mtyp = msg_type(m); return mtyp >= TIPC_GRP_MEMBER_EVT && mtyp <= TIPC_GRP_UCAST_MSG; } static inline bool msg_is_grp_evt(struct tipc_msg *m) { return msg_type(m) == TIPC_GRP_MEMBER_EVT; } static inline u32 msg_named(struct tipc_msg *m) { return msg_type(m) == TIPC_NAMED_MSG; } static inline u32 msg_mcast(struct tipc_msg *m) { int mtyp = msg_type(m); return ((mtyp == TIPC_MCAST_MSG) || (mtyp == TIPC_GRP_BCAST_MSG) || (mtyp == TIPC_GRP_MCAST_MSG)); } static inline u32 msg_connected(struct tipc_msg *m) { return msg_type(m) == TIPC_CONN_MSG; } static inline u32 msg_direct(struct tipc_msg *m) { return msg_type(m) == TIPC_DIRECT_MSG; } static inline u32 msg_errcode(struct tipc_msg *m) { return msg_bits(m, 1, 25, 0xf); } static inline void msg_set_errcode(struct tipc_msg *m, u32 err) { msg_set_bits(m, 1, 25, 0xf, err); } static inline void msg_set_bulk(struct tipc_msg *m) { msg_set_bits(m, 1, 28, 0x1, 1); } static inline u32 msg_is_bulk(struct tipc_msg *m) { return msg_bits(m, 1, 28, 0x1); } static inline void msg_set_last_bulk(struct tipc_msg *m) { msg_set_bits(m, 1, 27, 0x1, 1); } static inline u32 msg_is_last_bulk(struct tipc_msg *m) { return msg_bits(m, 1, 27, 0x1); } static inline void msg_set_non_legacy(struct tipc_msg *m) { msg_set_bits(m, 1, 26, 0x1, 1); } static inline u32 msg_is_legacy(struct tipc_msg *m) { return !msg_bits(m, 1, 26, 0x1); } static inline u32 msg_reroute_cnt(struct tipc_msg *m) { return msg_bits(m, 1, 21, 0xf); } static inline void msg_incr_reroute_cnt(struct tipc_msg *m) { msg_set_bits(m, 1, 21, 0xf, msg_reroute_cnt(m) + 1); } static inline u32 msg_lookup_scope(struct tipc_msg *m) { return msg_bits(m, 1, 19, 0x3); } static inline void msg_set_lookup_scope(struct tipc_msg *m, u32 n) { msg_set_bits(m, 1, 19, 0x3, n); } static inline u16 msg_bcast_ack(struct tipc_msg *m) { return msg_bits(m, 1, 0, 0xffff); } static inline void msg_set_bcast_ack(struct tipc_msg *m, u16 n) { msg_set_bits(m, 1, 0, 0xffff, n); } /* Note: reusing bits in word 1 for ACTIVATE_MSG only, to re-synch * link peer session number */ static inline bool msg_dest_session_valid(struct tipc_msg *m) { return msg_bits(m, 1, 16, 0x1); } static inline void msg_set_dest_session_valid(struct tipc_msg *m, bool valid) { msg_set_bits(m, 1, 16, 0x1, valid); } static inline u16 msg_dest_session(struct tipc_msg *m) { return msg_bits(m, 1, 0, 0xffff); } static inline void msg_set_dest_session(struct tipc_msg *m, u16 n) { msg_set_bits(m, 1, 0, 0xffff, n); } /* * Word 2 */ static inline u16 msg_ack(struct tipc_msg *m) { return msg_bits(m, 2, 16, 0xffff); } static inline void msg_set_ack(struct tipc_msg *m, u16 n) { msg_set_bits(m, 2, 16, 0xffff, n); } static inline u16 msg_seqno(struct tipc_msg *m) { return msg_bits(m, 2, 0, 0xffff); } static inline void msg_set_seqno(struct tipc_msg *m, u16 n) { msg_set_bits(m, 2, 0, 0xffff, n); } /* * Words 3-10 */ static inline u32 msg_importance(struct tipc_msg *m) { int usr = msg_user(m); if (likely((usr <= TIPC_CRITICAL_IMPORTANCE) && !msg_errcode(m))) return usr; if ((usr == MSG_FRAGMENTER) || (usr == MSG_BUNDLER)) return msg_bits(m, 9, 0, 0x7); return TIPC_SYSTEM_IMPORTANCE; } static inline void msg_set_importance(struct tipc_msg *m, u32 i) { int usr = msg_user(m); if (likely((usr == MSG_FRAGMENTER) || (usr == MSG_BUNDLER))) msg_set_bits(m, 9, 0, 0x7, i); else if (i < TIPC_SYSTEM_IMPORTANCE) msg_set_user(m, i); else pr_warn("Trying to set illegal importance in message\n"); } static inline u32 msg_prevnode(struct tipc_msg *m) { return msg_word(m, 3); } static inline void msg_set_prevnode(struct tipc_msg *m, u32 a) { msg_set_word(m, 3, a); } static inline u32 msg_origport(struct tipc_msg *m) { if (msg_user(m) == MSG_FRAGMENTER) m = msg_inner_hdr(m); return msg_word(m, 4); } static inline void msg_set_origport(struct tipc_msg *m, u32 p) { msg_set_word(m, 4, p); } static inline u16 msg_named_seqno(struct tipc_msg *m) { return msg_bits(m, 4, 0, 0xffff); } static inline void msg_set_named_seqno(struct tipc_msg *m, u16 n) { msg_set_bits(m, 4, 0, 0xffff, n); } static inline u32 msg_destport(struct tipc_msg *m) { return msg_word(m, 5); } static inline void msg_set_destport(struct tipc_msg *m, u32 p) { msg_set_word(m, 5, p); } static inline u32 msg_mc_netid(struct tipc_msg *m) { return msg_word(m, 5); } static inline void msg_set_mc_netid(struct tipc_msg *m, u32 p) { msg_set_word(m, 5, p); } static inline int msg_short(struct tipc_msg *m) { return msg_hdr_sz(m) == SHORT_H_SIZE; } static inline u32 msg_orignode(struct tipc_msg *m) { if (likely(msg_short(m))) return msg_prevnode(m); return msg_word(m, 6); } static inline void msg_set_orignode(struct tipc_msg *m, u32 a) { msg_set_word(m, 6, a); } static inline u32 msg_destnode(struct tipc_msg *m) { return msg_word(m, 7); } static inline void msg_set_destnode(struct tipc_msg *m, u32 a) { msg_set_word(m, 7, a); } static inline u32 msg_nametype(struct tipc_msg *m) { return msg_word(m, 8); } static inline void msg_set_nametype(struct tipc_msg *m, u32 n) { msg_set_word(m, 8, n); } static inline u32 msg_nameinst(struct tipc_msg *m) { return msg_word(m, 9); } static inline u32 msg_namelower(struct tipc_msg *m) { return msg_nameinst(m); } static inline void msg_set_namelower(struct tipc_msg *m, u32 n) { msg_set_word(m, 9, n); } static inline void msg_set_nameinst(struct tipc_msg *m, u32 n) { msg_set_namelower(m, n); } static inline u32 msg_nameupper(struct tipc_msg *m) { return msg_word(m, 10); } static inline void msg_set_nameupper(struct tipc_msg *m, u32 n) { msg_set_word(m, 10, n); } /* * Constants and routines used to read and write TIPC internal message headers */ /* * Connection management protocol message types */ #define CONN_PROBE 0 #define CONN_PROBE_REPLY 1 #define CONN_ACK 2 /* * Name distributor message types */ #define PUBLICATION 0 #define WITHDRAWAL 1 /* * Segmentation message types */ #define FIRST_FRAGMENT 0 #define FRAGMENT 1 #define LAST_FRAGMENT 2 /* * Link management protocol message types */ #define STATE_MSG 0 #define RESET_MSG 1 #define ACTIVATE_MSG 2 /* * Changeover tunnel message types */ #define SYNCH_MSG 0 #define FAILOVER_MSG 1 /* * Config protocol message types */ #define DSC_REQ_MSG 0 #define DSC_RESP_MSG 1 #define DSC_TRIAL_MSG 2 #define DSC_TRIAL_FAIL_MSG 3 /* * Group protocol message types */ #define GRP_JOIN_MSG 0 #define GRP_LEAVE_MSG 1 #define GRP_ADV_MSG 2 #define GRP_ACK_MSG 3 #define GRP_RECLAIM_MSG 4 #define GRP_REMIT_MSG 5 /* Crypto message types */ #define KEY_DISTR_MSG 0 /* * Word 1 */ static inline u32 msg_seq_gap(struct tipc_msg *m) { return msg_bits(m, 1, 16, 0x1fff); } static inline void msg_set_seq_gap(struct tipc_msg *m, u32 n) { msg_set_bits(m, 1, 16, 0x1fff, n); } static inline u32 msg_node_sig(struct tipc_msg *m) { return msg_bits(m, 1, 0, 0xffff); } static inline void msg_set_node_sig(struct tipc_msg *m, u32 n) { msg_set_bits(m, 1, 0, 0xffff, n); } static inline u32 msg_node_capabilities(struct tipc_msg *m) { return msg_bits(m, 1, 15, 0x1fff); } static inline void msg_set_node_capabilities(struct tipc_msg *m, u32 n) { msg_set_bits(m, 1, 15, 0x1fff, n); } /* * Word 2 */ static inline u32 msg_dest_domain(struct tipc_msg *m) { return msg_word(m, 2); } static inline void msg_set_dest_domain(struct tipc_msg *m, u32 n) { msg_set_word(m, 2, n); } static inline void msg_set_bcgap_after(struct tipc_msg *m, u32 n) { msg_set_bits(m, 2, 16, 0xffff, n); } static inline u32 msg_bcgap_to(struct tipc_msg *m) { return msg_bits(m, 2, 0, 0xffff); } static inline void msg_set_bcgap_to(struct tipc_msg *m, u32 n) { msg_set_bits(m, 2, 0, 0xffff, n); } /* * Word 4 */ static inline u32 msg_last_bcast(struct tipc_msg *m) { return msg_bits(m, 4, 16, 0xffff); } static inline u32 msg_bc_snd_nxt(struct tipc_msg *m) { return msg_last_bcast(m) + 1; } static inline void msg_set_last_bcast(struct tipc_msg *m, u32 n) { msg_set_bits(m, 4, 16, 0xffff, n); } static inline u32 msg_nof_fragms(struct tipc_msg *m) { return msg_bits(m, 4, 0, 0xffff); } static inline void msg_set_nof_fragms(struct tipc_msg *m, u32 n) { msg_set_bits(m, 4, 0, 0xffff, n); } static inline u32 msg_fragm_no(struct tipc_msg *m) { return msg_bits(m, 4, 16, 0xffff); } static inline void msg_set_fragm_no(struct tipc_msg *m, u32 n) { msg_set_bits(m, 4, 16, 0xffff, n); } static inline u16 msg_next_sent(struct tipc_msg *m) { return msg_bits(m, 4, 0, 0xffff); } static inline void msg_set_next_sent(struct tipc_msg *m, u16 n) { msg_set_bits(m, 4, 0, 0xffff, n); } static inline u32 msg_bc_netid(struct tipc_msg *m) { return msg_word(m, 4); } static inline void msg_set_bc_netid(struct tipc_msg *m, u32 id) { msg_set_word(m, 4, id); } static inline u32 msg_link_selector(struct tipc_msg *m) { if (msg_user(m) == MSG_FRAGMENTER) m = (void *)msg_data(m); return msg_bits(m, 4, 0, 1); } /* * Word 5 */ static inline u16 msg_session(struct tipc_msg *m) { return msg_bits(m, 5, 16, 0xffff); } static inline void msg_set_session(struct tipc_msg *m, u16 n) { msg_set_bits(m, 5, 16, 0xffff, n); } static inline u32 msg_probe(struct tipc_msg *m) { return msg_bits(m, 5, 0, 1); } static inline void msg_set_probe(struct tipc_msg *m, u32 val) { msg_set_bits(m, 5, 0, 1, val); } static inline char msg_net_plane(struct tipc_msg *m) { return msg_bits(m, 5, 1, 7) + 'A'; } static inline void msg_set_net_plane(struct tipc_msg *m, char n) { msg_set_bits(m, 5, 1, 7, (n - 'A')); } static inline u32 msg_linkprio(struct tipc_msg *m) { return msg_bits(m, 5, 4, 0x1f); } static inline void msg_set_linkprio(struct tipc_msg *m, u32 n) { msg_set_bits(m, 5, 4, 0x1f, n); } static inline u32 msg_bearer_id(struct tipc_msg *m) { return msg_bits(m, 5, 9, 0x7); } static inline void msg_set_bearer_id(struct tipc_msg *m, u32 n) { msg_set_bits(m, 5, 9, 0x7, n); } static inline u32 msg_redundant_link(struct tipc_msg *m) { return msg_bits(m, 5, 12, 0x1); } static inline void msg_set_redundant_link(struct tipc_msg *m, u32 r) { msg_set_bits(m, 5, 12, 0x1, r); } static inline u32 msg_peer_stopping(struct tipc_msg *m) { return msg_bits(m, 5, 13, 0x1); } static inline void msg_set_peer_stopping(struct tipc_msg *m, u32 s) { msg_set_bits(m, 5, 13, 0x1, s); } static inline bool msg_bc_ack_invalid(struct tipc_msg *m) { switch (msg_user(m)) { case BCAST_PROTOCOL: case NAME_DISTRIBUTOR: case LINK_PROTOCOL: return msg_bits(m, 5, 14, 0x1); default: return false; } } static inline void msg_set_bc_ack_invalid(struct tipc_msg *m, bool invalid) { msg_set_bits(m, 5, 14, 0x1, invalid); } static inline char *msg_media_addr(struct tipc_msg *m) { return (char *)&m->hdr[TIPC_MEDIA_INFO_OFFSET]; } static inline u32 msg_bc_gap(struct tipc_msg *m) { return msg_bits(m, 8, 0, 0x3ff); } static inline void msg_set_bc_gap(struct tipc_msg *m, u32 n) { msg_set_bits(m, 8, 0, 0x3ff, n); } /* * Word 9 */ static inline u16 msg_msgcnt(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_msgcnt(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 16, 0xffff, n); } static inline u16 msg_syncpt(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_syncpt(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 16, 0xffff, n); } static inline u32 msg_conn_ack(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_conn_ack(struct tipc_msg *m, u32 n) { msg_set_bits(m, 9, 16, 0xffff, n); } static inline u16 msg_adv_win(struct tipc_msg *m) { return msg_bits(m, 9, 0, 0xffff); } static inline void msg_set_adv_win(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 0, 0xffff, n); } static inline u32 msg_max_pkt(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff) * 4; } static inline void msg_set_max_pkt(struct tipc_msg *m, u32 n) { msg_set_bits(m, 9, 16, 0xffff, (n / 4)); } static inline u32 msg_link_tolerance(struct tipc_msg *m) { return msg_bits(m, 9, 0, 0xffff); } static inline void msg_set_link_tolerance(struct tipc_msg *m, u32 n) { msg_set_bits(m, 9, 0, 0xffff, n); } static inline u16 msg_grp_bc_syncpt(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_grp_bc_syncpt(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 16, 0xffff, n); } static inline u16 msg_grp_bc_acked(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_grp_bc_acked(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 16, 0xffff, n); } static inline u16 msg_grp_remitted(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_grp_remitted(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 16, 0xffff, n); } /* Word 10 */ static inline u16 msg_grp_evt(struct tipc_msg *m) { return msg_bits(m, 10, 0, 0x3); } static inline void msg_set_grp_evt(struct tipc_msg *m, int n) { msg_set_bits(m, 10, 0, 0x3, n); } static inline u16 msg_grp_bc_ack_req(struct tipc_msg *m) { return msg_bits(m, 10, 0, 0x1); } static inline void msg_set_grp_bc_ack_req(struct tipc_msg *m, bool n) { msg_set_bits(m, 10, 0, 0x1, n); } static inline u16 msg_grp_bc_seqno(struct tipc_msg *m) { return msg_bits(m, 10, 16, 0xffff); } static inline void msg_set_grp_bc_seqno(struct tipc_msg *m, u32 n) { msg_set_bits(m, 10, 16, 0xffff, n); } static inline bool msg_peer_link_is_up(struct tipc_msg *m) { if (likely(msg_user(m) != LINK_PROTOCOL)) return true; if (msg_type(m) == STATE_MSG) return true; return false; } static inline bool msg_peer_node_is_up(struct tipc_msg *m) { if (msg_peer_link_is_up(m)) return true; return msg_redundant_link(m); } static inline bool msg_is_reset(struct tipc_msg *hdr) { return (msg_user(hdr) == LINK_PROTOCOL) && (msg_type(hdr) == RESET_MSG); } /* Word 13 */ static inline void msg_set_peer_net_hash(struct tipc_msg *m, u32 n) { msg_set_word(m, 13, n); } static inline u32 msg_peer_net_hash(struct tipc_msg *m) { return msg_word(m, 13); } /* Word 14 */ static inline u32 msg_sugg_node_addr(struct tipc_msg *m) { return msg_word(m, 14); } static inline void msg_set_sugg_node_addr(struct tipc_msg *m, u32 n) { msg_set_word(m, 14, n); } static inline void msg_set_node_id(struct tipc_msg *hdr, u8 *id) { memcpy(msg_data(hdr), id, 16); } static inline u8 *msg_node_id(struct tipc_msg *hdr) { return (u8 *)msg_data(hdr); } struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp); bool tipc_msg_validate(struct sk_buff **_skb); bool tipc_msg_reverse(u32 own_addr, struct sk_buff **skb, int err); void tipc_skb_reject(struct net *net, int err, struct sk_buff *skb, struct sk_buff_head *xmitq); void tipc_msg_init(u32 own_addr, struct tipc_msg *m, u32 user, u32 type, u32 hsize, u32 destnode); struct sk_buff *tipc_msg_create(uint user, uint type, uint hdr_sz, uint data_sz, u32 dnode, u32 onode, u32 dport, u32 oport, int errcode); int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf); bool tipc_msg_try_bundle(struct sk_buff *tskb, struct sk_buff **skb, u32 mss, u32 dnode, bool *new_bundle); bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos); int tipc_msg_fragment(struct sk_buff *skb, const struct tipc_msg *hdr, int pktmax, struct sk_buff_head *frags); int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset, int dsz, int mtu, struct sk_buff_head *list); int tipc_msg_append(struct tipc_msg *hdr, struct msghdr *m, int dlen, int mss, struct sk_buff_head *txq); bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err); bool tipc_msg_assemble(struct sk_buff_head *list); bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq); bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg, struct sk_buff_head *cpy); bool __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno, struct sk_buff *skb); bool tipc_msg_skb_clone(struct sk_buff_head *msg, struct sk_buff_head *cpy); static inline u16 buf_seqno(struct sk_buff *skb) { return msg_seqno(buf_msg(skb)); } static inline int buf_roundup_len(struct sk_buff *skb) { return (skb->len / 1024 + 1) * 1024; } /* tipc_skb_peek(): peek and reserve first buffer in list * @list: list to be peeked in * Returns pointer to first buffer in list, if any */ static inline struct sk_buff *tipc_skb_peek(struct sk_buff_head *list, spinlock_t *lock) { struct sk_buff *skb; spin_lock_bh(lock); skb = skb_peek(list); if (skb) skb_get(skb); spin_unlock_bh(lock); return skb; } /* tipc_skb_peek_port(): find a destination port, ignoring all destinations * up to and including 'filter'. * Note: ignoring previously tried destinations minimizes the risk of * contention on the socket lock * @list: list to be peeked in * @filter: last destination to be ignored from search * Returns a destination port number, of applicable. */ static inline u32 tipc_skb_peek_port(struct sk_buff_head *list, u32 filter) { struct sk_buff *skb; u32 dport = 0; bool ignore = true; spin_lock_bh(&list->lock); skb_queue_walk(list, skb) { dport = msg_destport(buf_msg(skb)); if (!filter || skb_queue_is_last(list, skb)) break; if (dport == filter) ignore = false; else if (!ignore) break; } spin_unlock_bh(&list->lock); return dport; } /* tipc_skb_dequeue(): unlink first buffer with dest 'dport' from list * @list: list to be unlinked from * @dport: selection criteria for buffer to unlink */ static inline struct sk_buff *tipc_skb_dequeue(struct sk_buff_head *list, u32 dport) { struct sk_buff *_skb, *tmp, *skb = NULL; spin_lock_bh(&list->lock); skb_queue_walk_safe(list, _skb, tmp) { if (msg_destport(buf_msg(_skb)) == dport) { __skb_unlink(_skb, list); skb = _skb; break; } } spin_unlock_bh(&list->lock); return skb; } /* tipc_skb_queue_splice_tail - append an skb list to lock protected list * @list: the new list to append. Not lock protected * @head: target list. Lock protected. */ static inline void tipc_skb_queue_splice_tail(struct sk_buff_head *list, struct sk_buff_head *head) { spin_lock_bh(&head->lock); skb_queue_splice_tail(list, head); spin_unlock_bh(&head->lock); } /* tipc_skb_queue_splice_tail_init - merge two lock protected skb lists * @list: the new list to add. Lock protected. Will be reinitialized * @head: target list. Lock protected. */ static inline void tipc_skb_queue_splice_tail_init(struct sk_buff_head *list, struct sk_buff_head *head) { struct sk_buff_head tmp; __skb_queue_head_init(&tmp); spin_lock_bh(&list->lock); skb_queue_splice_tail_init(list, &tmp); spin_unlock_bh(&list->lock); tipc_skb_queue_splice_tail(&tmp, head); } /* __tipc_skb_dequeue() - dequeue the head skb according to expected seqno * @list: list to be dequeued from * @seqno: seqno of the expected msg * * returns skb dequeued from the list if its seqno is less than or equal to * the expected one, otherwise the skb is still hold * * Note: must be used with appropriate locks held only */ static inline struct sk_buff *__tipc_skb_dequeue(struct sk_buff_head *list, u16 seqno) { struct sk_buff *skb = skb_peek(list); if (skb && less_eq(buf_seqno(skb), seqno)) { __skb_unlink(skb, list); return skb; } return NULL; } #endif
8275 8276 8271 1 8271 8267 20 8269 8276 8272 1 8270 8266 20 8255 6243 6245 6245 6245 6112 3 8 8 7 3 8 8 6 5 5 5 6 2 8 8 8 8254 8254 8250 4162 2714 6253 8 8 6245 2723 8251 8251 8252 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 // SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/realpath.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include "common.h" #include <linux/magic.h> #include <linux/proc_fs.h> /** * tomoyo_encode2 - Encode binary string to ascii string. * * @str: String in binary format. * @str_len: Size of @str in byte. * * Returns pointer to @str in ascii format on success, NULL otherwise. * * This function uses kzalloc(), so caller must kfree() if this function * didn't return NULL. */ char *tomoyo_encode2(const char *str, int str_len) { int i; int len = 0; const char *p = str; char *cp; char *cp0; if (!p) return NULL; for (i = 0; i < str_len; i++) { const unsigned char c = p[i]; if (c == '\\') len += 2; else if (c > ' ' && c < 127) len++; else len += 4; } len++; /* Reserve space for appending "/". */ cp = kzalloc(len + 10, GFP_NOFS); if (!cp) return NULL; cp0 = cp; p = str; for (i = 0; i < str_len; i++) { const unsigned char c = p[i]; if (c == '\\') { *cp++ = '\\'; *cp++ = '\\'; } else if (c > ' ' && c < 127) { *cp++ = c; } else { *cp++ = '\\'; *cp++ = (c >> 6) + '0'; *cp++ = ((c >> 3) & 7) + '0'; *cp++ = (c & 7) + '0'; } } return cp0; } /** * tomoyo_encode - Encode binary string to ascii string. * * @str: String in binary format. * * Returns pointer to @str in ascii format on success, NULL otherwise. * * This function uses kzalloc(), so caller must kfree() if this function * didn't return NULL. */ char *tomoyo_encode(const char *str) { return str ? tomoyo_encode2(str, strlen(str)) : NULL; } /** * tomoyo_get_absolute_path - Get the path of a dentry but ignores chroot'ed root. * * @path: Pointer to "struct path". * @buffer: Pointer to buffer to return value in. * @buflen: Sizeof @buffer. * * Returns the buffer on success, an error code otherwise. * * If dentry is a directory, trailing '/' is appended. */ static char *tomoyo_get_absolute_path(const struct path *path, char * const buffer, const int buflen) { char *pos = ERR_PTR(-ENOMEM); if (buflen >= 256) { /* go to whatever namespace root we are under */ pos = d_absolute_path(path, buffer, buflen - 1); if (!IS_ERR(pos) && *pos == '/' && pos[1]) { struct inode *inode = d_backing_inode(path->dentry); if (inode && S_ISDIR(inode->i_mode)) { buffer[buflen - 2] = '/'; buffer[buflen - 1] = '\0'; } } } return pos; } /** * tomoyo_get_dentry_path - Get the path of a dentry. * * @dentry: Pointer to "struct dentry". * @buffer: Pointer to buffer to return value in. * @buflen: Sizeof @buffer. * * Returns the buffer on success, an error code otherwise. * * If dentry is a directory, trailing '/' is appended. */ static char *tomoyo_get_dentry_path(struct dentry *dentry, char * const buffer, const int buflen) { char *pos = ERR_PTR(-ENOMEM); if (buflen >= 256) { pos = dentry_path_raw(dentry, buffer, buflen - 1); if (!IS_ERR(pos) && *pos == '/' && pos[1]) { struct inode *inode = d_backing_inode(dentry); if (inode && S_ISDIR(inode->i_mode)) { buffer[buflen - 2] = '/'; buffer[buflen - 1] = '\0'; } } } return pos; } /** * tomoyo_get_local_path - Get the path of a dentry. * * @dentry: Pointer to "struct dentry". * @buffer: Pointer to buffer to return value in. * @buflen: Sizeof @buffer. * * Returns the buffer on success, an error code otherwise. */ static char *tomoyo_get_local_path(struct dentry *dentry, char * const buffer, const int buflen) { struct super_block *sb = dentry->d_sb; char *pos = tomoyo_get_dentry_path(dentry, buffer, buflen); if (IS_ERR(pos)) return pos; /* Convert from $PID to self if $PID is current thread. */ if (sb->s_magic == PROC_SUPER_MAGIC && *pos == '/') { char *ep; const pid_t pid = (pid_t) simple_strtoul(pos + 1, &ep, 10); struct pid_namespace *proc_pidns = proc_pid_ns(sb); if (*ep == '/' && pid && pid == task_tgid_nr_ns(current, proc_pidns)) { pos = ep - 5; if (pos < buffer) goto out; memmove(pos, "/self", 5); } goto prepend_filesystem_name; } /* Use filesystem name for unnamed devices. */ if (!MAJOR(sb->s_dev)) goto prepend_filesystem_name; { struct inode *inode = d_backing_inode(sb->s_root); /* * Use filesystem name if filesystem does not support rename() * operation. */ if (!inode->i_op->rename) goto prepend_filesystem_name; } /* Prepend device name. */ { char name[64]; int name_len; const dev_t dev = sb->s_dev; name[sizeof(name) - 1] = '\0'; snprintf(name, sizeof(name) - 1, "dev(%u,%u):", MAJOR(dev), MINOR(dev)); name_len = strlen(name); pos -= name_len; if (pos < buffer) goto out; memmove(pos, name, name_len); return pos; } /* Prepend filesystem name. */ prepend_filesystem_name: { const char *name = sb->s_type->name; const int name_len = strlen(name); pos -= name_len + 1; if (pos < buffer) goto out; memmove(pos, name, name_len); pos[name_len] = ':'; } return pos; out: return ERR_PTR(-ENOMEM); } /** * tomoyo_realpath_from_path - Returns realpath(3) of the given pathname but ignores chroot'ed root. * * @path: Pointer to "struct path". * * Returns the realpath of the given @path on success, NULL otherwise. * * If dentry is a directory, trailing '/' is appended. * Characters out of 0x20 < c < 0x7F range are converted to * \ooo style octal string. * Character \ is converted to \\ string. * * These functions use kzalloc(), so the caller must call kfree() * if these functions didn't return NULL. */ char *tomoyo_realpath_from_path(const struct path *path) { char *buf = NULL; char *name = NULL; unsigned int buf_len = PAGE_SIZE / 2; struct dentry *dentry = path->dentry; struct super_block *sb = dentry->d_sb; while (1) { char *pos; struct inode *inode; buf_len <<= 1; kfree(buf); buf = kmalloc(buf_len, GFP_NOFS); if (!buf) break; /* To make sure that pos is '\0' terminated. */ buf[buf_len - 1] = '\0'; /* For "pipe:[\$]" and "socket:[\$]". */ if (dentry->d_op && dentry->d_op->d_dname) { pos = dentry->d_op->d_dname(dentry, buf, buf_len - 1); goto encode; } inode = d_backing_inode(sb->s_root); /* * Get local name for filesystems without rename() operation */ if ((!inode->i_op->rename && !(sb->s_type->fs_flags & FS_REQUIRES_DEV))) pos = tomoyo_get_local_path(path->dentry, buf, buf_len - 1); /* Get absolute name for the rest. */ else { pos = tomoyo_get_absolute_path(path, buf, buf_len - 1); /* * Fall back to local name if absolute name is not * available. */ if (pos == ERR_PTR(-EINVAL)) pos = tomoyo_get_local_path(path->dentry, buf, buf_len - 1); } encode: if (IS_ERR(pos)) continue; name = tomoyo_encode(pos); break; } kfree(buf); if (!name) tomoyo_warn_oom(__func__); return name; } /** * tomoyo_realpath_nofollow - Get realpath of a pathname. * * @pathname: The pathname to solve. * * Returns the realpath of @pathname on success, NULL otherwise. */ char *tomoyo_realpath_nofollow(const char *pathname) { struct path path; if (pathname && kern_path(pathname, 0, &path) == 0) { char *buf = tomoyo_realpath_from_path(&path); path_put(&path); return buf; } return NULL; }
2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 // SPDX-License-Identifier: GPL-2.0-only #include "netlink.h" #include "common.h" #include "bitset.h" struct debug_req_info { struct ethnl_req_info base; }; struct debug_reply_data { struct ethnl_reply_data base; u32 msg_mask; }; #define DEBUG_REPDATA(__reply_base) \ container_of(__reply_base, struct debug_reply_data, base) const struct nla_policy ethnl_debug_get_policy[] = { [ETHTOOL_A_DEBUG_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int debug_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct debug_reply_data *data = DEBUG_REPDATA(reply_base); struct net_device *dev = reply_base->dev; int ret; if (!dev->ethtool_ops->get_msglevel) return -EOPNOTSUPP; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; data->msg_mask = dev->ethtool_ops->get_msglevel(dev); ethnl_ops_complete(dev); return 0; } static int debug_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct debug_reply_data *data = DEBUG_REPDATA(reply_base); bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; return ethnl_bitset32_size(&data->msg_mask, NULL, NETIF_MSG_CLASS_COUNT, netif_msg_class_names, compact); } static int debug_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct debug_reply_data *data = DEBUG_REPDATA(reply_base); bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; return ethnl_put_bitset32(skb, ETHTOOL_A_DEBUG_MSGMASK, &data->msg_mask, NULL, NETIF_MSG_CLASS_COUNT, netif_msg_class_names, compact); } /* DEBUG_SET */ const struct nla_policy ethnl_debug_set_policy[] = { [ETHTOOL_A_DEBUG_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_DEBUG_MSGMASK] = { .type = NLA_NESTED }, }; static int ethnl_set_debug_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; return ops->get_msglevel && ops->set_msglevel ? 1 : -EOPNOTSUPP; } static int ethnl_set_debug(struct ethnl_req_info *req_info, struct genl_info *info) { struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; bool mod = false; u32 msg_mask; int ret; msg_mask = dev->ethtool_ops->get_msglevel(dev); ret = ethnl_update_bitset32(&msg_mask, NETIF_MSG_CLASS_COUNT, tb[ETHTOOL_A_DEBUG_MSGMASK], netif_msg_class_names, info->extack, &mod); if (ret < 0 || !mod) return ret; dev->ethtool_ops->set_msglevel(dev, msg_mask); return 1; } const struct ethnl_request_ops ethnl_debug_request_ops = { .request_cmd = ETHTOOL_MSG_DEBUG_GET, .reply_cmd = ETHTOOL_MSG_DEBUG_GET_REPLY, .hdr_attr = ETHTOOL_A_DEBUG_HEADER, .req_info_size = sizeof(struct debug_req_info), .reply_data_size = sizeof(struct debug_reply_data), .prepare_data = debug_prepare_data, .reply_size = debug_reply_size, .fill_reply = debug_fill_reply, .set_validate = ethnl_set_debug_validate, .set = ethnl_set_debug, .set_ntf_cmd = ETHTOOL_MSG_DEBUG_NTF, };
3 2 2 1 3 3 2 2 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162