7 7 6 1 7 13 13 1 13 1 12 11 1 10 3 7 7 7 7 7 3 143 143 144 143 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 fragment reassembly * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Based on: net/ipv4/ip_fragment.c */ /* * Fixes: * Andi Kleen Make it work with multiple hosts. * More RFC compliance. * * Horst von Brand Add missing #include <linux/string.h> * Alexey Kuznetsov SMP races, threading, cleanup. * Patrick McHardy LRU queue of frag heads for evictor. * Mitsuru KANDA @USAGI Register inet6_protocol{}. * David Stevens and * YOSHIFUJI,H. @USAGI Always remove fragment header to * calculate ICV correctly. */ #define pr_fmt(fmt) "IPv6: " fmt #include <linux/errno.h> #include <linux/types.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/jiffies.h> #include <linux/net.h> #include <linux/list.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <linux/random.h> #include <linux/jhash.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/tcp.h> #include <linux/udp.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/protocol.h> #include <net/transp_v6.h> #include <net/rawv6.h> #include <net/ndisc.h> #include <net/addrconf.h> #include <net/ipv6_frag.h> #include <net/inet_ecn.h> static const char ip6_frag_cache_name[] = "ip6-frags"; static u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) { return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK); } static struct inet_frags ip6_frags; static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *dev); static void ip6_frag_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); struct frag_queue *fq; fq = container_of(frag, struct frag_queue, q); ip6frag_expire_frag_queue(fq->q.fqdir->net, fq); } static struct frag_queue * fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif) { struct frag_v6_compare_key key = { .id = id, .saddr = hdr->saddr, .daddr = hdr->daddr, .user = IP6_DEFRAG_LOCAL_DELIVER, .iif = iif, }; struct inet_frag_queue *q; if (!(ipv6_addr_type(&hdr->daddr) & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL))) key.iif = 0; q = inet_frag_find(net->ipv6.fqdir, &key); if (!q) return NULL; return container_of(q, struct frag_queue, q); } static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, struct frag_hdr *fhdr, int nhoff, u32 *prob_offset) { struct net *net = dev_net(skb_dst(skb)->dev); int offset, end, fragsize; struct sk_buff *prev_tail; struct net_device *dev; int err = -ENOENT; SKB_DR(reason); u8 ecn; /* If reassembly is already done, @skb must be a duplicate frag. */ if (fq->q.flags & INET_FRAG_COMPLETE) { SKB_DR_SET(reason, DUP_FRAG); goto err; } err = -EINVAL; offset = ntohs(fhdr->frag_off) & ~0x7; end = offset + (ntohs(ipv6_hdr(skb)->payload_len) - ((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1))); if ((unsigned int)end > IPV6_MAXPLEN) { *prob_offset = (u8 *)&fhdr->frag_off - skb_network_header(skb); /* note that if prob_offset is set, the skb is freed elsewhere, * we do not free it here. */ return -1; } ecn = ip6_frag_ecn(ipv6_hdr(skb)); if (skb->ip_summed == CHECKSUM_COMPLETE) { const unsigned char *nh = skb_network_header(skb); skb->csum = csum_sub(skb->csum, csum_partial(nh, (u8 *)(fhdr + 1) - nh, 0)); } /* Is this the final fragment? */ if (!(fhdr->frag_off & htons(IP6_MF))) { /* If we already have some bits beyond end * or have different end, the segment is corrupted. */ if (end < fq->q.len || ((fq->q.flags & INET_FRAG_LAST_IN) && end != fq->q.len)) goto discard_fq; fq->q.flags |= INET_FRAG_LAST_IN; fq->q.len = end; } else { /* Check if the fragment is rounded to 8 bytes. * Required by the RFC. */ if (end & 0x7) { /* RFC2460 says always send parameter problem in * this case. -DaveM */ *prob_offset = offsetof(struct ipv6hdr, payload_len); return -1; } if (end > fq->q.len) { /* Some bits beyond end -> corruption. */ if (fq->q.flags & INET_FRAG_LAST_IN) goto discard_fq; fq->q.len = end; } } if (end == offset) goto discard_fq; err = -ENOMEM; /* Point into the IP datagram 'data' part. */ if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data)) goto discard_fq; err = pskb_trim_rcsum(skb, end - offset); if (err) goto discard_fq; /* Note : skb->rbnode and skb->dev share the same location. */ dev = skb->dev; /* Makes sure compiler wont do silly aliasing games */ barrier(); prev_tail = fq->q.fragments_tail; err = inet_frag_queue_insert(&fq->q, skb, offset, end); if (err) goto insert_error; if (dev) fq->iif = dev->ifindex; fq->q.stamp = skb->tstamp; fq->q.mono_delivery_time = skb->mono_delivery_time; fq->q.meat += skb->len; fq->ecn |= ecn; add_frag_mem_limit(fq->q.fqdir, skb->truesize); fragsize = -skb_network_offset(skb) + skb->len; if (fragsize > fq->q.max_size) fq->q.max_size = fragsize; /* The first fragment. * nhoffset is obtained from the first fragment, of course. */ if (offset == 0) { fq->nhoffset = nhoff; fq->q.flags |= INET_FRAG_FIRST_IN; } if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && fq->q.meat == fq->q.len) { unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; err = ip6_frag_reasm(fq, skb, prev_tail, dev); skb->_skb_refdst = orefdst; return err; } skb_dst_drop(skb); return -EINPROGRESS; insert_error: if (err == IPFRAG_DUP) { SKB_DR_SET(reason, DUP_FRAG); err = -EINVAL; goto err; } err = -EINVAL; __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASM_OVERLAPS); discard_fq: inet_frag_kill(&fq->q); __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS); err: kfree_skb_reason(skb, reason); return err; } /* * Check if this packet is complete. * * It is called with locked fq, and caller must check that * queue is eligible for reassembly i.e. it is not COMPLETE, * the last and the first frames arrived and all the bits are here. */ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *dev) { struct net *net = fq->q.fqdir->net; unsigned int nhoff; void *reasm_data; int payload_len; u8 ecn; inet_frag_kill(&fq->q); ecn = ip_frag_ecn_table[fq->ecn]; if (unlikely(ecn == 0xff)) goto out_fail; reasm_data = inet_frag_reasm_prepare(&fq->q, skb, prev_tail); if (!reasm_data) goto out_oom; payload_len = ((skb->data - skb_network_header(skb)) - sizeof(struct ipv6hdr) + fq->q.len - sizeof(struct frag_hdr)); if (payload_len > IPV6_MAXPLEN) goto out_oversize; /* We have to remove fragment header from datagram and to relocate * header in order to calculate ICV correctly. */ nhoff = fq->nhoffset; skb_network_header(skb)[nhoff] = skb_transport_header(skb)[0]; memmove(skb->head + sizeof(struct frag_hdr), skb->head, (skb->data - skb->head) - sizeof(struct frag_hdr)); if (skb_mac_header_was_set(skb)) skb->mac_header += sizeof(struct frag_hdr); skb->network_header += sizeof(struct frag_hdr); skb_reset_transport_header(skb); inet_frag_reasm_finish(&fq->q, skb, reasm_data, true); skb->dev = dev; ipv6_hdr(skb)->payload_len = htons(payload_len); ipv6_change_dsfield(ipv6_hdr(skb), 0xff, ecn); IP6CB(skb)->nhoff = nhoff; IP6CB(skb)->flags |= IP6SKB_FRAGMENTED; IP6CB(skb)->frag_max_size = fq->q.max_size; /* Yes, and fold redundant checksum back. 8) */ skb_postpush_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); rcu_read_lock(); __IP6_INC_STATS(net, __in6_dev_stats_get(dev, skb), IPSTATS_MIB_REASMOKS); rcu_read_unlock(); fq->q.rb_fragments = RB_ROOT; fq->q.fragments_tail = NULL; fq->q.last_run_head = NULL; return 1; out_oversize: net_dbg_ratelimited("ip6_frag_reasm: payload len = %d\n", payload_len); goto out_fail; out_oom: net_dbg_ratelimited("ip6_frag_reasm: no memory for reassembly\n"); out_fail: rcu_read_lock(); __IP6_INC_STATS(net, __in6_dev_stats_get(dev, skb), IPSTATS_MIB_REASMFAILS); rcu_read_unlock(); inet_frag_kill(&fq->q); return -1; } static int ipv6_frag_rcv(struct sk_buff *skb) { struct frag_hdr *fhdr; struct frag_queue *fq; const struct ipv6hdr *hdr = ipv6_hdr(skb); struct net *net = dev_net(skb_dst(skb)->dev); u8 nexthdr; int iif; if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED) goto fail_hdr; __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMREQDS); /* Jumbo payload inhibits frag. header */ if (hdr->payload_len == 0) goto fail_hdr; if (!pskb_may_pull(skb, (skb_transport_offset(skb) + sizeof(struct frag_hdr)))) goto fail_hdr; hdr = ipv6_hdr(skb); fhdr = (struct frag_hdr *)skb_transport_header(skb); if (!(fhdr->frag_off & htons(IP6_OFFSET | IP6_MF))) { /* It is not a fragmented frame */ skb->transport_header += sizeof(struct frag_hdr); __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMOKS); IP6CB(skb)->nhoff = (u8 *)fhdr - skb_network_header(skb); IP6CB(skb)->flags |= IP6SKB_FRAGMENTED; IP6CB(skb)->frag_max_size = ntohs(hdr->payload_len) + sizeof(struct ipv6hdr); return 1; } /* RFC 8200, Section 4.5 Fragment Header: * If the first fragment does not include all headers through an * Upper-Layer header, then that fragment should be discarded and * an ICMP Parameter Problem, Code 3, message should be sent to * the source of the fragment, with the Pointer field set to zero. */ nexthdr = hdr->nexthdr; if (ipv6frag_thdr_truncated(skb, skb_transport_offset(skb), &nexthdr)) { __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_INCOMP, 0); return -1; } iif = skb->dev ? skb->dev->ifindex : 0; fq = fq_find(net, fhdr->identification, hdr, iif); if (fq) { u32 prob_offset = 0; int ret; spin_lock(&fq->q.lock); fq->iif = iif; ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff, &prob_offset); spin_unlock(&fq->q.lock); inet_frag_put(&fq->q); if (prob_offset) { __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INHDRERRORS); /* icmpv6_param_prob() calls kfree_skb(skb) */ icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, prob_offset); } return ret; } __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS); kfree_skb(skb); return -1; fail_hdr: __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb_network_header_len(skb)); return -1; } static const struct inet6_protocol frag_protocol = { .handler = ipv6_frag_rcv, .flags = INET6_PROTO_NOPOLICY, }; #ifdef CONFIG_SYSCTL static struct ctl_table ip6_frags_ns_ctl_table[] = { { .procname = "ip6frag_high_thresh", .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "ip6frag_low_thresh", .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "ip6frag_time", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { } }; /* secret interval has been deprecated */ static int ip6_frags_secret_interval_unused; static struct ctl_table ip6_frags_ctl_table[] = { { .procname = "ip6frag_secret_interval", .data = &ip6_frags_secret_interval_unused, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { } }; static int __net_init ip6_frags_ns_sysctl_register(struct net *net) { struct ctl_table *table; struct ctl_table_header *hdr; table = ip6_frags_ns_ctl_table; if (!net_eq(net, &init_net)) { table = kmemdup(table, sizeof(ip6_frags_ns_ctl_table), GFP_KERNEL); if (!table) goto err_alloc; } table[0].data = &net->ipv6.fqdir->high_thresh; table[0].extra1 = &net->ipv6.fqdir->low_thresh; table[1].data = &net->ipv6.fqdir->low_thresh; table[1].extra2 = &net->ipv6.fqdir->high_thresh; table[2].data = &net->ipv6.fqdir->timeout; hdr = register_net_sysctl_sz(net, "net/ipv6", table, ARRAY_SIZE(ip6_frags_ns_ctl_table)); if (!hdr) goto err_reg; net->ipv6.sysctl.frags_hdr = hdr; return 0; err_reg: if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; } static void __net_exit ip6_frags_ns_sysctl_unregister(struct net *net) { struct ctl_table *table; table = net->ipv6.sysctl.frags_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv6.sysctl.frags_hdr); if (!net_eq(net, &init_net)) kfree(table); } static struct ctl_table_header *ip6_ctl_header; static int ip6_frags_sysctl_register(void) { ip6_ctl_header = register_net_sysctl(&init_net, "net/ipv6", ip6_frags_ctl_table); return ip6_ctl_header == NULL ? -ENOMEM : 0; } static void ip6_frags_sysctl_unregister(void) { unregister_net_sysctl_table(ip6_ctl_header); } #else static int ip6_frags_ns_sysctl_register(struct net *net) { return 0; } static void ip6_frags_ns_sysctl_unregister(struct net *net) { } static int ip6_frags_sysctl_register(void) { return 0; } static void ip6_frags_sysctl_unregister(void) { } #endif static int __net_init ipv6_frags_init_net(struct net *net) { int res; res = fqdir_init(&net->ipv6.fqdir, &ip6_frags, net); if (res < 0) return res; net->ipv6.fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH; net->ipv6.fqdir->low_thresh = IPV6_FRAG_LOW_THRESH; net->ipv6.fqdir->timeout = IPV6_FRAG_TIMEOUT; res = ip6_frags_ns_sysctl_register(net); if (res < 0) fqdir_exit(net->ipv6.fqdir); return res; } static void __net_exit ipv6_frags_pre_exit_net(struct net *net) { fqdir_pre_exit(net->ipv6.fqdir); } static void __net_exit ipv6_frags_exit_net(struct net *net) { ip6_frags_ns_sysctl_unregister(net); fqdir_exit(net->ipv6.fqdir); } static struct pernet_operations ip6_frags_ops = { .init = ipv6_frags_init_net, .pre_exit = ipv6_frags_pre_exit_net, .exit = ipv6_frags_exit_net, }; static const struct rhashtable_params ip6_rhash_params = { .head_offset = offsetof(struct inet_frag_queue, node), .hashfn = ip6frag_key_hashfn, .obj_hashfn = ip6frag_obj_hashfn, .obj_cmpfn = ip6frag_obj_cmpfn, .automatic_shrinking = true, }; int __init ipv6_frag_init(void) { int ret; ip6_frags.constructor = ip6frag_init; ip6_frags.destructor = NULL; ip6_frags.qsize = sizeof(struct frag_queue); ip6_frags.frag_expire = ip6_frag_expire; ip6_frags.frags_cache_name = ip6_frag_cache_name; ip6_frags.rhash_params = ip6_rhash_params; ret = inet_frags_init(&ip6_frags); if (ret) goto out; ret = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT); if (ret) goto err_protocol; ret = ip6_frags_sysctl_register(); if (ret) goto err_sysctl; ret = register_pernet_subsys(&ip6_frags_ops); if (ret) goto err_pernet; out: return ret; err_pernet: ip6_frags_sysctl_unregister(); err_sysctl: inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT); err_protocol: inet_frags_fini(&ip6_frags); goto out; } void ipv6_frag_exit(void) { ip6_frags_sysctl_unregister(); unregister_pernet_subsys(&ip6_frags_ops); inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT); inet_frags_fini(&ip6_frags); }
9 5 89 10 13 25660 59 22 13 1529 74 71 44 1356 1351 130 370 1603 149 937 47 323 1932 61 52 1091 43 51 82 449 83 1228 17 88 41 185 63 118 11 98 6585 3059 317 1335 105 295 89 629 79 98 399 30 610 7046 981 255 447 389 383 150 239 61 13 227 1992 11 50 3 10 100 6 2893 7687 9105 3435 9573 1687 1487 61 103 377 49 66 35 3588 539 814 35 7 4 1503 71 495 38 2 25 23 112 57 13 13 6 4603 26 43 16 11 51 101 18 38 78 350 364 35 218 57 65 18 5 35 34 35 36 8 40 55 15 1 79 61 12806 5535 1 713 1 14 17 14 225 2282 2099 163 1203 1211 165 200 17777 2464 98 68 2507 5093 206 13014 1 13295 2845 1063 74 1953 113 35 63 35 43 9 45 2 52 8 92 692 38 175 12 34 13 4 21 8 219 282 290 837 8 8647 287 2126 283 4449 2260 245 9 41 38 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 /* SPDX-License-Identifier: GPL-2.0 */ /* * Linux Security Module Hook declarations. * * Copyright (C) 2001 WireX Communications, Inc <chris@wirex.com> * Copyright (C) 2001 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2001 Networks Associates Technology, Inc <ssmalley@nai.com> * Copyright (C) 2001 James Morris <jmorris@intercode.com.au> * Copyright (C) 2001 Silicon Graphics, Inc. (Trust Technology Group) * Copyright (C) 2015 Intel Corporation. * Copyright (C) 2015 Casey Schaufler <casey@schaufler-ca.com> * Copyright (C) 2016 Mellanox Techonologies * Copyright (C) 2020 Google LLC. */ /* * The macro LSM_HOOK is used to define the data structures required by * the LSM framework using the pattern: * * LSM_HOOK(<return_type>, <default_value>, <hook_name>, args...) * * struct security_hook_heads { * #define LSM_HOOK(RET, DEFAULT, NAME, ...) struct hlist_head NAME; * #include <linux/lsm_hook_defs.h> * #undef LSM_HOOK * }; */ LSM_HOOK(int, 0, binder_set_context_mgr, const struct cred *mgr) LSM_HOOK(int, 0, binder_transaction, const struct cred *from, const struct cred *to) LSM_HOOK(int, 0, binder_transfer_binder, const struct cred *from, const struct cred *to) LSM_HOOK(int, 0, binder_transfer_file, const struct cred *from, const struct cred *to, const struct file *file) LSM_HOOK(int, 0, ptrace_access_check, struct task_struct *child, unsigned int mode) LSM_HOOK(int, 0, ptrace_traceme, struct task_struct *parent) LSM_HOOK(int, 0, capget, const struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) LSM_HOOK(int, 0, capset, struct cred *new, const struct cred *old, const kernel_cap_t *effective, const kernel_cap_t *inheritable, const kernel_cap_t *permitted) LSM_HOOK(int, 0, capable, const struct cred *cred, struct user_namespace *ns, int cap, unsigned int opts) LSM_HOOK(int, 0, quotactl, int cmds, int type, int id, const struct super_block *sb) LSM_HOOK(int, 0, quota_on, struct dentry *dentry) LSM_HOOK(int, 0, syslog, int type) LSM_HOOK(int, 0, settime, const struct timespec64 *ts, const struct timezone *tz) LSM_HOOK(int, 1, vm_enough_memory, struct mm_struct *mm, long pages) LSM_HOOK(int, 0, bprm_creds_for_exec, struct linux_binprm *bprm) LSM_HOOK(int, 0, bprm_creds_from_file, struct linux_binprm *bprm, const struct file *file) LSM_HOOK(int, 0, bprm_check_security, struct linux_binprm *bprm) LSM_HOOK(void, LSM_RET_VOID, bprm_committing_creds, const struct linux_binprm *bprm) LSM_HOOK(void, LSM_RET_VOID, bprm_committed_creds, const struct linux_binprm *bprm) LSM_HOOK(int, 0, fs_context_submount, struct fs_context *fc, struct super_block *reference) LSM_HOOK(int, 0, fs_context_dup, struct fs_context *fc, struct fs_context *src_sc) LSM_HOOK(int, -ENOPARAM, fs_context_parse_param, struct fs_context *fc, struct fs_parameter *param) LSM_HOOK(int, 0, sb_alloc_security, struct super_block *sb) LSM_HOOK(void, LSM_RET_VOID, sb_delete, struct super_block *sb) LSM_HOOK(void, LSM_RET_VOID, sb_free_security, struct super_block *sb) LSM_HOOK(void, LSM_RET_VOID, sb_free_mnt_opts, void *mnt_opts) LSM_HOOK(int, 0, sb_eat_lsm_opts, char *orig, void **mnt_opts) LSM_HOOK(int, 0, sb_mnt_opts_compat, struct super_block *sb, void *mnt_opts) LSM_HOOK(int, 0, sb_remount, struct super_block *sb, void *mnt_opts) LSM_HOOK(int, 0, sb_kern_mount, const struct super_block *sb) LSM_HOOK(int, 0, sb_show_options, struct seq_file *m, struct super_block *sb) LSM_HOOK(int, 0, sb_statfs, struct dentry *dentry) LSM_HOOK(int, 0, sb_mount, const char *dev_name, const struct path *path, const char *type, unsigned long flags, void *data) LSM_HOOK(int, 0, sb_umount, struct vfsmount *mnt, int flags) LSM_HOOK(int, 0, sb_pivotroot, const struct path *old_path, const struct path *new_path) LSM_HOOK(int, 0, sb_set_mnt_opts, struct super_block *sb, void *mnt_opts, unsigned long kern_flags, unsigned long *set_kern_flags) LSM_HOOK(int, 0, sb_clone_mnt_opts, const struct super_block *oldsb, struct super_block *newsb, unsigned long kern_flags, unsigned long *set_kern_flags) LSM_HOOK(int, 0, move_mount, const struct path *from_path, const struct path *to_path) LSM_HOOK(int, -EOPNOTSUPP, dentry_init_security, struct dentry *dentry, int mode, const struct qstr *name, const char **xattr_name, void **ctx, u32 *ctxlen) LSM_HOOK(int, 0, dentry_create_files_as, struct dentry *dentry, int mode, struct qstr *name, const struct cred *old, struct cred *new) #ifdef CONFIG_SECURITY_PATH LSM_HOOK(int, 0, path_unlink, const struct path *dir, struct dentry *dentry) LSM_HOOK(int, 0, path_mkdir, const struct path *dir, struct dentry *dentry, umode_t mode) LSM_HOOK(int, 0, path_rmdir, const struct path *dir, struct dentry *dentry) LSM_HOOK(int, 0, path_mknod, const struct path *dir, struct dentry *dentry, umode_t mode, unsigned int dev) LSM_HOOK(int, 0, path_truncate, const struct path *path) LSM_HOOK(int, 0, path_symlink, const struct path *dir, struct dentry *dentry, const char *old_name) LSM_HOOK(int, 0, path_link, struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry) LSM_HOOK(int, 0, path_rename, const struct path *old_dir, struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry, unsigned int flags) LSM_HOOK(int, 0, path_chmod, const struct path *path, umode_t mode) LSM_HOOK(int, 0, path_chown, const struct path *path, kuid_t uid, kgid_t gid) LSM_HOOK(int, 0, path_chroot, const struct path *path) #endif /* CONFIG_SECURITY_PATH */ /* Needed for inode based security check */ LSM_HOOK(int, 0, path_notify, const struct path *path, u64 mask, unsigned int obj_type) LSM_HOOK(int, 0, inode_alloc_security, struct inode *inode) LSM_HOOK(void, LSM_RET_VOID, inode_free_security, struct inode *inode) LSM_HOOK(int, -EOPNOTSUPP, inode_init_security, struct inode *inode, struct inode *dir, const struct qstr *qstr, struct xattr *xattrs, int *xattr_count) LSM_HOOK(int, 0, inode_init_security_anon, struct inode *inode, const struct qstr *name, const struct inode *context_inode) LSM_HOOK(int, 0, inode_create, struct inode *dir, struct dentry *dentry, umode_t mode) LSM_HOOK(int, 0, inode_link, struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) LSM_HOOK(int, 0, inode_unlink, struct inode *dir, struct dentry *dentry) LSM_HOOK(int, 0, inode_symlink, struct inode *dir, struct dentry *dentry, const char *old_name) LSM_HOOK(int, 0, inode_mkdir, struct inode *dir, struct dentry *dentry, umode_t mode) LSM_HOOK(int, 0, inode_rmdir, struct inode *dir, struct dentry *dentry) LSM_HOOK(int, 0, inode_mknod, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) LSM_HOOK(int, 0, inode_rename, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) LSM_HOOK(int, 0, inode_readlink, struct dentry *dentry) LSM_HOOK(int, 0, inode_follow_link, struct dentry *dentry, struct inode *inode, bool rcu) LSM_HOOK(int, 0, inode_permission, struct inode *inode, int mask) LSM_HOOK(int, 0, inode_setattr, struct dentry *dentry, struct iattr *attr) LSM_HOOK(int, 0, inode_getattr, const struct path *path) LSM_HOOK(int, 0, inode_setxattr, struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) LSM_HOOK(void, LSM_RET_VOID, inode_post_setxattr, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) LSM_HOOK(int, 0, inode_getxattr, struct dentry *dentry, const char *name) LSM_HOOK(int, 0, inode_listxattr, struct dentry *dentry) LSM_HOOK(int, 0, inode_removexattr, struct mnt_idmap *idmap, struct dentry *dentry, const char *name) LSM_HOOK(int, 0, inode_set_acl, struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) LSM_HOOK(int, 0, inode_get_acl, struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) LSM_HOOK(int, 0, inode_remove_acl, struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) LSM_HOOK(int, 0, inode_need_killpriv, struct dentry *dentry) LSM_HOOK(int, 0, inode_killpriv, struct mnt_idmap *idmap, struct dentry *dentry) LSM_HOOK(int, -EOPNOTSUPP, inode_getsecurity, struct mnt_idmap *idmap, struct inode *inode, const char *name, void **buffer, bool alloc) LSM_HOOK(int, -EOPNOTSUPP, inode_setsecurity, struct inode *inode, const char *name, const void *value, size_t size, int flags) LSM_HOOK(int, 0, inode_listsecurity, struct inode *inode, char *buffer, size_t buffer_size) LSM_HOOK(void, LSM_RET_VOID, inode_getsecid, struct inode *inode, u32 *secid) LSM_HOOK(int, 0, inode_copy_up, struct dentry *src, struct cred **new) LSM_HOOK(int, -EOPNOTSUPP, inode_copy_up_xattr, const char *name) LSM_HOOK(int, 0, kernfs_init_security, struct kernfs_node *kn_dir, struct kernfs_node *kn) LSM_HOOK(int, 0, file_permission, struct file *file, int mask) LSM_HOOK(int, 0, file_alloc_security, struct file *file) LSM_HOOK(void, LSM_RET_VOID, file_free_security, struct file *file) LSM_HOOK(int, 0, file_ioctl, struct file *file, unsigned int cmd, unsigned long arg) LSM_HOOK(int, 0, file_ioctl_compat, struct file *file, unsigned int cmd, unsigned long arg) LSM_HOOK(int, 0, mmap_addr, unsigned long addr) LSM_HOOK(int, 0, mmap_file, struct file *file, unsigned long reqprot, unsigned long prot, unsigned long flags) LSM_HOOK(int, 0, file_mprotect, struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot) LSM_HOOK(int, 0, file_lock, struct file *file, unsigned int cmd) LSM_HOOK(int, 0, file_fcntl, struct file *file, unsigned int cmd, unsigned long arg) LSM_HOOK(void, LSM_RET_VOID, file_set_fowner, struct file *file) LSM_HOOK(int, 0, file_send_sigiotask, struct task_struct *tsk, struct fown_struct *fown, int sig) LSM_HOOK(int, 0, file_receive, struct file *file) LSM_HOOK(int, 0, file_open, struct file *file) LSM_HOOK(int, 0, file_truncate, struct file *file) LSM_HOOK(int, 0, task_alloc, struct task_struct *task, unsigned long clone_flags) LSM_HOOK(void, LSM_RET_VOID, task_free, struct task_struct *task) LSM_HOOK(int, 0, cred_alloc_blank, struct cred *cred, gfp_t gfp) LSM_HOOK(void, LSM_RET_VOID, cred_free, struct cred *cred) LSM_HOOK(int, 0, cred_prepare, struct cred *new, const struct cred *old, gfp_t gfp) LSM_HOOK(void, LSM_RET_VOID, cred_transfer, struct cred *new, const struct cred *old) LSM_HOOK(void, LSM_RET_VOID, cred_getsecid, const struct cred *c, u32 *secid) LSM_HOOK(int, 0, kernel_act_as, struct cred *new, u32 secid) LSM_HOOK(int, 0, kernel_create_files_as, struct cred *new, struct inode *inode) LSM_HOOK(int, 0, kernel_module_request, char *kmod_name) LSM_HOOK(int, 0, kernel_load_data, enum kernel_load_data_id id, bool contents) LSM_HOOK(int, 0, kernel_post_load_data, char *buf, loff_t size, enum kernel_load_data_id id, char *description) LSM_HOOK(int, 0, kernel_read_file, struct file *file, enum kernel_read_file_id id, bool contents) LSM_HOOK(int, 0, kernel_post_read_file, struct file *file, char *buf, loff_t size, enum kernel_read_file_id id) LSM_HOOK(int, 0, task_fix_setuid, struct cred *new, const struct cred *old, int flags) LSM_HOOK(int, 0, task_fix_setgid, struct cred *new, const struct cred * old, int flags) LSM_HOOK(int, 0, task_fix_setgroups, struct cred *new, const struct cred * old) LSM_HOOK(int, 0, task_setpgid, struct task_struct *p, pid_t pgid) LSM_HOOK(int, 0, task_getpgid, struct task_struct *p) LSM_HOOK(int, 0, task_getsid, struct task_struct *p) LSM_HOOK(void, LSM_RET_VOID, current_getsecid_subj, u32 *secid) LSM_HOOK(void, LSM_RET_VOID, task_getsecid_obj, struct task_struct *p, u32 *secid) LSM_HOOK(int, 0, task_setnice, struct task_struct *p, int nice) LSM_HOOK(int, 0, task_setioprio, struct task_struct *p, int ioprio) LSM_HOOK(int, 0, task_getioprio, struct task_struct *p) LSM_HOOK(int, 0, task_prlimit, const struct cred *cred, const struct cred *tcred, unsigned int flags) LSM_HOOK(int, 0, task_setrlimit, struct task_struct *p, unsigned int resource, struct rlimit *new_rlim) LSM_HOOK(int, 0, task_setscheduler, struct task_struct *p) LSM_HOOK(int, 0, task_getscheduler, struct task_struct *p) LSM_HOOK(int, 0, task_movememory, struct task_struct *p) LSM_HOOK(int, 0, task_kill, struct task_struct *p, struct kernel_siginfo *info, int sig, const struct cred *cred) LSM_HOOK(int, -ENOSYS, task_prctl, int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) LSM_HOOK(void, LSM_RET_VOID, task_to_inode, struct task_struct *p, struct inode *inode) LSM_HOOK(int, 0, userns_create, const struct cred *cred) LSM_HOOK(int, 0, ipc_permission, struct kern_ipc_perm *ipcp, short flag) LSM_HOOK(void, LSM_RET_VOID, ipc_getsecid, struct kern_ipc_perm *ipcp, u32 *secid) LSM_HOOK(int, 0, msg_msg_alloc_security, struct msg_msg *msg) LSM_HOOK(void, LSM_RET_VOID, msg_msg_free_security, struct msg_msg *msg) LSM_HOOK(int, 0, msg_queue_alloc_security, struct kern_ipc_perm *perm) LSM_HOOK(void, LSM_RET_VOID, msg_queue_free_security, struct kern_ipc_perm *perm) LSM_HOOK(int, 0, msg_queue_associate, struct kern_ipc_perm *perm, int msqflg) LSM_HOOK(int, 0, msg_queue_msgctl, struct kern_ipc_perm *perm, int cmd) LSM_HOOK(int, 0, msg_queue_msgsnd, struct kern_ipc_perm *perm, struct msg_msg *msg, int msqflg) LSM_HOOK(int, 0, msg_queue_msgrcv, struct kern_ipc_perm *perm, struct msg_msg *msg, struct task_struct *target, long type, int mode) LSM_HOOK(int, 0, shm_alloc_security, struct kern_ipc_perm *perm) LSM_HOOK(void, LSM_RET_VOID, shm_free_security, struct kern_ipc_perm *perm) LSM_HOOK(int, 0, shm_associate, struct kern_ipc_perm *perm, int shmflg) LSM_HOOK(int, 0, shm_shmctl, struct kern_ipc_perm *perm, int cmd) LSM_HOOK(int, 0, shm_shmat, struct kern_ipc_perm *perm, char __user *shmaddr, int shmflg) LSM_HOOK(int, 0, sem_alloc_security, struct kern_ipc_perm *perm) LSM_HOOK(void, LSM_RET_VOID, sem_free_security, struct kern_ipc_perm *perm) LSM_HOOK(int, 0, sem_associate, struct kern_ipc_perm *perm, int semflg) LSM_HOOK(int, 0, sem_semctl, struct kern_ipc_perm *perm, int cmd) LSM_HOOK(int, 0, sem_semop, struct kern_ipc_perm *perm, struct sembuf *sops, unsigned nsops, int alter) LSM_HOOK(int, 0, netlink_send, struct sock *sk, struct sk_buff *skb) LSM_HOOK(void, LSM_RET_VOID, d_instantiate, struct dentry *dentry, struct inode *inode) LSM_HOOK(int, -EOPNOTSUPP, getselfattr, unsigned int attr, struct lsm_ctx __user *ctx, size_t *size, u32 flags) LSM_HOOK(int, -EOPNOTSUPP, setselfattr, unsigned int attr, struct lsm_ctx *ctx, size_t size, u32 flags) LSM_HOOK(int, -EINVAL, getprocattr, struct task_struct *p, const char *name, char **value) LSM_HOOK(int, -EINVAL, setprocattr, const char *name, void *value, size_t size) LSM_HOOK(int, 0, ismaclabel, const char *name) LSM_HOOK(int, -EOPNOTSUPP, secid_to_secctx, u32 secid, char **secdata, u32 *seclen) LSM_HOOK(int, 0, secctx_to_secid, const char *secdata, u32 seclen, u32 *secid) LSM_HOOK(void, LSM_RET_VOID, release_secctx, char *secdata, u32 seclen) LSM_HOOK(void, LSM_RET_VOID, inode_invalidate_secctx, struct inode *inode) LSM_HOOK(int, 0, inode_notifysecctx, struct inode *inode, void *ctx, u32 ctxlen) LSM_HOOK(int, 0, inode_setsecctx, struct dentry *dentry, void *ctx, u32 ctxlen) LSM_HOOK(int, -EOPNOTSUPP, inode_getsecctx, struct inode *inode, void **ctx, u32 *ctxlen) #if defined(CONFIG_SECURITY) && defined(CONFIG_WATCH_QUEUE) LSM_HOOK(int, 0, post_notification, const struct cred *w_cred, const struct cred *cred, struct watch_notification *n) #endif /* CONFIG_SECURITY && CONFIG_WATCH_QUEUE */ #if defined(CONFIG_SECURITY) && defined(CONFIG_KEY_NOTIFICATIONS) LSM_HOOK(int, 0, watch_key, struct key *key) #endif /* CONFIG_SECURITY && CONFIG_KEY_NOTIFICATIONS */ #ifdef CONFIG_SECURITY_NETWORK LSM_HOOK(int, 0, unix_stream_connect, struct sock *sock, struct sock *other, struct sock *newsk) LSM_HOOK(int, 0, unix_may_send, struct socket *sock, struct socket *other) LSM_HOOK(int, 0, socket_create, int family, int type, int protocol, int kern) LSM_HOOK(int, 0, socket_post_create, struct socket *sock, int family, int type, int protocol, int kern) LSM_HOOK(int, 0, socket_socketpair, struct socket *socka, struct socket *sockb) LSM_HOOK(int, 0, socket_bind, struct socket *sock, struct sockaddr *address, int addrlen) LSM_HOOK(int, 0, socket_connect, struct socket *sock, struct sockaddr *address, int addrlen) LSM_HOOK(int, 0, socket_listen, struct socket *sock, int backlog) LSM_HOOK(int, 0, socket_accept, struct socket *sock, struct socket *newsock) LSM_HOOK(int, 0, socket_sendmsg, struct socket *sock, struct msghdr *msg, int size) LSM_HOOK(int, 0, socket_recvmsg, struct socket *sock, struct msghdr *msg, int size, int flags) LSM_HOOK(int, 0, socket_getsockname, struct socket *sock) LSM_HOOK(int, 0, socket_getpeername, struct socket *sock) LSM_HOOK(int, 0, socket_getsockopt, struct socket *sock, int level, int optname) LSM_HOOK(int, 0, socket_setsockopt, struct socket *sock, int level, int optname) LSM_HOOK(int, 0, socket_shutdown, struct socket *sock, int how) LSM_HOOK(int, 0, socket_sock_rcv_skb, struct sock *sk, struct sk_buff *skb) LSM_HOOK(int, -ENOPROTOOPT, socket_getpeersec_stream, struct socket *sock, sockptr_t optval, sockptr_t optlen, unsigned int len) LSM_HOOK(int, -ENOPROTOOPT, socket_getpeersec_dgram, struct socket *sock, struct sk_buff *skb, u32 *secid) LSM_HOOK(int, 0, sk_alloc_security, struct sock *sk, int family, gfp_t priority) LSM_HOOK(void, LSM_RET_VOID, sk_free_security, struct sock *sk) LSM_HOOK(void, LSM_RET_VOID, sk_clone_security, const struct sock *sk, struct sock *newsk) LSM_HOOK(void, LSM_RET_VOID, sk_getsecid, const struct sock *sk, u32 *secid) LSM_HOOK(void, LSM_RET_VOID, sock_graft, struct sock *sk, struct socket *parent) LSM_HOOK(int, 0, inet_conn_request, const struct sock *sk, struct sk_buff *skb, struct request_sock *req) LSM_HOOK(void, LSM_RET_VOID, inet_csk_clone, struct sock *newsk, const struct request_sock *req) LSM_HOOK(void, LSM_RET_VOID, inet_conn_established, struct sock *sk, struct sk_buff *skb) LSM_HOOK(int, 0, secmark_relabel_packet, u32 secid) LSM_HOOK(void, LSM_RET_VOID, secmark_refcount_inc, void) LSM_HOOK(void, LSM_RET_VOID, secmark_refcount_dec, void) LSM_HOOK(void, LSM_RET_VOID, req_classify_flow, const struct request_sock *req, struct flowi_common *flic) LSM_HOOK(int, 0, tun_dev_alloc_security, void **security) LSM_HOOK(void, LSM_RET_VOID, tun_dev_free_security, void *security) LSM_HOOK(int, 0, tun_dev_create, void) LSM_HOOK(int, 0, tun_dev_attach_queue, void *security) LSM_HOOK(int, 0, tun_dev_attach, struct sock *sk, void *security) LSM_HOOK(int, 0, tun_dev_open, void *security) LSM_HOOK(int, 0, sctp_assoc_request, struct sctp_association *asoc, struct sk_buff *skb) LSM_HOOK(int, 0, sctp_bind_connect, struct sock *sk, int optname, struct sockaddr *address, int addrlen) LSM_HOOK(void, LSM_RET_VOID, sctp_sk_clone, struct sctp_association *asoc, struct sock *sk, struct sock *newsk) LSM_HOOK(int, 0, sctp_assoc_established, struct sctp_association *asoc, struct sk_buff *skb) LSM_HOOK(int, 0, mptcp_add_subflow, struct sock *sk, struct sock *ssk) #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_INFINIBAND LSM_HOOK(int, 0, ib_pkey_access, void *sec, u64 subnet_prefix, u16 pkey) LSM_HOOK(int, 0, ib_endport_manage_subnet, void *sec, const char *dev_name, u8 port_num) LSM_HOOK(int, 0, ib_alloc_security, void **sec) LSM_HOOK(void, LSM_RET_VOID, ib_free_security, void *sec) #endif /* CONFIG_SECURITY_INFINIBAND */ #ifdef CONFIG_SECURITY_NETWORK_XFRM LSM_HOOK(int, 0, xfrm_policy_alloc_security, struct xfrm_sec_ctx **ctxp, struct xfrm_user_sec_ctx *sec_ctx, gfp_t gfp) LSM_HOOK(int, 0, xfrm_policy_clone_security, struct xfrm_sec_ctx *old_ctx, struct xfrm_sec_ctx **new_ctx) LSM_HOOK(void, LSM_RET_VOID, xfrm_policy_free_security, struct xfrm_sec_ctx *ctx) LSM_HOOK(int, 0, xfrm_policy_delete_security, struct xfrm_sec_ctx *ctx) LSM_HOOK(int, 0, xfrm_state_alloc, struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx) LSM_HOOK(int, 0, xfrm_state_alloc_acquire, struct xfrm_state *x, struct xfrm_sec_ctx *polsec, u32 secid) LSM_HOOK(void, LSM_RET_VOID, xfrm_state_free_security, struct xfrm_state *x) LSM_HOOK(int, 0, xfrm_state_delete_security, struct xfrm_state *x) LSM_HOOK(int, 0, xfrm_policy_lookup, struct xfrm_sec_ctx *ctx, u32 fl_secid) LSM_HOOK(int, 1, xfrm_state_pol_flow_match, struct xfrm_state *x, struct xfrm_policy *xp, const struct flowi_common *flic) LSM_HOOK(int, 0, xfrm_decode_session, struct sk_buff *skb, u32 *secid, int ckall) #endif /* CONFIG_SECURITY_NETWORK_XFRM */ /* key management security hooks */ #ifdef CONFIG_KEYS LSM_HOOK(int, 0, key_alloc, struct key *key, const struct cred *cred, unsigned long flags) LSM_HOOK(void, LSM_RET_VOID, key_free, struct key *key) LSM_HOOK(int, 0, key_permission, key_ref_t key_ref, const struct cred *cred, enum key_need_perm need_perm) LSM_HOOK(int, 0, key_getsecurity, struct key *key, char **buffer) #endif /* CONFIG_KEYS */ #ifdef CONFIG_AUDIT LSM_HOOK(int, 0, audit_rule_init, u32 field, u32 op, char *rulestr, void **lsmrule) LSM_HOOK(int, 0, audit_rule_known, struct audit_krule *krule) LSM_HOOK(int, 0, audit_rule_match, u32 secid, u32 field, u32 op, void *lsmrule) LSM_HOOK(void, LSM_RET_VOID, audit_rule_free, void *lsmrule) #endif /* CONFIG_AUDIT */ #ifdef CONFIG_BPF_SYSCALL LSM_HOOK(int, 0, bpf, int cmd, union bpf_attr *attr, unsigned int size) LSM_HOOK(int, 0, bpf_map, struct bpf_map *map, fmode_t fmode) LSM_HOOK(int, 0, bpf_prog, struct bpf_prog *prog) LSM_HOOK(int, 0, bpf_map_create, struct bpf_map *map, union bpf_attr *attr, struct bpf_token *token) LSM_HOOK(void, LSM_RET_VOID, bpf_map_free, struct bpf_map *map) LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr, struct bpf_token *token) LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog) LSM_HOOK(int, 0, bpf_token_create, struct bpf_token *token, union bpf_attr *attr, struct path *path) LSM_HOOK(void, LSM_RET_VOID, bpf_token_free, struct bpf_token *token) LSM_HOOK(int, 0, bpf_token_cmd, const struct bpf_token *token, enum bpf_cmd cmd) LSM_HOOK(int, 0, bpf_token_capable, const struct bpf_token *token, int cap) #endif /* CONFIG_BPF_SYSCALL */ LSM_HOOK(int, 0, locked_down, enum lockdown_reason what) #ifdef CONFIG_PERF_EVENTS LSM_HOOK(int, 0, perf_event_open, struct perf_event_attr *attr, int type) LSM_HOOK(int, 0, perf_event_alloc, struct perf_event *event) LSM_HOOK(void, LSM_RET_VOID, perf_event_free, struct perf_event *event) LSM_HOOK(int, 0, perf_event_read, struct perf_event *event) LSM_HOOK(int, 0, perf_event_write, struct perf_event *event) #endif /* CONFIG_PERF_EVENTS */ #ifdef CONFIG_IO_URING LSM_HOOK(int, 0, uring_override_creds, const struct cred *new) LSM_HOOK(int, 0, uring_sqpoll, void) LSM_HOOK(int, 0, uring_cmd, struct io_uring_cmd *ioucmd) #endif /* CONFIG_IO_URING */
2 2 2 2 2 2 2 100 100 2 100 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 // SPDX-License-Identifier: GPL-2.0-only /* * Landlock LSM - Ptrace hooks * * Copyright © 2017-2020 Mickaël Salaün <mic@digikod.net> * Copyright © 2019-2020 ANSSI */ #include <asm/current.h> #include <linux/cred.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/lsm_hooks.h> #include <linux/rcupdate.h> #include <linux/sched.h> #include "common.h" #include "cred.h" #include "ptrace.h" #include "ruleset.h" #include "setup.h" /** * domain_scope_le - Checks domain ordering for scoped ptrace * * @parent: Parent domain. * @child: Potential child of @parent. * * Checks if the @parent domain is less or equal to (i.e. an ancestor, which * means a subset of) the @child domain. */ static bool domain_scope_le(const struct landlock_ruleset *const parent, const struct landlock_ruleset *const child) { const struct landlock_hierarchy *walker; if (!parent) return true; if (!child) return false; for (walker = child->hierarchy; walker; walker = walker->parent) { if (walker == parent->hierarchy) /* @parent is in the scoped hierarchy of @child. */ return true; } /* There is no relationship between @parent and @child. */ return false; } static bool task_is_scoped(const struct task_struct *const parent, const struct task_struct *const child) { bool is_scoped; const struct landlock_ruleset *dom_parent, *dom_child; rcu_read_lock(); dom_parent = landlock_get_task_domain(parent); dom_child = landlock_get_task_domain(child); is_scoped = domain_scope_le(dom_parent, dom_child); rcu_read_unlock(); return is_scoped; } static int task_ptrace(const struct task_struct *const parent, const struct task_struct *const child) { /* Quick return for non-landlocked tasks. */ if (!landlocked(parent)) return 0; if (task_is_scoped(parent, child)) return 0; return -EPERM; } /** * hook_ptrace_access_check - Determines whether the current process may access * another * * @child: Process to be accessed. * @mode: Mode of attachment. * * If the current task has Landlock rules, then the child must have at least * the same rules. Else denied. * * Determines whether a process may access another, returning 0 if permission * granted, -errno if denied. */ static int hook_ptrace_access_check(struct task_struct *const child, const unsigned int mode) { return task_ptrace(current, child); } /** * hook_ptrace_traceme - Determines whether another process may trace the * current one * * @parent: Task proposed to be the tracer. * * If the parent has Landlock rules, then the current task must have the same * or more rules. Else denied. * * Determines whether the nominated task is permitted to trace the current * process, returning 0 if permission is granted, -errno if denied. */ static int hook_ptrace_traceme(struct task_struct *const parent) { return task_ptrace(parent, current); } static struct security_hook_list landlock_hooks[] __ro_after_init = { LSM_HOOK_INIT(ptrace_access_check, hook_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, hook_ptrace_traceme), }; __init void landlock_add_ptrace_hooks(void) { security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks), &landlock_lsmid); }
1 40 40 41 112 78 78 434 435 347 11 6 5 4 7 28 13 2 11 2 13 18 3 2 2 3 8 1067 3 3 141 5 137 138 137 138 148 7 5 102 44 889 890 889 881 17 321 322 321 115 3 7 31 3 15 12 6 6 6 6 25 6 16 5 12 16 18 18 18 7 1 3 4 346 6 65 274 100 96 95 96 3 87 13 4 3 3 3 4 1 1 1 38 1 1 23 10 4 1 3 4 26 6 26 15 12 4 1 3 1 1 2080 2456 2085 2453 226 226 485 25 8 35 418 8 7 4 7 7 6 1 5 6 7 2 4 7 4 4 5 2 2 3 380 70 5 3 3 3 2 22 14 4 1 6 3 2 2 2 10 30 17 4 19 2 86 2 7 1 11 2 16 3 3 1 2 5 2 3 2 2 4 3 2 4 2 2 14 10 3 2 6 2 1 2 7 1 2 2 1 377 388 5 5 31 27 2 1 234 2 229 13 2 2 2 2 2 1 2 2 2 2 2 2 2 2 1 2 2 4 4 4 4 2 2 2 2 27 3 1 1 3 1 1 20 15 2 1 2 2 2 2 1 2 2 5 2 2 2 1 2 1 2 1 3 3 2 1 1 2 2 2 65 118 2846 1458 1468 2843 1520 1773 2846 428 647 2846 1466 1455 1064 1066 1064 1060 3 1062 1066 1067 1066 515 562 1066 1 197 1064 1240 679 577 1239 1 1239 1615 1852 2845 2845 2846 2847 2843 321 575 894 919 459 459 13 908 904 17 26 894 895 3501 678 1949 954 950 955 953 953 1 1951 2609 464 464 3570 3563 3569 6 3572 3572 195 60 196 161 12822 12818 158 158 1694 1697 2 154 154 154 119 119 2 118 20 33 33 1484 13 1472 260 786 787 786 257 260 71 51 10 37 3 1899 1900 1901 8 78 6 11 71 3 1890 36 24 7 1 4 4 3 6 4 1 25 19 5 11 10 476 423 1363 1275 73 343 251 1295 523 524 736 734 734 1200 1161 1091 1091 555 95 94 133 133 132 122 860 860 860 858 860 859 860 734 736 736 735 694 1 4 3 29 28 11 2 2 2 3 2 1 29 21 9 937 936 934 643 334 936 433 433 433 390 100 432 13917 13945 13901 13913 12242 1971 13905 654 652 6 654 638 20 655 22 951 944 19 577 121 1 117 3 860 482 739 425 374 213 2845 2185 675 1457 1468 2840 2179 672 22335 22197 730 22246 22075 1075 18723 4117 22239 22251 292 286 9 7 7 5 1 2 3 7 5 109 13 8 3 5 1 7 1838 92 25 70 3598 343 20 323 344 342 62 62 2 9 9 1 1 350 6 1 2 3 2 7 7 7 7 7 5 7 7 7 7 7 7 349 9 4 8 2 296 52 371 296 29 44 277 20 54 347 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Generic socket support routines. Memory allocators, socket lock/release * handler for protocols to use and generic option handler. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Florian La Roche, <flla@stud.uni-sb.de> * Alan Cox, <A.Cox@swansea.ac.uk> * * Fixes: * Alan Cox : Numerous verify_area() problems * Alan Cox : Connecting on a connecting socket * now returns an error for tcp. * Alan Cox : sock->protocol is set correctly. * and is not sometimes left as 0. * Alan Cox : connect handles icmp errors on a * connect properly. Unfortunately there * is a restart syscall nasty there. I * can't match BSD without hacking the C * library. Ideas urgently sought! * Alan Cox : Disallow bind() to addresses that are * not ours - especially broadcast ones!! * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost) * Alan Cox : sock_wfree/sock_rfree don't destroy sockets, * instead they leave that for the DESTROY timer. * Alan Cox : Clean up error flag in accept * Alan Cox : TCP ack handling is buggy, the DESTROY timer * was buggy. Put a remove_sock() in the handler * for memory when we hit 0. Also altered the timer * code. The ACK stuff can wait and needs major * TCP layer surgery. * Alan Cox : Fixed TCP ack bug, removed remove sock * and fixed timer/inet_bh race. * Alan Cox : Added zapped flag for TCP * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing. * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so... * Rick Sladkey : Relaxed UDP rules for matching packets. * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support * Pauline Middelink : identd support * Alan Cox : Fixed connect() taking signals I think. * Alan Cox : SO_LINGER supported * Alan Cox : Error reporting fixes * Anonymous : inet_create tidied up (sk->reuse setting) * Alan Cox : inet sockets don't set sk->type! * Alan Cox : Split socket option code * Alan Cox : Callbacks * Alan Cox : Nagle flag for Charles & Johannes stuff * Alex : Removed restriction on inet fioctl * Alan Cox : Splitting INET from NET core * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt() * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code * Alan Cox : Split IP from generic code * Alan Cox : New kfree_skbmem() * Alan Cox : Make SO_DEBUG superuser only. * Alan Cox : Allow anyone to clear SO_DEBUG * (compatibility fix) * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput. * Alan Cox : Allocator for a socket is settable. * Alan Cox : SO_ERROR includes soft errors. * Alan Cox : Allow NULL arguments on some SO_ opts * Alan Cox : Generic socket allocation to make hooks * easier (suggested by Craig Metz). * Michael Pall : SO_ERROR returns positive errno again * Steve Whitehouse: Added default destructor to free * protocol private data. * Steve Whitehouse: Added various other default routines * common to several socket families. * Chris Evans : Call suser() check last on F_SETOWN * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER. * Andi Kleen : Add sock_kmalloc()/sock_kfree_s() * Andi Kleen : Fix write_space callback * Chris Evans : Security fixes - signedness again * Arnaldo C. Melo : cleanups, use skb_queue_purge * * To Fix: */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <asm/unaligned.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/errqueue.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/interrupt.h> #include <linux/poll.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/init.h> #include <linux/highmem.h> #include <linux/user_namespace.h> #include <linux/static_key.h> #include <linux/memcontrol.h> #include <linux/prefetch.h> #include <linux/compat.h> #include <linux/mroute.h> #include <linux/mroute6.h> #include <linux/icmpv6.h> #include <linux/uaccess.h> #include <linux/netdevice.h> #include <net/protocol.h> #include <linux/skbuff.h> #include <net/net_namespace.h> #include <net/request_sock.h> #include <net/sock.h> #include <linux/net_tstamp.h> #include <net/xfrm.h> #include <linux/ipsec.h> #include <net/cls_cgroup.h> #include <net/netprio_cgroup.h> #include <linux/sock_diag.h> #include <linux/filter.h> #include <net/sock_reuseport.h> #include <net/bpf_sk_storage.h> #include <trace/events/sock.h> #include <net/tcp.h> #include <net/busy_poll.h> #include <net/phonet/phonet.h> #include <linux/ethtool.h> #include "dev.h" static DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list); static void sock_def_write_space_wfree(struct sock *sk); static void sock_def_write_space(struct sock *sk); /** * sk_ns_capable - General socket capability test * @sk: Socket to use a capability on or through * @user_ns: The user namespace of the capability to use * @cap: The capability to use * * Test to see if the opener of the socket had when the socket was * created and the current process has the capability @cap in the user * namespace @user_ns. */ bool sk_ns_capable(const struct sock *sk, struct user_namespace *user_ns, int cap) { return file_ns_capable(sk->sk_socket->file, user_ns, cap) && ns_capable(user_ns, cap); } EXPORT_SYMBOL(sk_ns_capable); /** * sk_capable - Socket global capability test * @sk: Socket to use a capability on or through * @cap: The global capability to use * * Test to see if the opener of the socket had when the socket was * created and the current process has the capability @cap in all user * namespaces. */ bool sk_capable(const struct sock *sk, int cap) { return sk_ns_capable(sk, &init_user_ns, cap); } EXPORT_SYMBOL(sk_capable); /** * sk_net_capable - Network namespace socket capability test * @sk: Socket to use a capability on or through * @cap: The capability to use * * Test to see if the opener of the socket had when the socket was created * and the current process has the capability @cap over the network namespace * the socket is a member of. */ bool sk_net_capable(const struct sock *sk, int cap) { return sk_ns_capable(sk, sock_net(sk)->user_ns, cap); } EXPORT_SYMBOL(sk_net_capable); /* * Each address family might have different locking rules, so we have * one slock key per address family and separate keys for internal and * userspace sockets. */ static struct lock_class_key af_family_keys[AF_MAX]; static struct lock_class_key af_family_kern_keys[AF_MAX]; static struct lock_class_key af_family_slock_keys[AF_MAX]; static struct lock_class_key af_family_kern_slock_keys[AF_MAX]; /* * Make lock validator output more readable. (we pre-construct these * strings build-time, so that runtime initialization of socket * locks is fast): */ #define _sock_locks(x) \ x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \ x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \ x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \ x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \ x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \ x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \ x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \ x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \ x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \ x "27" , x "28" , x "AF_CAN" , \ x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \ x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \ x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \ x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \ x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \ x "AF_MCTP" , \ x "AF_MAX" static const char *const af_family_key_strings[AF_MAX+1] = { _sock_locks("sk_lock-") }; static const char *const af_family_slock_key_strings[AF_MAX+1] = { _sock_locks("slock-") }; static const char *const af_family_clock_key_strings[AF_MAX+1] = { _sock_locks("clock-") }; static const char *const af_family_kern_key_strings[AF_MAX+1] = { _sock_locks("k-sk_lock-") }; static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = { _sock_locks("k-slock-") }; static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = { _sock_locks("k-clock-") }; static const char *const af_family_rlock_key_strings[AF_MAX+1] = { _sock_locks("rlock-") }; static const char *const af_family_wlock_key_strings[AF_MAX+1] = { _sock_locks("wlock-") }; static const char *const af_family_elock_key_strings[AF_MAX+1] = { _sock_locks("elock-") }; /* * sk_callback_lock and sk queues locking rules are per-address-family, * so split the lock classes by using a per-AF key: */ static struct lock_class_key af_callback_keys[AF_MAX]; static struct lock_class_key af_rlock_keys[AF_MAX]; static struct lock_class_key af_wlock_keys[AF_MAX]; static struct lock_class_key af_elock_keys[AF_MAX]; static struct lock_class_key af_kern_callback_keys[AF_MAX]; /* Run time adjustable parameters. */ __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; EXPORT_SYMBOL(sysctl_wmem_max); __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; EXPORT_SYMBOL(sysctl_rmem_max); __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; int sysctl_tstamp_allow_data __read_mostly = 1; DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); EXPORT_SYMBOL_GPL(memalloc_socks_key); /** * sk_set_memalloc - sets %SOCK_MEMALLOC * @sk: socket to set it on * * Set %SOCK_MEMALLOC on a socket for access to emergency reserves. * It's the responsibility of the admin to adjust min_free_kbytes * to meet the requirements */ void sk_set_memalloc(struct sock *sk) { sock_set_flag(sk, SOCK_MEMALLOC); sk->sk_allocation |= __GFP_MEMALLOC; static_branch_inc(&memalloc_socks_key); } EXPORT_SYMBOL_GPL(sk_set_memalloc); void sk_clear_memalloc(struct sock *sk) { sock_reset_flag(sk, SOCK_MEMALLOC); sk->sk_allocation &= ~__GFP_MEMALLOC; static_branch_dec(&memalloc_socks_key); /* * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward * progress of swapping. SOCK_MEMALLOC may be cleared while * it has rmem allocations due to the last swapfile being deactivated * but there is a risk that the socket is unusable due to exceeding * the rmem limits. Reclaim the reserves and obey rmem limits again. */ sk_mem_reclaim(sk); } EXPORT_SYMBOL_GPL(sk_clear_memalloc); int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) { int ret; unsigned int noreclaim_flag; /* these should have been dropped before queueing */ BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); noreclaim_flag = memalloc_noreclaim_save(); ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv, tcp_v6_do_rcv, tcp_v4_do_rcv, sk, skb); memalloc_noreclaim_restore(noreclaim_flag); return ret; } EXPORT_SYMBOL(__sk_backlog_rcv); void sk_error_report(struct sock *sk) { sk->sk_error_report(sk); switch (sk->sk_family) { case AF_INET: fallthrough; case AF_INET6: trace_inet_sk_error_report(sk); break; default: break; } } EXPORT_SYMBOL(sk_error_report); int sock_get_timeout(long timeo, void *optval, bool old_timeval) { struct __kernel_sock_timeval tv; if (timeo == MAX_SCHEDULE_TIMEOUT) { tv.tv_sec = 0; tv.tv_usec = 0; } else { tv.tv_sec = timeo / HZ; tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ; } if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec }; *(struct old_timeval32 *)optval = tv32; return sizeof(tv32); } if (old_timeval) { struct __kernel_old_timeval old_tv; old_tv.tv_sec = tv.tv_sec; old_tv.tv_usec = tv.tv_usec; *(struct __kernel_old_timeval *)optval = old_tv; return sizeof(old_tv); } *(struct __kernel_sock_timeval *)optval = tv; return sizeof(tv); } EXPORT_SYMBOL(sock_get_timeout); int sock_copy_user_timeval(struct __kernel_sock_timeval *tv, sockptr_t optval, int optlen, bool old_timeval) { if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { struct old_timeval32 tv32; if (optlen < sizeof(tv32)) return -EINVAL; if (copy_from_sockptr(&tv32, optval, sizeof(tv32))) return -EFAULT; tv->tv_sec = tv32.tv_sec; tv->tv_usec = tv32.tv_usec; } else if (old_timeval) { struct __kernel_old_timeval old_tv; if (optlen < sizeof(old_tv)) return -EINVAL; if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv))) return -EFAULT; tv->tv_sec = old_tv.tv_sec; tv->tv_usec = old_tv.tv_usec; } else { if (optlen < sizeof(*tv)) return -EINVAL; if (copy_from_sockptr(tv, optval, sizeof(*tv))) return -EFAULT; } return 0; } EXPORT_SYMBOL(sock_copy_user_timeval); static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, bool old_timeval) { struct __kernel_sock_timeval tv; int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval); long val; if (err) return err; if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC) return -EDOM; if (tv.tv_sec < 0) { static int warned __read_mostly; WRITE_ONCE(*timeo_p, 0); if (warned < 10 && net_ratelimit()) { warned++; pr_info("%s: `%s' (pid %d) tries to set negative timeout\n", __func__, current->comm, task_pid_nr(current)); } return 0; } val = MAX_SCHEDULE_TIMEOUT; if ((tv.tv_sec || tv.tv_usec) && (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))) val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ); WRITE_ONCE(*timeo_p, val); return 0; } static bool sock_needs_netstamp(const struct sock *sk) { switch (sk->sk_family) { case AF_UNSPEC: case AF_UNIX: return false; default: return true; } } static void sock_disable_timestamp(struct sock *sk, unsigned long flags) { if (sk->sk_flags & flags) { sk->sk_flags &= ~flags; if (sock_needs_netstamp(sk) && !(sk->sk_flags & SK_FLAGS_TIMESTAMP)) net_disable_timestamp(); } } int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { unsigned long flags; struct sk_buff_head *list = &sk->sk_receive_queue; if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) { atomic_inc(&sk->sk_drops); trace_sock_rcvqueue_full(sk, skb); return -ENOMEM; } if (!sk_rmem_schedule(sk, skb, skb->truesize)) { atomic_inc(&sk->sk_drops); return -ENOBUFS; } skb->dev = NULL; skb_set_owner_r(skb, sk); /* we escape from rcu protected region, make sure we dont leak * a norefcounted dst */ skb_dst_force(skb); spin_lock_irqsave(&list->lock, flags); sock_skb_set_dropcount(sk, skb); __skb_queue_tail(list, skb); spin_unlock_irqrestore(&list->lock, flags); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk); return 0; } EXPORT_SYMBOL(__sock_queue_rcv_skb); int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason) { enum skb_drop_reason drop_reason; int err; err = sk_filter(sk, skb); if (err) { drop_reason = SKB_DROP_REASON_SOCKET_FILTER; goto out; } err = __sock_queue_rcv_skb(sk, skb); switch (err) { case -ENOMEM: drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF; break; case -ENOBUFS: drop_reason = SKB_DROP_REASON_PROTO_MEM; break; default: drop_reason = SKB_NOT_DROPPED_YET; break; } out: if (reason) *reason = drop_reason; return err; } EXPORT_SYMBOL(sock_queue_rcv_skb_reason); int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested, unsigned int trim_cap, bool refcounted) { int rc = NET_RX_SUCCESS; if (sk_filter_trim_cap(sk, skb, trim_cap)) goto discard_and_relse; skb->dev = NULL; if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { atomic_inc(&sk->sk_drops); goto discard_and_relse; } if (nested) bh_lock_sock_nested(sk); else bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { /* * trylock + unlock semantics: */ mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_); rc = sk_backlog_rcv(sk, skb); mutex_release(&sk->sk_lock.dep_map, _RET_IP_); } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) { bh_unlock_sock(sk); atomic_inc(&sk->sk_drops); goto discard_and_relse; } bh_unlock_sock(sk); out: if (refcounted) sock_put(sk); return rc; discard_and_relse: kfree_skb(skb); goto out; } EXPORT_SYMBOL(__sk_receive_skb); INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *, u32)); INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, u32)); struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = __sk_dst_get(sk); if (dst && dst->obsolete && INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, dst, cookie) == NULL) { sk_tx_queue_clear(sk); WRITE_ONCE(sk->sk_dst_pending_confirm, 0); RCU_INIT_POINTER(sk->sk_dst_cache, NULL); dst_release(dst); return NULL; } return dst; } EXPORT_SYMBOL(__sk_dst_check); struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = sk_dst_get(sk); if (dst && dst->obsolete && INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, dst, cookie) == NULL) { sk_dst_reset(sk); dst_release(dst); return NULL; } return dst; } EXPORT_SYMBOL(sk_dst_check); static int sock_bindtoindex_locked(struct sock *sk, int ifindex) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES struct net *net = sock_net(sk); /* Sorry... */ ret = -EPERM; if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW)) goto out; ret = -EINVAL; if (ifindex < 0) goto out; /* Paired with all READ_ONCE() done locklessly. */ WRITE_ONCE(sk->sk_bound_dev_if, ifindex); if (sk->sk_prot->rehash) sk->sk_prot->rehash(sk); sk_dst_reset(sk); ret = 0; out: #endif return ret; } int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk) { int ret; if (lock_sk) lock_sock(sk); ret = sock_bindtoindex_locked(sk, ifindex); if (lock_sk) release_sock(sk); return ret; } EXPORT_SYMBOL(sock_bindtoindex); static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES struct net *net = sock_net(sk); char devname[IFNAMSIZ]; int index; ret = -EINVAL; if (optlen < 0) goto out; /* Bind this socket to a particular device like "eth0", * as specified in the passed interface name. If the * name is "" or the option length is zero the socket * is not bound. */ if (optlen > IFNAMSIZ - 1) optlen = IFNAMSIZ - 1; memset(devname, 0, sizeof(devname)); ret = -EFAULT; if (copy_from_sockptr(devname, optval, optlen)) goto out; index = 0; if (devname[0] != '\0') { struct net_device *dev; rcu_read_lock(); dev = dev_get_by_name_rcu(net, devname); if (dev) index = dev->ifindex; rcu_read_unlock(); ret = -ENODEV; if (!dev) goto out; } sockopt_lock_sock(sk); ret = sock_bindtoindex_locked(sk, index); sockopt_release_sock(sk); out: #endif return ret; } static int sock_getbindtodevice(struct sock *sk, sockptr_t optval, sockptr_t optlen, int len) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); struct net *net = sock_net(sk); char devname[IFNAMSIZ]; if (bound_dev_if == 0) { len = 0; goto zero; } ret = -EINVAL; if (len < IFNAMSIZ) goto out; ret = netdev_get_name(net, devname, bound_dev_if); if (ret) goto out; len = strlen(devname) + 1; ret = -EFAULT; if (copy_to_sockptr(optval, devname, len)) goto out; zero: ret = -EFAULT; if (copy_to_sockptr(optlen, &len, sizeof(int))) goto out; ret = 0; out: #endif return ret; } bool sk_mc_loop(const struct sock *sk) { if (dev_recursion_level()) return false; if (!sk) return true; /* IPV6_ADDRFORM can change sk->sk_family under us. */ switch (READ_ONCE(sk->sk_family)) { case AF_INET: return inet_test_bit(MC_LOOP, sk); #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: return inet6_test_bit(MC6_LOOP, sk); #endif } WARN_ON_ONCE(1); return true; } EXPORT_SYMBOL(sk_mc_loop); void sock_set_reuseaddr(struct sock *sk) { lock_sock(sk); sk->sk_reuse = SK_CAN_REUSE; release_sock(sk); } EXPORT_SYMBOL(sock_set_reuseaddr); void sock_set_reuseport(struct sock *sk) { lock_sock(sk); sk->sk_reuseport = true; release_sock(sk); } EXPORT_SYMBOL(sock_set_reuseport); void sock_no_linger(struct sock *sk) { lock_sock(sk); WRITE_ONCE(sk->sk_lingertime, 0); sock_set_flag(sk, SOCK_LINGER); release_sock(sk); } EXPORT_SYMBOL(sock_no_linger); void sock_set_priority(struct sock *sk, u32 priority) { WRITE_ONCE(sk->sk_priority, priority); } EXPORT_SYMBOL(sock_set_priority); void sock_set_sndtimeo(struct sock *sk, s64 secs) { lock_sock(sk); if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) WRITE_ONCE(sk->sk_sndtimeo, secs * HZ); else WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT); release_sock(sk); } EXPORT_SYMBOL(sock_set_sndtimeo); static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) { if (val) { sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new); sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns); sock_set_flag(sk, SOCK_RCVTSTAMP); sock_enable_timestamp(sk, SOCK_TIMESTAMP); } else { sock_reset_flag(sk, SOCK_RCVTSTAMP); sock_reset_flag(sk, SOCK_RCVTSTAMPNS); } } void sock_enable_timestamps(struct sock *sk) { lock_sock(sk); __sock_set_timestamps(sk, true, false, true); release_sock(sk); } EXPORT_SYMBOL(sock_enable_timestamps); void sock_set_timestamp(struct sock *sk, int optname, bool valbool) { switch (optname) { case SO_TIMESTAMP_OLD: __sock_set_timestamps(sk, valbool, false, false); break; case SO_TIMESTAMP_NEW: __sock_set_timestamps(sk, valbool, true, false); break; case SO_TIMESTAMPNS_OLD: __sock_set_timestamps(sk, valbool, false, true); break; case SO_TIMESTAMPNS_NEW: __sock_set_timestamps(sk, valbool, true, true); break; } } static int sock_timestamping_bind_phc(struct sock *sk, int phc_index) { struct net *net = sock_net(sk); struct net_device *dev = NULL; bool match = false; int *vclock_index; int i, num; if (sk->sk_bound_dev_if) dev = dev_get_by_index(net, sk->sk_bound_dev_if); if (!dev) { pr_err("%s: sock not bind to device\n", __func__); return -EOPNOTSUPP; } num = ethtool_get_phc_vclocks(dev, &vclock_index); dev_put(dev); for (i = 0; i < num; i++) { if (*(vclock_index + i) == phc_index) { match = true; break; } } if (num > 0) kfree(vclock_index); if (!match) return -EINVAL; WRITE_ONCE(sk->sk_bind_phc, phc_index); return 0; } int sock_set_timestamping(struct sock *sk, int optname, struct so_timestamping timestamping) { int val = timestamping.flags; int ret; if (val & ~SOF_TIMESTAMPING_MASK) return -EINVAL; if (val & SOF_TIMESTAMPING_OPT_ID_TCP && !(val & SOF_TIMESTAMPING_OPT_ID)) return -EINVAL; if (val & SOF_TIMESTAMPING_OPT_ID && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { if (sk_is_tcp(sk)) { if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) return -EINVAL; if (val & SOF_TIMESTAMPING_OPT_ID_TCP) atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq); else atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); } else { atomic_set(&sk->sk_tskey, 0); } } if (val & SOF_TIMESTAMPING_OPT_STATS && !(val & SOF_TIMESTAMPING_OPT_TSONLY)) return -EINVAL; if (val & SOF_TIMESTAMPING_BIND_PHC) { ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc); if (ret) return ret; } WRITE_ONCE(sk->sk_tsflags, val); sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); if (val & SOF_TIMESTAMPING_RX_SOFTWARE) sock_enable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE); else sock_disable_timestamp(sk, (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); return 0; } void sock_set_keepalive(struct sock *sk) { lock_sock(sk); if (sk->sk_prot->keepalive) sk->sk_prot->keepalive(sk, true); sock_valbool_flag(sk, SOCK_KEEPOPEN, true); release_sock(sk); } EXPORT_SYMBOL(sock_set_keepalive); static void __sock_set_rcvbuf(struct sock *sk, int val) { /* Ensure val * 2 fits into an int, to prevent max_t() from treating it * as a negative value. */ val = min_t(int, val, INT_MAX / 2); sk->sk_userlocks |= SOCK_RCVBUF_LOCK; /* We double it on the way in to account for "struct sk_buff" etc. * overhead. Applications assume that the SO_RCVBUF setting they make * will allow that much actual data to be received on that socket. * * Applications are unaware that "struct sk_buff" and other overheads * allocate from the receive buffer during socket buffer allocation. * * And after considering the possible alternatives, returning the value * we actually used in getsockopt is the most desirable behavior. */ WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); } void sock_set_rcvbuf(struct sock *sk, int val) { lock_sock(sk); __sock_set_rcvbuf(sk, val); release_sock(sk); } EXPORT_SYMBOL(sock_set_rcvbuf); static void __sock_set_mark(struct sock *sk, u32 val) { if (val != sk->sk_mark) { WRITE_ONCE(sk->sk_mark, val); sk_dst_reset(sk); } } void sock_set_mark(struct sock *sk, u32 val) { lock_sock(sk); __sock_set_mark(sk, val); release_sock(sk); } EXPORT_SYMBOL(sock_set_mark); static void sock_release_reserved_memory(struct sock *sk, int bytes) { /* Round down bytes to multiple of pages */ bytes = round_down(bytes, PAGE_SIZE); WARN_ON(bytes > sk->sk_reserved_mem); WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes); sk_mem_reclaim(sk); } static int sock_reserve_memory(struct sock *sk, int bytes) { long allocated; bool charged; int pages; if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk)) return -EOPNOTSUPP; if (!bytes) return 0; pages = sk_mem_pages(bytes); /* pre-charge to memcg */ charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages, GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!charged) return -ENOMEM; /* pre-charge to forward_alloc */ sk_memory_allocated_add(sk, pages); allocated = sk_memory_allocated(sk); /* If the system goes into memory pressure with this * precharge, give up and return error. */ if (allocated > sk_prot_mem_limits(sk, 1)) { sk_memory_allocated_sub(sk, pages); mem_cgroup_uncharge_skmem(sk->sk_memcg, pages); return -ENOMEM; } sk_forward_alloc_add(sk, pages << PAGE_SHIFT); WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem + (pages << PAGE_SHIFT)); return 0; } void sockopt_lock_sock(struct sock *sk) { /* When current->bpf_ctx is set, the setsockopt is called from * a bpf prog. bpf has ensured the sk lock has been * acquired before calling setsockopt(). */ if (has_current_bpf_ctx()) return; lock_sock(sk); } EXPORT_SYMBOL(sockopt_lock_sock); void sockopt_release_sock(struct sock *sk) { if (has_current_bpf_ctx()) return; release_sock(sk); } EXPORT_SYMBOL(sockopt_release_sock); bool sockopt_ns_capable(struct user_namespace *ns, int cap) { return has_current_bpf_ctx() || ns_capable(ns, cap); } EXPORT_SYMBOL(sockopt_ns_capable); bool sockopt_capable(int cap) { return has_current_bpf_ctx() || capable(cap); } EXPORT_SYMBOL(sockopt_capable); /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. */ int sk_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct so_timestamping timestamping; struct socket *sock = sk->sk_socket; struct sock_txtime sk_txtime; int val; int valbool; struct linger ling; int ret = 0; /* * Options without arguments */ if (optname == SO_BINDTODEVICE) return sock_setbindtodevice(sk, optval, optlen); if (optlen < sizeof(int)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; valbool = val ? 1 : 0; /* handle options which do not require locking the socket. */ switch (optname) { case SO_PRIORITY: if ((val >= 0 && val <= 6) || sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { sock_set_priority(sk, val); return 0; } return -EPERM; case SO_PASSSEC: assign_bit(SOCK_PASSSEC, &sock->flags, valbool); return 0; case SO_PASSCRED: assign_bit(SOCK_PASSCRED, &sock->flags, valbool); return 0; case SO_PASSPIDFD: assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool); return 0; case SO_TYPE: case SO_PROTOCOL: case SO_DOMAIN: case SO_ERROR: return -ENOPROTOOPT; #ifdef CONFIG_NET_RX_BUSY_POLL case SO_BUSY_POLL: if (val < 0) return -EINVAL; WRITE_ONCE(sk->sk_ll_usec, val); return 0; case SO_PREFER_BUSY_POLL: if (valbool && !sockopt_capable(CAP_NET_ADMIN)) return -EPERM; WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); return 0; case SO_BUSY_POLL_BUDGET: if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) return -EPERM; if (val < 0 || val > U16_MAX) return -EINVAL; WRITE_ONCE(sk->sk_busy_poll_budget, val); return 0; #endif case SO_MAX_PACING_RATE: { unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val; unsigned long pacing_rate; if (sizeof(ulval) != sizeof(val) && optlen >= sizeof(ulval) && copy_from_sockptr(&ulval, optval, sizeof(ulval))) { return -EFAULT; } if (ulval != ~0UL) cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); /* Pairs with READ_ONCE() from sk_getsockopt() */ WRITE_ONCE(sk->sk_max_pacing_rate, ulval); pacing_rate = READ_ONCE(sk->sk_pacing_rate); if (ulval < pacing_rate) WRITE_ONCE(sk->sk_pacing_rate, ulval); return 0; } case SO_TXREHASH: if (val < -1 || val > 1) return -EINVAL; if ((u8)val == SOCK_TXREHASH_DEFAULT) val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); /* Paired with READ_ONCE() in tcp_rtx_synack() * and sk_getsockopt(). */ WRITE_ONCE(sk->sk_txrehash, (u8)val); return 0; } sockopt_lock_sock(sk); switch (optname) { case SO_DEBUG: if (val && !sockopt_capable(CAP_NET_ADMIN)) ret = -EACCES; else sock_valbool_flag(sk, SOCK_DBG, valbool); break; case SO_REUSEADDR: sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); break; case SO_REUSEPORT: sk->sk_reuseport = valbool; break; case SO_DONTROUTE: sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); sk_dst_reset(sk); break; case SO_BROADCAST: sock_valbool_flag(sk, SOCK_BROADCAST, valbool); break; case SO_SNDBUF: /* Don't error on this BSD doesn't and if you think * about it this is right. Otherwise apps have to * play 'guess the biggest size' games. RCVBUF/SNDBUF * are treated in BSD as hints */ val = min_t(u32, val, READ_ONCE(sysctl_wmem_max)); set_sndbuf: /* Ensure val * 2 fits into an int, to prevent max_t() * from treating it as a negative value. */ val = min_t(int, val, INT_MAX / 2); sk->sk_userlocks |= SOCK_SNDBUF_LOCK; WRITE_ONCE(sk->sk_sndbuf, max_t(int, val * 2, SOCK_MIN_SNDBUF)); /* Wake up sending tasks if we upped the value. */ sk->sk_write_space(sk); break; case SO_SNDBUFFORCE: if (!sockopt_capable(CAP_NET_ADMIN)) { ret = -EPERM; break; } /* No negative values (to prevent underflow, as val will be * multiplied by 2). */ if (val < 0) val = 0; goto set_sndbuf; case SO_RCVBUF: /* Don't error on this BSD doesn't and if you think * about it this is right. Otherwise apps have to * play 'guess the biggest size' games. RCVBUF/SNDBUF * are treated in BSD as hints */ __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max))); break; case SO_RCVBUFFORCE: if (!sockopt_capable(CAP_NET_ADMIN)) { ret = -EPERM; break; } /* No negative values (to prevent underflow, as val will be * multiplied by 2). */ __sock_set_rcvbuf(sk, max(val, 0)); break; case SO_KEEPALIVE: if (sk->sk_prot->keepalive) sk->sk_prot->keepalive(sk, valbool); sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); break; case SO_OOBINLINE: sock_valbool_flag(sk, SOCK_URGINLINE, valbool); break; case SO_NO_CHECK: sk->sk_no_check_tx = valbool; break; case SO_LINGER: if (optlen < sizeof(ling)) { ret = -EINVAL; /* 1003.1g */ break; } if (copy_from_sockptr(&ling, optval, sizeof(ling))) { ret = -EFAULT; break; } if (!ling.l_onoff) { sock_reset_flag(sk, SOCK_LINGER); } else { unsigned long t_sec = ling.l_linger; if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ) WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT); else WRITE_ONCE(sk->sk_lingertime, t_sec * HZ); sock_set_flag(sk, SOCK_LINGER); } break; case SO_BSDCOMPAT: break; case SO_TIMESTAMP_OLD: case SO_TIMESTAMP_NEW: case SO_TIMESTAMPNS_OLD: case SO_TIMESTAMPNS_NEW: sock_set_timestamp(sk, optname, valbool); break; case SO_TIMESTAMPING_NEW: case SO_TIMESTAMPING_OLD: if (optlen == sizeof(timestamping)) { if (copy_from_sockptr(&timestamping, optval, sizeof(timestamping))) { ret = -EFAULT; break; } } else { memset(&timestamping, 0, sizeof(timestamping)); timestamping.flags = val; } ret = sock_set_timestamping(sk, optname, timestamping); break; case SO_RCVLOWAT: { int (*set_rcvlowat)(struct sock *sk, int val) = NULL; if (val < 0) val = INT_MAX; if (sock) set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat; if (set_rcvlowat) ret = set_rcvlowat(sk, val); else WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); break; } case SO_RCVTIMEO_OLD: case SO_RCVTIMEO_NEW: ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen, optname == SO_RCVTIMEO_OLD); break; case SO_SNDTIMEO_OLD: case SO_SNDTIMEO_NEW: ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen, optname == SO_SNDTIMEO_OLD); break; case SO_ATTACH_FILTER: { struct sock_fprog fprog; ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); if (!ret) ret = sk_attach_filter(&fprog, sk); break; } case SO_ATTACH_BPF: ret = -EINVAL; if (optlen == sizeof(u32)) { u32 ufd; ret = -EFAULT; if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) break; ret = sk_attach_bpf(ufd, sk); } break; case SO_ATTACH_REUSEPORT_CBPF: { struct sock_fprog fprog; ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); if (!ret) ret = sk_reuseport_attach_filter(&fprog, sk); break; } case SO_ATTACH_REUSEPORT_EBPF: ret = -EINVAL; if (optlen == sizeof(u32)) { u32 ufd; ret = -EFAULT; if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) break; ret = sk_reuseport_attach_bpf(ufd, sk); } break; case SO_DETACH_REUSEPORT_BPF: ret = reuseport_detach_prog(sk); break; case SO_DETACH_FILTER: ret = sk_detach_filter(sk); break; case SO_LOCK_FILTER: if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool) ret = -EPERM; else sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); break; case SO_MARK: if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; break; } __sock_set_mark(sk, val); break; case SO_RCVMARK: sock_valbool_flag(sk, SOCK_RCVMARK, valbool); break; case SO_RXQ_OVFL: sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); break; case SO_WIFI_STATUS: sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); break; case SO_PEEK_OFF: { int (*set_peek_off)(struct sock *sk, int val); set_peek_off = READ_ONCE(sock->ops)->set_peek_off; if (set_peek_off) ret = set_peek_off(sk, val); else ret = -EOPNOTSUPP; break; } case SO_NOFCS: sock_valbool_flag(sk, SOCK_NOFCS, valbool); break; case SO_SELECT_ERR_QUEUE: sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); break; case SO_INCOMING_CPU: reuseport_update_incoming_cpu(sk, val); break; case SO_CNX_ADVICE: if (val == 1) dst_negative_advice(sk); break; case SO_ZEROCOPY: if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { if (!(sk_is_tcp(sk) || (sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP))) ret = -EOPNOTSUPP; } else if (sk->sk_family != PF_RDS) { ret = -EOPNOTSUPP; } if (!ret) { if (val < 0 || val > 1) ret = -EINVAL; else sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); } break; case SO_TXTIME: if (optlen != sizeof(struct sock_txtime)) { ret = -EINVAL; break; } else if (copy_from_sockptr(&sk_txtime, optval, sizeof(struct sock_txtime))) { ret = -EFAULT; break; } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { ret = -EINVAL; break; } /* CLOCK_MONOTONIC is only used by sch_fq, and this packet * scheduler has enough safe guards. */ if (sk_txtime.clockid != CLOCK_MONOTONIC && !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; break; } sock_valbool_flag(sk, SOCK_TXTIME, true); sk->sk_clockid = sk_txtime.clockid; sk->sk_txtime_deadline_mode = !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); sk->sk_txtime_report_errors = !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); break; case SO_BINDTOIFINDEX: ret = sock_bindtoindex_locked(sk, val); break; case SO_BUF_LOCK: if (val & ~SOCK_BUF_LOCK_MASK) { ret = -EINVAL; break; } sk->sk_userlocks = val | (sk->sk_userlocks & ~SOCK_BUF_LOCK_MASK); break; case SO_RESERVE_MEM: { int delta; if (val < 0) { ret = -EINVAL; break; } delta = val - sk->sk_reserved_mem; if (delta < 0) sock_release_reserved_memory(sk, -delta); else ret = sock_reserve_memory(sk, delta); break; } default: ret = -ENOPROTOOPT; break; } sockopt_release_sock(sk); return ret; } int sock_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { return sk_setsockopt(sock->sk, level, optname, optval, optlen); } EXPORT_SYMBOL(sock_setsockopt); static const struct cred *sk_get_peer_cred(struct sock *sk) { const struct cred *cred; spin_lock(&sk->sk_peer_lock); cred = get_cred(sk->sk_peer_cred); spin_unlock(&sk->sk_peer_lock); return cred; } static void cred_to_ucred(struct pid *pid, const struct cred *cred, struct ucred *ucred) { ucred->pid = pid_vnr(pid); ucred->uid = ucred->gid = -1; if (cred) { struct user_namespace *current_ns = current_user_ns(); ucred->uid = from_kuid_munged(current_ns, cred->euid); ucred->gid = from_kgid_munged(current_ns, cred->egid); } } static int groups_to_user(sockptr_t dst, const struct group_info *src) { struct user_namespace *user_ns = current_user_ns(); int i; for (i = 0; i < src->ngroups; i++) { gid_t gid = from_kgid_munged(user_ns, src->gid[i]); if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid))) return -EFAULT; } return 0; } int sk_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen) { struct socket *sock = sk->sk_socket; union { int val; u64 val64; unsigned long ulval; struct linger ling; struct old_timeval32 tm32; struct __kernel_old_timeval tm; struct __kernel_sock_timeval stm; struct sock_txtime txtime; struct so_timestamping timestamping; } v; int lv = sizeof(int); int len; if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len < 0) return -EINVAL; memset(&v, 0, sizeof(v)); switch (optname) { case SO_DEBUG: v.val = sock_flag(sk, SOCK_DBG); break; case SO_DONTROUTE: v.val = sock_flag(sk, SOCK_LOCALROUTE); break; case SO_BROADCAST: v.val = sock_flag(sk, SOCK_BROADCAST); break; case SO_SNDBUF: v.val = READ_ONCE(sk->sk_sndbuf); break; case SO_RCVBUF: v.val = READ_ONCE(sk->sk_rcvbuf); break; case SO_REUSEADDR: v.val = sk->sk_reuse; break; case SO_REUSEPORT: v.val = sk->sk_reuseport; break; case SO_KEEPALIVE: v.val = sock_flag(sk, SOCK_KEEPOPEN); break; case SO_TYPE: v.val = sk->sk_type; break; case SO_PROTOCOL: v.val = sk->sk_protocol; break; case SO_DOMAIN: v.val = sk->sk_family; break; case SO_ERROR: v.val = -sock_error(sk); if (v.val == 0) v.val = xchg(&sk->sk_err_soft, 0); break; case SO_OOBINLINE: v.val = sock_flag(sk, SOCK_URGINLINE); break; case SO_NO_CHECK: v.val = sk->sk_no_check_tx; break; case SO_PRIORITY: v.val = READ_ONCE(sk->sk_priority); break; case SO_LINGER: lv = sizeof(v.ling); v.ling.l_onoff = sock_flag(sk, SOCK_LINGER); v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ; break; case SO_BSDCOMPAT: break; case SO_TIMESTAMP_OLD: v.val = sock_flag(sk, SOCK_RCVTSTAMP) && !sock_flag(sk, SOCK_TSTAMP_NEW) && !sock_flag(sk, SOCK_RCVTSTAMPNS); break; case SO_TIMESTAMPNS_OLD: v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW); break; case SO_TIMESTAMP_NEW: v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW); break; case SO_TIMESTAMPNS_NEW: v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW); break; case SO_TIMESTAMPING_OLD: case SO_TIMESTAMPING_NEW: lv = sizeof(v.timestamping); /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only * returning the flags when they were set through the same option. * Don't change the beviour for the old case SO_TIMESTAMPING_OLD. */ if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) { v.timestamping.flags = READ_ONCE(sk->sk_tsflags); v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc); } break; case SO_RCVTIMEO_OLD: case SO_RCVTIMEO_NEW: lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v, SO_RCVTIMEO_OLD == optname); break; case SO_SNDTIMEO_OLD: case SO_SNDTIMEO_NEW: lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v, SO_SNDTIMEO_OLD == optname); break; case SO_RCVLOWAT: v.val = READ_ONCE(sk->sk_rcvlowat); break; case SO_SNDLOWAT: v.val = 1; break; case SO_PASSCRED: v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); break; case SO_PASSPIDFD: v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags); break; case SO_PEERCRED: { struct ucred peercred; if (len > sizeof(peercred)) len = sizeof(peercred); spin_lock(&sk->sk_peer_lock); cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); spin_unlock(&sk->sk_peer_lock); if (copy_to_sockptr(optval, &peercred, len)) return -EFAULT; goto lenout; } case SO_PEERPIDFD: { struct pid *peer_pid; struct file *pidfd_file = NULL; int pidfd; if (len > sizeof(pidfd)) len = sizeof(pidfd); spin_lock(&sk->sk_peer_lock); peer_pid = get_pid(sk->sk_peer_pid); spin_unlock(&sk->sk_peer_lock); if (!peer_pid) return -ENODATA; pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file); put_pid(peer_pid); if (pidfd < 0) return pidfd; if (copy_to_sockptr(optval, &pidfd, len) || copy_to_sockptr(optlen, &len, sizeof(int))) { put_unused_fd(pidfd); fput(pidfd_file); return -EFAULT; } fd_install(pidfd, pidfd_file); return 0; } case SO_PEERGROUPS: { const struct cred *cred; int ret, n; cred = sk_get_peer_cred(sk); if (!cred) return -ENODATA; n = cred->group_info->ngroups; if (len < n * sizeof(gid_t)) { len = n * sizeof(gid_t); put_cred(cred); return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE; } len = n * sizeof(gid_t); ret = groups_to_user(optval, cred->group_info); put_cred(cred); if (ret) return ret; goto lenout; } case SO_PEERNAME: { struct sockaddr_storage address; lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2); if (lv < 0) return -ENOTCONN; if (lv < len) return -EINVAL; if (copy_to_sockptr(optval, &address, len)) return -EFAULT; goto lenout; } /* Dubious BSD thing... Probably nobody even uses it, but * the UNIX standard wants it for whatever reason... -DaveM */ case SO_ACCEPTCONN: v.val = sk->sk_state == TCP_LISTEN; break; case SO_PASSSEC: v.val = !!test_bit(SOCK_PASSSEC, &sock->flags); break; case SO_PEERSEC: return security_socket_getpeersec_stream(sock, optval, optlen, len); case SO_MARK: v.val = READ_ONCE(sk->sk_mark); break; case SO_RCVMARK: v.val = sock_flag(sk, SOCK_RCVMARK); break; case SO_RXQ_OVFL: v.val = sock_flag(sk, SOCK_RXQ_OVFL); break; case SO_WIFI_STATUS: v.val = sock_flag(sk, SOCK_WIFI_STATUS); break; case SO_PEEK_OFF: if (!READ_ONCE(sock->ops)->set_peek_off) return -EOPNOTSUPP; v.val = READ_ONCE(sk->sk_peek_off); break; case SO_NOFCS: v.val = sock_flag(sk, SOCK_NOFCS); break; case SO_BINDTODEVICE: return sock_getbindtodevice(sk, optval, optlen, len); case SO_GET_FILTER: len = sk_get_filter(sk, optval, len); if (len < 0) return len; goto lenout; case SO_LOCK_FILTER: v.val = sock_flag(sk, SOCK_FILTER_LOCKED); break; case SO_BPF_EXTENSIONS: v.val = bpf_tell_extensions(); break; case SO_SELECT_ERR_QUEUE: v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); break; #ifdef CONFIG_NET_RX_BUSY_POLL case SO_BUSY_POLL: v.val = READ_ONCE(sk->sk_ll_usec); break; case SO_PREFER_BUSY_POLL: v.val = READ_ONCE(sk->sk_prefer_busy_poll); break; #endif case SO_MAX_PACING_RATE: /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */ if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) { lv = sizeof(v.ulval); v.ulval = READ_ONCE(sk->sk_max_pacing_rate); } else { /* 32bit version */ v.val = min_t(unsigned long, ~0U, READ_ONCE(sk->sk_max_pacing_rate)); } break; case SO_INCOMING_CPU: v.val = READ_ONCE(sk->sk_incoming_cpu); break; case SO_MEMINFO: { u32 meminfo[SK_MEMINFO_VARS]; sk_get_meminfo(sk, meminfo); len = min_t(unsigned int, len, sizeof(meminfo)); if (copy_to_sockptr(optval, &meminfo, len)) return -EFAULT; goto lenout; } #ifdef CONFIG_NET_RX_BUSY_POLL case SO_INCOMING_NAPI_ID: v.val = READ_ONCE(sk->sk_napi_id); /* aggregate non-NAPI IDs down to 0 */ if (v.val < MIN_NAPI_ID) v.val = 0; break; #endif case SO_COOKIE: lv = sizeof(u64); if (len < lv) return -EINVAL; v.val64 = sock_gen_cookie(sk); break; case SO_ZEROCOPY: v.val = sock_flag(sk, SOCK_ZEROCOPY); break; case SO_TXTIME: lv = sizeof(v.txtime); v.txtime.clockid = sk->sk_clockid; v.txtime.flags |= sk->sk_txtime_deadline_mode ? SOF_TXTIME_DEADLINE_MODE : 0; v.txtime.flags |= sk->sk_txtime_report_errors ? SOF_TXTIME_REPORT_ERRORS : 0; break; case SO_BINDTOIFINDEX: v.val = READ_ONCE(sk->sk_bound_dev_if); break; case SO_NETNS_COOKIE: lv = sizeof(u64); if (len != lv) return -EINVAL; v.val64 = sock_net(sk)->net_cookie; break; case SO_BUF_LOCK: v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK; break; case SO_RESERVE_MEM: v.val = READ_ONCE(sk->sk_reserved_mem); break; case SO_TXREHASH: /* Paired with WRITE_ONCE() in sk_setsockopt() */ v.val = READ_ONCE(sk->sk_txrehash); break; default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). */ return -ENOPROTOOPT; } if (len > lv) len = lv; if (copy_to_sockptr(optval, &v, len)) return -EFAULT; lenout: if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; return 0; } /* * Initialize an sk_lock. * * (We also register the sk_lock with the lock validator.) */ static inline void sock_lock_init(struct sock *sk) { if (sk->sk_kern_sock) sock_lock_init_class_and_name( sk, af_family_kern_slock_key_strings[sk->sk_family], af_family_kern_slock_keys + sk->sk_family, af_family_kern_key_strings[sk->sk_family], af_family_kern_keys + sk->sk_family); else sock_lock_init_class_and_name( sk, af_family_slock_key_strings[sk->sk_family], af_family_slock_keys + sk->sk_family, af_family_key_strings[sk->sk_family], af_family_keys + sk->sk_family); } /* * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, * even temporarly, because of RCU lookups. sk_node should also be left as is. * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end */ static void sock_copy(struct sock *nsk, const struct sock *osk) { const struct proto *prot = READ_ONCE(osk->sk_prot); #ifdef CONFIG_SECURITY_NETWORK void *sptr = nsk->sk_security; #endif /* If we move sk_tx_queue_mapping out of the private section, * we must check if sk_tx_queue_clear() is called after * sock_copy() in sk_clone_lock(). */ BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) < offsetof(struct sock, sk_dontcopy_begin) || offsetof(struct sock, sk_tx_queue_mapping) >= offsetof(struct sock, sk_dontcopy_end)); memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, prot->obj_size - offsetof(struct sock, sk_dontcopy_end)); #ifdef CONFIG_SECURITY_NETWORK nsk->sk_security = sptr; security_sk_clone(osk, nsk); #endif } static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, int family) { struct sock *sk; struct kmem_cache *slab; slab = prot->slab; if (slab != NULL) { sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); if (!sk) return sk; if (want_init_on_alloc(priority)) sk_prot_clear_nulls(sk, prot->obj_size); } else sk = kmalloc(prot->obj_size, priority); if (sk != NULL) { if (security_sk_alloc(sk, family, priority)) goto out_free; if (!try_module_get(prot->owner)) goto out_free_sec; } return sk; out_free_sec: security_sk_free(sk); out_free: if (slab != NULL) kmem_cache_free(slab, sk); else kfree(sk); return NULL; } static void sk_prot_free(struct proto *prot, struct sock *sk) { struct kmem_cache *slab; struct module *owner; owner = prot->owner; slab = prot->slab; cgroup_sk_free(&sk->sk_cgrp_data); mem_cgroup_sk_free(sk); security_sk_free(sk); if (slab != NULL) kmem_cache_free(slab, sk); else kfree(sk); module_put(owner); } /** * sk_alloc - All socket objects are allocated here * @net: the applicable net namespace * @family: protocol family * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) * @prot: struct proto associated with this new sock instance * @kern: is this to be a kernel socket? */ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern) { struct sock *sk; sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family); if (sk) { sk->sk_family = family; /* * See comment in struct sock definition to understand * why we need sk_prot_creator -acme */ sk->sk_prot = sk->sk_prot_creator = prot; sk->sk_kern_sock = kern; sock_lock_init(sk); sk->sk_net_refcnt = kern ? 0 : 1; if (likely(sk->sk_net_refcnt)) { get_net_track(net, &sk->ns_tracker, priority); sock_inuse_add(net, 1); } else { __netns_tracker_alloc(net, &sk->ns_tracker, false, priority); } sock_net_set(sk, net); refcount_set(&sk->sk_wmem_alloc, 1); mem_cgroup_sk_alloc(sk); cgroup_sk_alloc(&sk->sk_cgrp_data); sock_update_classid(&sk->sk_cgrp_data); sock_update_netprioidx(&sk->sk_cgrp_data); sk_tx_queue_clear(sk); } return sk; } EXPORT_SYMBOL(sk_alloc); /* Sockets having SOCK_RCU_FREE will call this function after one RCU * grace period. This is the case for UDP sockets and TCP listeners. */ static void __sk_destruct(struct rcu_head *head) { struct sock *sk = container_of(head, struct sock, sk_rcu); struct sk_filter *filter; if (sk->sk_destruct) sk->sk_destruct(sk); filter = rcu_dereference_check(sk->sk_filter, refcount_read(&sk->sk_wmem_alloc) == 0); if (filter) { sk_filter_uncharge(sk, filter); RCU_INIT_POINTER(sk->sk_filter, NULL); } sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); #ifdef CONFIG_BPF_SYSCALL bpf_sk_storage_free(sk); #endif if (atomic_read(&sk->sk_omem_alloc)) pr_debug("%s: optmem leakage (%d bytes) detected\n", __func__, atomic_read(&sk->sk_omem_alloc)); if (sk->sk_frag.page) { put_page(sk->sk_frag.page); sk->sk_frag.page = NULL; } /* We do not need to acquire sk->sk_peer_lock, we are the last user. */ put_cred(sk->sk_peer_cred); put_pid(sk->sk_peer_pid); if (likely(sk->sk_net_refcnt)) put_net_track(sock_net(sk), &sk->ns_tracker); else __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false); sk_prot_free(sk->sk_prot_creator, sk); } void sk_destruct(struct sock *sk) { bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); if (rcu_access_pointer(sk->sk_reuseport_cb)) { reuseport_detach_sock(sk); use_call_rcu = true; } if (use_call_rcu) call_rcu(&sk->sk_rcu, __sk_destruct); else __sk_destruct(&sk->sk_rcu); } static void __sk_free(struct sock *sk) { if (likely(sk->sk_net_refcnt)) sock_inuse_add(sock_net(sk), -1); if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk))) sock_diag_broadcast_destroy(sk); else sk_destruct(sk); } void sk_free(struct sock *sk) { /* * We subtract one from sk_wmem_alloc and can know if * some packets are still in some tx queue. * If not null, sock_wfree() will call __sk_free(sk) later */ if (refcount_dec_and_test(&sk->sk_wmem_alloc)) __sk_free(sk); } EXPORT_SYMBOL(sk_free); static void sk_init_common(struct sock *sk) { skb_queue_head_init(&sk->sk_receive_queue); skb_queue_head_init(&sk->sk_write_queue); skb_queue_head_init(&sk->sk_error_queue); rwlock_init(&sk->sk_callback_lock); lockdep_set_class_and_name(&sk->sk_receive_queue.lock, af_rlock_keys + sk->sk_family, af_family_rlock_key_strings[sk->sk_family]); lockdep_set_class_and_name(&sk->sk_write_queue.lock, af_wlock_keys + sk->sk_family, af_family_wlock_key_strings[sk->sk_family]); lockdep_set_class_and_name(&sk->sk_error_queue.lock, af_elock_keys + sk->sk_family, af_family_elock_key_strings[sk->sk_family]); lockdep_set_class_and_name(&sk->sk_callback_lock, af_callback_keys + sk->sk_family, af_family_clock_key_strings[sk->sk_family]); } /** * sk_clone_lock - clone a socket, and lock its clone * @sk: the socket to clone * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) * * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) */ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) { struct proto *prot = READ_ONCE(sk->sk_prot); struct sk_filter *filter; bool is_charged = true; struct sock *newsk; newsk = sk_prot_alloc(prot, priority, sk->sk_family); if (!newsk) goto out; sock_copy(newsk, sk); newsk->sk_prot_creator = prot; /* SANITY */ if (likely(newsk->sk_net_refcnt)) { get_net_track(sock_net(newsk), &newsk->ns_tracker, priority); sock_inuse_add(sock_net(newsk), 1); } else { /* Kernel sockets are not elevating the struct net refcount. * Instead, use a tracker to more easily detect if a layer * is not properly dismantling its kernel sockets at netns * destroy time. */ __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker, false, priority); } sk_node_init(&newsk->sk_node); sock_lock_init(newsk); bh_lock_sock(newsk); newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; newsk->sk_backlog.len = 0; atomic_set(&newsk->sk_rmem_alloc, 0); /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */ refcount_set(&newsk->sk_wmem_alloc, 1); atomic_set(&newsk->sk_omem_alloc, 0); sk_init_common(newsk); newsk->sk_dst_cache = NULL; newsk->sk_dst_pending_confirm = 0; newsk->sk_wmem_queued = 0; newsk->sk_forward_alloc = 0; newsk->sk_reserved_mem = 0; atomic_set(&newsk->sk_drops, 0); newsk->sk_send_head = NULL; newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; atomic_set(&newsk->sk_zckey, 0); sock_reset_flag(newsk, SOCK_DONE); /* sk->sk_memcg will be populated at accept() time */ newsk->sk_memcg = NULL; cgroup_sk_clone(&newsk->sk_cgrp_data); rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); if (filter != NULL) /* though it's an empty new sock, the charging may fail * if sysctl_optmem_max was changed between creation of * original socket and cloning */ is_charged = sk_filter_charge(newsk, filter); RCU_INIT_POINTER(newsk->sk_filter, filter); rcu_read_unlock(); if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { /* We need to make sure that we don't uncharge the new * socket if we couldn't charge it in the first place * as otherwise we uncharge the parent's filter. */ if (!is_charged) RCU_INIT_POINTER(newsk->sk_filter, NULL); sk_free_unlock_clone(newsk); newsk = NULL; goto out; } RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); if (bpf_sk_storage_clone(sk, newsk)) { sk_free_unlock_clone(newsk); newsk = NULL; goto out; } /* Clear sk_user_data if parent had the pointer tagged * as not suitable for copying when cloning. */ if (sk_user_data_is_nocopy(newsk)) newsk->sk_user_data = NULL; newsk->sk_err = 0; newsk->sk_err_soft = 0; newsk->sk_priority = 0; newsk->sk_incoming_cpu = raw_smp_processor_id(); /* Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.rst for details) */ smp_wmb(); refcount_set(&newsk->sk_refcnt, 2); sk_set_socket(newsk, NULL); sk_tx_queue_clear(newsk); RCU_INIT_POINTER(newsk->sk_wq, NULL); if (newsk->sk_prot->sockets_allocated) sk_sockets_allocated_inc(newsk); if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP) net_enable_timestamp(); out: return newsk; } EXPORT_SYMBOL_GPL(sk_clone_lock); void sk_free_unlock_clone(struct sock *sk) { /* It is still raw copy of parent, so invalidate * destructor and make plain sk_free() */ sk->sk_destruct = NULL; bh_unlock_sock(sk); sk_free(sk); } EXPORT_SYMBOL_GPL(sk_free_unlock_clone); static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) { bool is_ipv6 = false; u32 max_size; #if IS_ENABLED(CONFIG_IPV6) is_ipv6 = (sk->sk_family == AF_INET6 && !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)); #endif /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */ max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) : READ_ONCE(dst->dev->gso_ipv4_max_size); if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk)) max_size = GSO_LEGACY_MAX_SIZE; return max_size - (MAX_TCP_HEADER + 1); } void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { u32 max_segs = 1; sk->sk_route_caps = dst->dev->features; if (sk_is_tcp(sk)) sk->sk_route_caps |= NETIF_F_GSO; if (sk->sk_route_caps & NETIF_F_GSO) sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; if (unlikely(sk->sk_gso_disabled)) sk->sk_route_caps &= ~NETIF_F_GSO_MASK; if (sk_can_gso(sk)) { if (dst->header_len && !xfrm_dst_offload_ok(dst)) { sk->sk_route_caps &= ~NETIF_F_GSO_MASK; } else { sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst); /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1); } } sk->sk_gso_max_segs = max_segs; sk_dst_set(sk, dst); } EXPORT_SYMBOL_GPL(sk_setup_caps); /* * Simple resource managers for sockets. */ /* * Write buffer destructor automatically called from kfree_skb. */ void sock_wfree(struct sk_buff *skb) { struct sock *sk = skb->sk; unsigned int len = skb->truesize; bool free; if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { if (sock_flag(sk, SOCK_RCU_FREE) && sk->sk_write_space == sock_def_write_space) { rcu_read_lock(); free = refcount_sub_and_test(len, &sk->sk_wmem_alloc); sock_def_write_space_wfree(sk); rcu_read_unlock(); if (unlikely(free)) __sk_free(sk); return; } /* * Keep a reference on sk_wmem_alloc, this will be released * after sk_write_space() call */ WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc)); sk->sk_write_space(sk); len = 1; } /* * if sk_wmem_alloc reaches 0, we must finish what sk_free() * could not do because of in-flight packets */ if (refcount_sub_and_test(len, &sk->sk_wmem_alloc)) __sk_free(sk); } EXPORT_SYMBOL(sock_wfree); /* This variant of sock_wfree() is used by TCP, * since it sets SOCK_USE_WRITE_QUEUE. */ void __sock_wfree(struct sk_buff *skb) { struct sock *sk = skb->sk; if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)) __sk_free(sk); } void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) { skb_orphan(skb); skb->sk = sk; #ifdef CONFIG_INET if (unlikely(!sk_fullsock(sk))) { skb->destructor = sock_edemux; sock_hold(sk); return; } #endif skb->destructor = sock_wfree; skb_set_hash_from_sk(skb, sk); /* * We used to take a refcount on sk, but following operation * is enough to guarantee sk_free() wont free this sock until * all in-flight packets are completed */ refcount_add(skb->truesize, &sk->sk_wmem_alloc); } EXPORT_SYMBOL(skb_set_owner_w); static bool can_skb_orphan_partial(const struct sk_buff *skb) { #ifdef CONFIG_TLS_DEVICE /* Drivers depend on in-order delivery for crypto offload, * partial orphan breaks out-of-order-OK logic. */ if (skb->decrypted) return false; #endif return (skb->destructor == sock_wfree || (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree)); } /* This helper is used by netem, as it can hold packets in its * delay queue. We want to allow the owner socket to send more * packets, as if they were already TX completed by a typical driver. * But we also want to keep skb->sk set because some packet schedulers * rely on it (sch_fq for example). */ void skb_orphan_partial(struct sk_buff *skb) { if (skb_is_tcp_pure_ack(skb)) return; if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk)) return; skb_orphan(skb); } EXPORT_SYMBOL(skb_orphan_partial); /* * Read buffer destructor automatically called from kfree_skb. */ void sock_rfree(struct sk_buff *skb) { struct sock *sk = skb->sk; unsigned int len = skb->truesize; atomic_sub(len, &sk->sk_rmem_alloc); sk_mem_uncharge(sk, len); } EXPORT_SYMBOL(sock_rfree); /* * Buffer destructor for skbs that are not used directly in read or write * path, e.g. for error handler skbs. Automatically called from kfree_skb. */ void sock_efree(struct sk_buff *skb) { sock_put(skb->sk); } EXPORT_SYMBOL(sock_efree); /* Buffer destructor for prefetch/receive path where reference count may * not be held, e.g. for listen sockets. */ #ifdef CONFIG_INET void sock_pfree(struct sk_buff *skb) { struct sock *sk = skb->sk; if (!sk_is_refcounted(sk)) return; if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) { inet_reqsk(sk)->rsk_listener = NULL; reqsk_free(inet_reqsk(sk)); return; } sock_gen_put(sk); } EXPORT_SYMBOL(sock_pfree); #endif /* CONFIG_INET */ kuid_t sock_i_uid(struct sock *sk) { kuid_t uid; read_lock_bh(&sk->sk_callback_lock); uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID; read_unlock_bh(&sk->sk_callback_lock); return uid; } EXPORT_SYMBOL(sock_i_uid); unsigned long __sock_i_ino(struct sock *sk) { unsigned long ino; read_lock(&sk->sk_callback_lock); ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; read_unlock(&sk->sk_callback_lock); return ino; } EXPORT_SYMBOL(__sock_i_ino); unsigned long sock_i_ino(struct sock *sk) { unsigned long ino; local_bh_disable(); ino = __sock_i_ino(sk); local_bh_enable(); return ino; } EXPORT_SYMBOL(sock_i_ino); /* * Allocate a skb from the socket's send buffer. */ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority) { if (force || refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) { struct sk_buff *skb = alloc_skb(size, priority); if (skb) { skb_set_owner_w(skb, sk); return skb; } } return NULL; } EXPORT_SYMBOL(sock_wmalloc); static void sock_ofree(struct sk_buff *skb) { struct sock *sk = skb->sk; atomic_sub(skb->truesize, &sk->sk_omem_alloc); } struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, gfp_t priority) { struct sk_buff *skb; /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max)) return NULL; skb = alloc_skb(size, priority); if (!skb) return NULL; atomic_add(skb->truesize, &sk->sk_omem_alloc); skb->sk = sk; skb->destructor = sock_ofree; return skb; } /* * Allocate a memory block from the socket's option memory buffer. */ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) { int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); if ((unsigned int)size <= optmem_max && atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { void *mem; /* First do the add, to avoid the race if kmalloc * might sleep. */ atomic_add(size, &sk->sk_omem_alloc); mem = kmalloc(size, priority); if (mem) return mem; atomic_sub(size, &sk->sk_omem_alloc); } return NULL; } EXPORT_SYMBOL(sock_kmalloc); /* Free an option memory block. Note, we actually want the inline * here as this allows gcc to detect the nullify and fold away the * condition entirely. */ static inline void __sock_kfree_s(struct sock *sk, void *mem, int size, const bool nullify) { if (WARN_ON_ONCE(!mem)) return; if (nullify) kfree_sensitive(mem); else kfree(mem); atomic_sub(size, &sk->sk_omem_alloc); } void sock_kfree_s(struct sock *sk, void *mem, int size) { __sock_kfree_s(sk, mem, size, false); } EXPORT_SYMBOL(sock_kfree_s); void sock_kzfree_s(struct sock *sk, void *mem, int size) { __sock_kfree_s(sk, mem, size, true); } EXPORT_SYMBOL(sock_kzfree_s); /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. I think, these locks should be removed for datagram sockets. */ static long sock_wait_for_wmem(struct sock *sk, long timeo) { DEFINE_WAIT(wait); sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); for (;;) { if (!timeo) break; if (signal_pending(current)) break; set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) break; if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) break; if (READ_ONCE(sk->sk_err)) break; timeo = schedule_timeout(timeo); } finish_wait(sk_sleep(sk), &wait); return timeo; } /* * Generic send/receive buffer handlers */ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, unsigned long data_len, int noblock, int *errcode, int max_page_order) { struct sk_buff *skb; long timeo; int err; timeo = sock_sndtimeo(sk, noblock); for (;;) { err = sock_error(sk); if (err != 0) goto failure; err = -EPIPE; if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) goto failure; if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf)) break; sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); err = -EAGAIN; if (!timeo) goto failure; if (signal_pending(current)) goto interrupted; timeo = sock_wait_for_wmem(sk, timeo); } skb = alloc_skb_with_frags(header_len, data_len, max_page_order, errcode, sk->sk_allocation); if (skb) skb_set_owner_w(skb, sk); return skb; interrupted: err = sock_intr_errno(timeo); failure: *errcode = err; return NULL; } EXPORT_SYMBOL(sock_alloc_send_pskb); int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, struct sockcm_cookie *sockc) { u32 tsflags; switch (cmsg->cmsg_type) { case SO_MARK: if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) return -EINVAL; sockc->mark = *(u32 *)CMSG_DATA(cmsg); break; case SO_TIMESTAMPING_OLD: case SO_TIMESTAMPING_NEW: if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) return -EINVAL; tsflags = *(u32 *)CMSG_DATA(cmsg); if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK) return -EINVAL; sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; sockc->tsflags |= tsflags; break; case SCM_TXTIME: if (!sock_flag(sk, SOCK_TXTIME)) return -EINVAL; if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) return -EINVAL; sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); break; /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */ case SCM_RIGHTS: case SCM_CREDENTIALS: break; default: return -EINVAL; } return 0; } EXPORT_SYMBOL(__sock_cmsg_send); int sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct sockcm_cookie *sockc) { struct cmsghdr *cmsg; int ret; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; if (cmsg->cmsg_level != SOL_SOCKET) continue; ret = __sock_cmsg_send(sk, cmsg, sockc); if (ret) return ret; } return 0; } EXPORT_SYMBOL(sock_cmsg_send); static void sk_enter_memory_pressure(struct sock *sk) { if (!sk->sk_prot->enter_memory_pressure) return; sk->sk_prot->enter_memory_pressure(sk); } static void sk_leave_memory_pressure(struct sock *sk) { if (sk->sk_prot->leave_memory_pressure) { INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure, tcp_leave_memory_pressure, sk); } else { unsigned long *memory_pressure = sk->sk_prot->memory_pressure; if (memory_pressure && READ_ONCE(*memory_pressure)) WRITE_ONCE(*memory_pressure, 0); } } DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); /** * skb_page_frag_refill - check that a page_frag contains enough room * @sz: minimum size of the fragment we want to get * @pfrag: pointer to page_frag * @gfp: priority for memory allocation * * Note: While this allocator tries to use high order pages, there is * no guarantee that allocations succeed. Therefore, @sz MUST be * less or equal than PAGE_SIZE. */ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) { if (pfrag->page) { if (page_ref_count(pfrag->page) == 1) { pfrag->offset = 0; return true; } if (pfrag->offset + sz <= pfrag->size) return true; put_page(pfrag->page); } pfrag->offset = 0; if (SKB_FRAG_PAGE_ORDER && !static_branch_unlikely(&net_high_order_alloc_disable_key)) { /* Avoid direct reclaim but allow kswapd to wake */ pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY, SKB_FRAG_PAGE_ORDER); if (likely(pfrag->page)) { pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; return true; } } pfrag->page = alloc_page(gfp); if (likely(pfrag->page)) { pfrag->size = PAGE_SIZE; return true; } return false; } EXPORT_SYMBOL(skb_page_frag_refill); bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) { if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) return true; sk_enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); return false; } EXPORT_SYMBOL(sk_page_frag_refill); void __lock_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) { DEFINE_WAIT(wait); for (;;) { prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, TASK_UNINTERRUPTIBLE); spin_unlock_bh(&sk->sk_lock.slock); schedule(); spin_lock_bh(&sk->sk_lock.slock); if (!sock_owned_by_user(sk)) break; } finish_wait(&sk->sk_lock.wq, &wait); } void __release_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) { struct sk_buff *skb, *next; while ((skb = sk->sk_backlog.head) != NULL) { sk->sk_backlog.head = sk->sk_backlog.tail = NULL; spin_unlock_bh(&sk->sk_lock.slock); do { next = skb->next; prefetch(next); DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb)); skb_mark_not_on_list(skb); sk_backlog_rcv(sk, skb); cond_resched(); skb = next; } while (skb != NULL); spin_lock_bh(&sk->sk_lock.slock); } /* * Doing the zeroing here guarantee we can not loop forever * while a wild producer attempts to flood us. */ sk->sk_backlog.len = 0; } void __sk_flush_backlog(struct sock *sk) { spin_lock_bh(&sk->sk_lock.slock); __release_sock(sk); if (sk->sk_prot->release_cb) INDIRECT_CALL_INET_1(sk->sk_prot->release_cb, tcp_release_cb, sk); spin_unlock_bh(&sk->sk_lock.slock); } EXPORT_SYMBOL_GPL(__sk_flush_backlog); /** * sk_wait_data - wait for data to arrive at sk_receive_queue * @sk: sock to wait on * @timeo: for how long * @skb: last skb seen on sk_receive_queue * * Now socket state including sk->sk_err is changed only under lock, * hence we may omit checks after joining wait queue. * We check receive queue before schedule() only as optimization; * it is very likely that release_sock() added new data. */ int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) { DEFINE_WAIT_FUNC(wait, woken_wake_function); int rc; add_wait_queue(sk_sleep(sk), &wait); sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); remove_wait_queue(sk_sleep(sk), &wait); return rc; } EXPORT_SYMBOL(sk_wait_data); /** * __sk_mem_raise_allocated - increase memory_allocated * @sk: socket * @size: memory size to allocate * @amt: pages to allocate * @kind: allocation type * * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc. * * Unlike the globally shared limits among the sockets under same protocol, * consuming the budget of a memcg won't have direct effect on other ones. * So be optimistic about memcg's tolerance, and leave the callers to decide * whether or not to raise allocated through sk_under_memory_pressure() or * its variants. */ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) { struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL; struct proto *prot = sk->sk_prot; bool charged = false; long allocated; sk_memory_allocated_add(sk, amt); allocated = sk_memory_allocated(sk); if (memcg) { if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge())) goto suppress_allocation; charged = true; } /* Under limit. */ if (allocated <= sk_prot_mem_limits(sk, 0)) { sk_leave_memory_pressure(sk); return 1; } /* Under pressure. */ if (allocated > sk_prot_mem_limits(sk, 1)) sk_enter_memory_pressure(sk); /* Over hard limit. */ if (allocated > sk_prot_mem_limits(sk, 2)) goto suppress_allocation; /* Guarantee minimum buffer size under pressure (either global * or memcg) to make sure features described in RFC 7323 (TCP * Extensions for High Performance) work properly. * * This rule does NOT stand when exceeds global or memcg's hard * limit, or else a DoS attack can be taken place by spawning * lots of sockets whose usage are under minimum buffer size. */ if (kind == SK_MEM_RECV) { if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot)) return 1; } else { /* SK_MEM_SEND */ int wmem0 = sk_get_wmem0(sk, prot); if (sk->sk_type == SOCK_STREAM) { if (sk->sk_wmem_queued < wmem0) return 1; } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) { return 1; } } if (sk_has_memory_pressure(sk)) { u64 alloc; /* The following 'average' heuristic is within the * scope of global accounting, so it only makes * sense for global memory pressure. */ if (!sk_under_global_memory_pressure(sk)) return 1; /* Try to be fair among all the sockets under global * pressure by allowing the ones that below average * usage to raise. */ alloc = sk_sockets_allocated_read_positive(sk); if (sk_prot_mem_limits(sk, 2) > alloc * sk_mem_pages(sk->sk_wmem_queued + atomic_read(&sk->sk_rmem_alloc) + sk->sk_forward_alloc)) return 1; } suppress_allocation: if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) { sk_stream_moderate_sndbuf(sk); /* Fail only if socket is _under_ its sndbuf. * In this case we cannot block, so that we have to fail. */ if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) { /* Force charge with __GFP_NOFAIL */ if (memcg && !charged) { mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge() | __GFP_NOFAIL); } return 1; } } if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged)) trace_sock_exceed_buf_limit(sk, prot, allocated, kind); sk_memory_allocated_sub(sk, amt); if (charged) mem_cgroup_uncharge_skmem(memcg, amt); return 0; } /** * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated * @sk: socket * @size: memory size to allocate * @kind: allocation type * * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means * rmem allocation. This function assumes that protocols which have * memory_pressure use sk_wmem_queued as write buffer accounting. */ int __sk_mem_schedule(struct sock *sk, int size, int kind) { int ret, amt = sk_mem_pages(size); sk_forward_alloc_add(sk, amt << PAGE_SHIFT); ret = __sk_mem_raise_allocated(sk, size, amt, kind); if (!ret) sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT)); return ret; } EXPORT_SYMBOL(__sk_mem_schedule); /** * __sk_mem_reduce_allocated - reclaim memory_allocated * @sk: socket * @amount: number of quanta * * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc */ void __sk_mem_reduce_allocated(struct sock *sk, int amount) { sk_memory_allocated_sub(sk, amount); if (mem_cgroup_sockets_enabled && sk->sk_memcg) mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); if (sk_under_global_memory_pressure(sk) && (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) sk_leave_memory_pressure(sk); } /** * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated * @sk: socket * @amount: number of bytes (rounded down to a PAGE_SIZE multiple) */ void __sk_mem_reclaim(struct sock *sk, int amount) { amount >>= PAGE_SHIFT; sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT)); __sk_mem_reduce_allocated(sk, amount); } EXPORT_SYMBOL(__sk_mem_reclaim); int sk_set_peek_off(struct sock *sk, int val) { WRITE_ONCE(sk->sk_peek_off, val); return 0; } EXPORT_SYMBOL_GPL(sk_set_peek_off); /* * Set of default routines for initialising struct proto_ops when * the protocol does not support a particular function. In certain * cases where it makes no sense for a protocol to have a "do nothing" * function, some default processing is provided. */ int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_bind); int sock_no_connect(struct socket *sock, struct sockaddr *saddr, int len, int flags) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_connect); int sock_no_socketpair(struct socket *sock1, struct socket *sock2) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_socketpair); int sock_no_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_accept); int sock_no_getname(struct socket *sock, struct sockaddr *saddr, int peer) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_getname); int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_ioctl); int sock_no_listen(struct socket *sock, int backlog) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_listen); int sock_no_shutdown(struct socket *sock, int how) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_shutdown); int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_sendmsg); int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_sendmsg_locked); int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_recvmsg); int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) { /* Mirror missing mmap method error code */ return -ENODEV; } EXPORT_SYMBOL(sock_no_mmap); /* * When a file is received (via SCM_RIGHTS, etc), we must bump the * various sock-based usage counts. */ void __receive_sock(struct file *file) { struct socket *sock; sock = sock_from_file(file); if (sock) { sock_update_netprioidx(&sock->sk->sk_cgrp_data); sock_update_classid(&sock->sk->sk_cgrp_data); } } /* * Default Socket Callbacks */ static void sock_def_wakeup(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_all(&wq->wait); rcu_read_unlock(); } static void sock_def_error_report(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_poll(&wq->wait, EPOLLERR); sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); rcu_read_unlock(); } void sock_def_readable(struct sock *sk) { struct socket_wq *wq; trace_sk_data_ready(sk); rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | EPOLLRDNORM | EPOLLRDBAND); sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); rcu_read_unlock(); } static void sock_def_write_space(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); /* Do not wake up a writer until he can make "significant" * progress. --DaveM */ if (sock_writeable(sk)) { wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); /* Should agree with poll, otherwise some programs break */ sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); } /* An optimised version of sock_def_write_space(), should only be called * for SOCK_RCU_FREE sockets under RCU read section and after putting * ->sk_wmem_alloc. */ static void sock_def_write_space_wfree(struct sock *sk) { /* Do not wake up a writer until he can make "significant" * progress. --DaveM */ if (sock_writeable(sk)) { struct socket_wq *wq = rcu_dereference(sk->sk_wq); /* rely on refcount_sub from sock_wfree() */ smp_mb__after_atomic(); if (wq && waitqueue_active(&wq->wait)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); /* Should agree with poll, otherwise some programs break */ sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } } static void sock_def_destruct(struct sock *sk) { } void sk_send_sigurg(struct sock *sk) { if (sk->sk_socket && sk->sk_socket->file) if (send_sigurg(&sk->sk_socket->file->f_owner)) sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); } EXPORT_SYMBOL(sk_send_sigurg); void sk_reset_timer(struct sock *sk, struct timer_list* timer, unsigned long expires) { if (!mod_timer(timer, expires)) sock_hold(sk); } EXPORT_SYMBOL(sk_reset_timer); void sk_stop_timer(struct sock *sk, struct timer_list* timer) { if (del_timer(timer)) __sock_put(sk); } EXPORT_SYMBOL(sk_stop_timer); void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer) { if (del_timer_sync(timer)) __sock_put(sk); } EXPORT_SYMBOL(sk_stop_timer_sync); void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid) { sk_init_common(sk); sk->sk_send_head = NULL; timer_setup(&sk->sk_timer, NULL, 0); sk->sk_allocation = GFP_KERNEL; sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default); sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default); sk->sk_state = TCP_CLOSE; sk->sk_use_task_frag = true; sk_set_socket(sk, sock); sock_set_flag(sk, SOCK_ZAPPED); if (sock) { sk->sk_type = sock->type; RCU_INIT_POINTER(sk->sk_wq, &sock->wq); sock->sk = sk; } else { RCU_INIT_POINTER(sk->sk_wq, NULL); } sk->sk_uid = uid; rwlock_init(&sk->sk_callback_lock); if (sk->sk_kern_sock) lockdep_set_class_and_name( &sk->sk_callback_lock, af_kern_callback_keys + sk->sk_family, af_family_kern_clock_key_strings[sk->sk_family]); else lockdep_set_class_and_name( &sk->sk_callback_lock, af_callback_keys + sk->sk_family, af_family_clock_key_strings[sk->sk_family]); sk->sk_state_change = sock_def_wakeup; sk->sk_data_ready = sock_def_readable; sk->sk_write_space = sock_def_write_space; sk->sk_error_report = sock_def_error_report; sk->sk_destruct = sock_def_destruct; sk->sk_frag.page = NULL; sk->sk_frag.offset = 0; sk->sk_peek_off = -1; sk->sk_peer_pid = NULL; sk->sk_peer_cred = NULL; spin_lock_init(&sk->sk_peer_lock); sk->sk_write_pending = 0; sk->sk_rcvlowat = 1; sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; sk->sk_stamp = SK_DEFAULT_STAMP; #if BITS_PER_LONG==32 seqlock_init(&sk->sk_stamp_seq); #endif atomic_set(&sk->sk_zckey, 0); #ifdef CONFIG_NET_RX_BUSY_POLL sk->sk_napi_id = 0; sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read); #endif sk->sk_max_pacing_rate = ~0UL; sk->sk_pacing_rate = ~0UL; WRITE_ONCE(sk->sk_pacing_shift, 10); sk->sk_incoming_cpu = -1; sk_rx_queue_clear(sk); /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.rst for details) */ smp_wmb(); refcount_set(&sk->sk_refcnt, 1); atomic_set(&sk->sk_drops, 0); } EXPORT_SYMBOL(sock_init_data_uid); void sock_init_data(struct socket *sock, struct sock *sk) { kuid_t uid = sock ? SOCK_INODE(sock)->i_uid : make_kuid(sock_net(sk)->user_ns, 0); sock_init_data_uid(sock, sk, uid); } EXPORT_SYMBOL(sock_init_data); void lock_sock_nested(struct sock *sk, int subclass) { /* The sk_lock has mutex_lock() semantics here. */ mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); might_sleep(); spin_lock_bh(&sk->sk_lock.slock); if (sock_owned_by_user_nocheck(sk)) __lock_sock(sk); sk->sk_lock.owned = 1; spin_unlock_bh(&sk->sk_lock.slock); } EXPORT_SYMBOL(lock_sock_nested); void release_sock(struct sock *sk) { spin_lock_bh(&sk->sk_lock.slock); if (sk->sk_backlog.tail) __release_sock(sk); if (sk->sk_prot->release_cb) INDIRECT_CALL_INET_1(sk->sk_prot->release_cb, tcp_release_cb, sk); sock_release_ownership(sk); if (waitqueue_active(&sk->sk_lock.wq)) wake_up(&sk->sk_lock.wq); spin_unlock_bh(&sk->sk_lock.slock); } EXPORT_SYMBOL(release_sock); bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) { might_sleep(); spin_lock_bh(&sk->sk_lock.slock); if (!sock_owned_by_user_nocheck(sk)) { /* * Fast path return with bottom halves disabled and * sock::sk_lock.slock held. * * The 'mutex' is not contended and holding * sock::sk_lock.slock prevents all other lockers to * proceed so the corresponding unlock_sock_fast() can * avoid the slow path of release_sock() completely and * just release slock. * * From a semantical POV this is equivalent to 'acquiring' * the 'mutex', hence the corresponding lockdep * mutex_release() has to happen in the fast path of * unlock_sock_fast(). */ return false; } __lock_sock(sk); sk->sk_lock.owned = 1; __acquire(&sk->sk_lock.slock); spin_unlock_bh(&sk->sk_lock.slock); return true; } EXPORT_SYMBOL(__lock_sock_fast); int sock_gettstamp(struct socket *sock, void __user *userstamp, bool timeval, bool time32) { struct sock *sk = sock->sk; struct timespec64 ts; sock_enable_timestamp(sk, SOCK_TIMESTAMP); ts = ktime_to_timespec64(sock_read_timestamp(sk)); if (ts.tv_sec == -1) return -ENOENT; if (ts.tv_sec == 0) { ktime_t kt = ktime_get_real(); sock_write_timestamp(sk, kt); ts = ktime_to_timespec64(kt); } if (timeval) ts.tv_nsec /= 1000; #ifdef CONFIG_COMPAT_32BIT_TIME if (time32) return put_old_timespec32(&ts, userstamp); #endif #ifdef CONFIG_SPARC64 /* beware of padding in sparc64 timeval */ if (timeval && !in_compat_syscall()) { struct __kernel_old_timeval __user tv = { .tv_sec = ts.tv_sec, .tv_usec = ts.tv_nsec, }; if (copy_to_user(userstamp, &tv, sizeof(tv))) return -EFAULT; return 0; } #endif return put_timespec64(&ts, userstamp); } EXPORT_SYMBOL(sock_gettstamp); void sock_enable_timestamp(struct sock *sk, enum sock_flags flag) { if (!sock_flag(sk, flag)) { unsigned long previous_flags = sk->sk_flags; sock_set_flag(sk, flag); /* * we just set one of the two flags which require net * time stamping, but time stamping might have been on * already because of the other one */ if (sock_needs_netstamp(sk) && !(previous_flags & SK_FLAGS_TIMESTAMP)) net_enable_timestamp(); } } int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level, int type) { struct sock_exterr_skb *serr; struct sk_buff *skb; int copied, err; err = -EAGAIN; skb = sock_dequeue_err_skb(sk); if (skb == NULL) goto out; copied = skb->len; if (copied > len) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free_skb; sock_recv_timestamp(msg, sk, skb); serr = SKB_EXT_ERR(skb); put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee); msg->msg_flags |= MSG_ERRQUEUE; err = copied; out_free_skb: kfree_skb(skb); out: return err; } EXPORT_SYMBOL(sock_recv_errqueue); /* * Get a socket option on an socket. * * FIX: POSIX 1003.1g is very ambiguous here. It states that * asynchronous errors should be reported by getsockopt. We assume * this means if you specify SO_ERROR (otherwise whats the point of it). */ int sock_common_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; /* IPV6_ADDRFORM can change sk->sk_prot under us. */ return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(sock_common_getsockopt); int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; int addr_len = 0; int err; err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len); if (err >= 0) msg->msg_namelen = addr_len; return err; } EXPORT_SYMBOL(sock_common_recvmsg); /* * Set socket options on an inet socket. */ int sock_common_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; /* IPV6_ADDRFORM can change sk->sk_prot under us. */ return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(sock_common_setsockopt); void sk_common_release(struct sock *sk) { if (sk->sk_prot->destroy) sk->sk_prot->destroy(sk); /* * Observation: when sk_common_release is called, processes have * no access to socket. But net still has. * Step one, detach it from networking: * * A. Remove from hash tables. */ sk->sk_prot->unhash(sk); /* * In this point socket cannot receive new packets, but it is possible * that some packets are in flight because some CPU runs receiver and * did hash table lookup before we unhashed socket. They will achieve * receive queue and will be purged by socket destructor. * * Also we still have packets pending on receive queue and probably, * our own packets waiting in device queues. sock_destroy will drain * receive queue, but transmitted packets will delay socket destruction * until the last reference will be released. */ sock_orphan(sk); xfrm_sk_free_policy(sk); sock_put(sk); } EXPORT_SYMBOL(sk_common_release); void sk_get_meminfo(const struct sock *sk, u32 *mem) { memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS); mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf); mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk); mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); } #ifdef CONFIG_PROC_FS static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); int sock_prot_inuse_get(struct net *net, struct proto *prot) { int cpu, idx = prot->inuse_idx; int res = 0; for_each_possible_cpu(cpu) res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx]; return res >= 0 ? res : 0; } EXPORT_SYMBOL_GPL(sock_prot_inuse_get); int sock_inuse_get(struct net *net) { int cpu, res = 0; for_each_possible_cpu(cpu) res += per_cpu_ptr(net->core.prot_inuse, cpu)->all; return res; } EXPORT_SYMBOL_GPL(sock_inuse_get); static int __net_init sock_inuse_init_net(struct net *net) { net->core.prot_inuse = alloc_percpu(struct prot_inuse); if (net->core.prot_inuse == NULL) return -ENOMEM; return 0; } static void __net_exit sock_inuse_exit_net(struct net *net) { free_percpu(net->core.prot_inuse); } static struct pernet_operations net_inuse_ops = { .init = sock_inuse_init_net, .exit = sock_inuse_exit_net, }; static __init int net_inuse_init(void) { if (register_pernet_subsys(&net_inuse_ops)) panic("Cannot initialize net inuse counters"); return 0; } core_initcall(net_inuse_init); static int assign_proto_idx(struct proto *prot) { prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { pr_err("PROTO_INUSE_NR exhausted\n"); return -ENOSPC; } set_bit(prot->inuse_idx, proto_inuse_idx); return 0; } static void release_proto_idx(struct proto *prot) { if (prot->inuse_idx != PROTO_INUSE_NR - 1) clear_bit(prot->inuse_idx, proto_inuse_idx); } #else static inline int assign_proto_idx(struct proto *prot) { return 0; } static inline void release_proto_idx(struct proto *prot) { } #endif static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot) { if (!twsk_prot) return; kfree(twsk_prot->twsk_slab_name); twsk_prot->twsk_slab_name = NULL; kmem_cache_destroy(twsk_prot->twsk_slab); twsk_prot->twsk_slab = NULL; } static int tw_prot_init(const struct proto *prot) { struct timewait_sock_ops *twsk_prot = prot->twsk_prot; if (!twsk_prot) return 0; twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name); if (!twsk_prot->twsk_slab_name) return -ENOMEM; twsk_prot->twsk_slab = kmem_cache_create(twsk_prot->twsk_slab_name, twsk_prot->twsk_obj_size, 0, SLAB_ACCOUNT | prot->slab_flags, NULL); if (!twsk_prot->twsk_slab) { pr_crit("%s: Can't create timewait sock SLAB cache!\n", prot->name); return -ENOMEM; } return 0; } static void req_prot_cleanup(struct request_sock_ops *rsk_prot) { if (!rsk_prot) return; kfree(rsk_prot->slab_name); rsk_prot->slab_name = NULL; kmem_cache_destroy(rsk_prot->slab); rsk_prot->slab = NULL; } static int req_prot_init(const struct proto *prot) { struct request_sock_ops *rsk_prot = prot->rsk_prot; if (!rsk_prot) return 0; rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name); if (!rsk_prot->slab_name) return -ENOMEM; rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, rsk_prot->obj_size, 0, SLAB_ACCOUNT | prot->slab_flags, NULL); if (!rsk_prot->slab) { pr_crit("%s: Can't create request sock SLAB cache!\n", prot->name); return -ENOMEM; } return 0; } int proto_register(struct proto *prot, int alloc_slab) { int ret = -ENOBUFS; if (prot->memory_allocated && !prot->sysctl_mem) { pr_err("%s: missing sysctl_mem\n", prot->name); return -EINVAL; } if (prot->memory_allocated && !prot->per_cpu_fw_alloc) { pr_err("%s: missing per_cpu_fw_alloc\n", prot->name); return -EINVAL; } if (alloc_slab) { prot->slab = kmem_cache_create_usercopy(prot->name, prot->obj_size, 0, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | prot->slab_flags, prot->useroffset, prot->usersize, NULL); if (prot->slab == NULL) { pr_crit("%s: Can't create sock SLAB cache!\n", prot->name); goto out; } if (req_prot_init(prot)) goto out_free_request_sock_slab; if (tw_prot_init(prot)) goto out_free_timewait_sock_slab; } mutex_lock(&proto_list_mutex); ret = assign_proto_idx(prot); if (ret) { mutex_unlock(&proto_list_mutex); goto out_free_timewait_sock_slab; } list_add(&prot->node, &proto_list); mutex_unlock(&proto_list_mutex); return ret; out_free_timewait_sock_slab: if (alloc_slab) tw_prot_cleanup(prot->twsk_prot); out_free_request_sock_slab: if (alloc_slab) { req_prot_cleanup(prot->rsk_prot); kmem_cache_destroy(prot->slab); prot->slab = NULL; } out: return ret; } EXPORT_SYMBOL(proto_register); void proto_unregister(struct proto *prot) { mutex_lock(&proto_list_mutex); release_proto_idx(prot); list_del(&prot->node); mutex_unlock(&proto_list_mutex); kmem_cache_destroy(prot->slab); prot->slab = NULL; req_prot_cleanup(prot->rsk_prot); tw_prot_cleanup(prot->twsk_prot); } EXPORT_SYMBOL(proto_unregister); int sock_load_diag_module(int family, int protocol) { if (!protocol) { if (!sock_is_registered(family)) return -ENOENT; return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, NETLINK_SOCK_DIAG, family); } #ifdef CONFIG_INET if (family == AF_INET && protocol != IPPROTO_RAW && protocol < MAX_INET_PROTOS && !rcu_access_pointer(inet_protos[protocol])) return -ENOENT; #endif return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, NETLINK_SOCK_DIAG, family, protocol); } EXPORT_SYMBOL(sock_load_diag_module); #ifdef CONFIG_PROC_FS static void *proto_seq_start(struct seq_file *seq, loff_t *pos) __acquires(proto_list_mutex) { mutex_lock(&proto_list_mutex); return seq_list_start_head(&proto_list, *pos); } static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) { return seq_list_next(v, &proto_list, pos); } static void proto_seq_stop(struct seq_file *seq, void *v) __releases(proto_list_mutex) { mutex_unlock(&proto_list_mutex); } static char proto_method_implemented(const void *method) { return method == NULL ? 'n' : 'y'; } static long sock_prot_memory_allocated(struct proto *proto) { return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L; } static const char *sock_prot_memory_pressure(struct proto *proto) { return proto->memory_pressure != NULL ? proto_memory_pressure(proto) ? "yes" : "no" : "NI"; } static void proto_seq_printf(struct seq_file *seq, struct proto *proto) { seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", proto->name, proto->obj_size, sock_prot_inuse_get(seq_file_net(seq), proto), sock_prot_memory_allocated(proto), sock_prot_memory_pressure(proto), proto->max_header, proto->slab == NULL ? "no" : "yes", module_name(proto->owner), proto_method_implemented(proto->close), proto_method_implemented(proto->connect), proto_method_implemented(proto->disconnect), proto_method_implemented(proto->accept), proto_method_implemented(proto->ioctl), proto_method_implemented(proto->init), proto_method_implemented(proto->destroy), proto_method_implemented(proto->shutdown), proto_method_implemented(proto->setsockopt), proto_method_implemented(proto->getsockopt), proto_method_implemented(proto->sendmsg), proto_method_implemented(proto->recvmsg), proto_method_implemented(proto->bind), proto_method_implemented(proto->backlog_rcv), proto_method_implemented(proto->hash), proto_method_implemented(proto->unhash), proto_method_implemented(proto->get_port), proto_method_implemented(proto->enter_memory_pressure)); } static int proto_seq_show(struct seq_file *seq, void *v) { if (v == &proto_list) seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", "protocol", "size", "sockets", "memory", "press", "maxhdr", "slab", "module", "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n"); else proto_seq_printf(seq, list_entry(v, struct proto, node)); return 0; } static const struct seq_operations proto_seq_ops = { .start = proto_seq_start, .next = proto_seq_next, .stop = proto_seq_stop, .show = proto_seq_show, }; static __net_init int proto_init_net(struct net *net) { if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops, sizeof(struct seq_net_private))) return -ENOMEM; return 0; } static __net_exit void proto_exit_net(struct net *net) { remove_proc_entry("protocols", net->proc_net); } static __net_initdata struct pernet_operations proto_net_ops = { .init = proto_init_net, .exit = proto_exit_net, }; static int __init proto_init(void) { return register_pernet_subsys(&proto_net_ops); } subsys_initcall(proto_init); #endif /* PROC_FS */ #ifdef CONFIG_NET_RX_BUSY_POLL bool sk_busy_loop_end(void *p, unsigned long start_time) { struct sock *sk = p; if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) return true; if (sk_is_udp(sk) && !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue)) return true; return sk_busy_loop_timeout(sk, start_time); } EXPORT_SYMBOL(sk_busy_loop_end); #endif /* CONFIG_NET_RX_BUSY_POLL */ int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len) { if (!sk->sk_prot->bind_add) return -EOPNOTSUPP; return sk->sk_prot->bind_add(sk, addr, addr_len); } EXPORT_SYMBOL(sock_bind_add); /* Copy 'size' bytes from userspace and return `size` back to userspace */ int sock_ioctl_inout(struct sock *sk, unsigned int cmd, void __user *arg, void *karg, size_t size) { int ret; if (copy_from_user(karg, arg, size)) return -EFAULT; ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg); if (ret) return ret; if (copy_to_user(arg, karg, size)) return -EFAULT; return 0; } EXPORT_SYMBOL(sock_ioctl_inout); /* This is the most common ioctl prep function, where the result (4 bytes) is * copied back to userspace if the ioctl() returns successfully. No input is * copied from userspace as input argument. */ static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg) { int ret, karg = 0; ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg); if (ret) return ret; return put_user(karg, (int __user *)arg); } /* A wrapper around sock ioctls, which copies the data from userspace * (depending on the protocol/ioctl), and copies back the result to userspace. * The main motivation for this function is to pass kernel memory to the * protocol ioctl callbacks, instead of userspace memory. */ int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) { int rc = 1; if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET) rc = ipmr_sk_ioctl(sk, cmd, arg); else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6) rc = ip6mr_sk_ioctl(sk, cmd, arg); else if (sk_is_phonet(sk)) rc = phonet_sk_ioctl(sk, cmd, arg); /* If ioctl was processed, returns its value */ if (rc <= 0) return rc; /* Otherwise call the default handler */ return sock_ioctl_out(sk, cmd, arg); } EXPORT_SYMBOL(sk_ioctl);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef BTRFS_INODE_ITEM_H #define BTRFS_INODE_ITEM_H #include <linux/types.h> #include <linux/crc32c.h> struct fscrypt_str; struct extent_buffer; struct btrfs_trans_handle; struct btrfs_root; struct btrfs_path; struct btrfs_key; struct btrfs_inode_extref; struct btrfs_inode; struct btrfs_truncate_control; /* * Return this if we need to call truncate_block for the last bit of the * truncate. */ #define BTRFS_NEED_TRUNCATE_BLOCK 1 struct btrfs_truncate_control { /* * IN: the inode we're operating on, this can be NULL if * ->clear_extent_range is false. */ struct btrfs_inode *inode; /* IN: the size we're truncating to. */ u64 new_size; /* OUT: the number of extents truncated. */ u64 extents_found; /* OUT: the last size we truncated this inode to. */ u64 last_size; /* OUT: the number of bytes to sub from this inode. */ u64 sub_bytes; /* IN: the ino we are truncating. */ u64 ino; /* * IN: minimum key type to remove. All key types with this type are * removed only if their offset >= new_size. */ u32 min_type; /* * IN: true if we don't want to do extent reference updates for any file * extents we drop. */ bool skip_ref_updates; /* * IN: true if we need to clear the file extent range for the inode as * we drop the file extent items. */ bool clear_extent_range; }; /* * btrfs_inode_item stores flags in a u64, btrfs_inode stores them in two * separate u32s. These two functions convert between the two representations. */ static inline u64 btrfs_inode_combine_flags(u32 flags, u32 ro_flags) { return (flags | ((u64)ro_flags << 32)); } static inline void btrfs_inode_split_flags(u64 inode_item_flags, u32 *flags, u32 *ro_flags) { *flags = (u32)inode_item_flags; *ro_flags = (u32)(inode_item_flags >> 32); } /* Figure the key offset of an extended inode ref. */ static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name, int len) { return (u64)crc32c(parent_objectid, name, len); } int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_truncate_control *control); int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, u64 index); int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, u64 *index); int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *location, int mod); struct btrfs_inode_extref *btrfs_lookup_inode_extref( struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, int ins_len, int cow); struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot, const struct fscrypt_str *name); struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( struct extent_buffer *leaf, int slot, u64 ref_objectid, const struct fscrypt_str *name); #endif
4 2 2 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 // SPDX-License-Identifier: GPL-2.0-or-later /* Linux driver for Philips webcam USB and Video4Linux interface part. (C) 1999-2004 Nemosoft Unv. (C) 2004-2006 Luc Saillard (luc@saillard.org) (C) 2011 Hans de Goede <hdegoede@redhat.com> NOTE: this version of pwc is an unofficial (modified) release of pwc & pcwx driver and thus may have bugs that are not present in the original version. Please send bug reports and support requests to <luc@saillard.org>. The decompression routines have been implemented by reverse-engineering the Nemosoft binary pwcx module. Caveat emptor. */ /* This code forms the interface between the USB layers and the Philips specific stuff. Some adanved stuff of the driver falls under an NDA, signed between me and Philips B.V., Eindhoven, the Netherlands, and is thus not distributed in source form. The binary pwcx.o module contains the code that falls under the NDA. In case you're wondering: 'pwc' stands for "Philips WebCam", but I really didn't want to type 'philips_web_cam' every time (I'm lazy as any Linux kernel hacker, but I don't like uncomprehensible abbreviations without explanation). Oh yes, convention: to disctinguish between all the various pointers to device-structures, I use these names for the pointer variables: udev: struct usb_device * vdev: struct video_device (member of pwc_dev) pdev: struct pwc_devive * */ /* Contributors: - Alvarado: adding whitebalance code - Alistar Moire: QuickCam 3000 Pro device/product ID - Tony Hoyle: Creative Labs Webcam 5 device/product ID - Mark Burazin: solving hang in VIDIOCSYNC when camera gets unplugged - Jk Fang: Sotec Afina Eye ID - Xavier Roche: QuickCam Pro 4000 ID - Jens Knudsen: QuickCam Zoom ID - J. Debert: QuickCam for Notebooks ID - Pham Thanh Nam: webcam snapshot button as an event input device */ #include <linux/errno.h> #include <linux/init.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/poll.h> #include <linux/slab.h> #ifdef CONFIG_USB_PWC_INPUT_EVDEV #include <linux/usb/input.h> #endif #include <linux/vmalloc.h> #include <asm/io.h> #include <linux/kernel.h> /* simple_strtol() */ #include "pwc.h" #include "pwc-kiara.h" #include "pwc-timon.h" #include "pwc-dec23.h" #include "pwc-dec1.h" #define CREATE_TRACE_POINTS #include <trace/events/pwc.h> /* Function prototypes and driver templates */ /* hotplug device table support */ static const struct usb_device_id pwc_device_table [] = { { USB_DEVICE(0x041E, 0x400C) }, /* Creative Webcam 5 */ { USB_DEVICE(0x041E, 0x4011) }, /* Creative Webcam Pro Ex */ { USB_DEVICE(0x046D, 0x08B0) }, /* Logitech QuickCam 3000 Pro */ { USB_DEVICE(0x046D, 0x08B1) }, /* Logitech QuickCam Notebook Pro */ { USB_DEVICE(0x046D, 0x08B2) }, /* Logitech QuickCam 4000 Pro */ { USB_DEVICE(0x046D, 0x08B3) }, /* Logitech QuickCam Zoom (old model) */ { USB_DEVICE(0x046D, 0x08B4) }, /* Logitech QuickCam Zoom (new model) */ { USB_DEVICE(0x046D, 0x08B5) }, /* Logitech QuickCam Orbit/Sphere */ { USB_DEVICE(0x046D, 0x08B6) }, /* Logitech/Cisco VT Camera */ { USB_DEVICE(0x046D, 0x08B7) }, /* Logitech ViewPort AV 100 */ { USB_DEVICE(0x046D, 0x08B8) }, /* Logitech QuickCam */ { USB_DEVICE(0x0471, 0x0302) }, /* Philips PCA645VC */ { USB_DEVICE(0x0471, 0x0303) }, /* Philips PCA646VC */ { USB_DEVICE(0x0471, 0x0304) }, /* Askey VC010 type 2 */ { USB_DEVICE(0x0471, 0x0307) }, /* Philips PCVC675K (Vesta) */ { USB_DEVICE(0x0471, 0x0308) }, /* Philips PCVC680K (Vesta Pro) */ { USB_DEVICE(0x0471, 0x030C) }, /* Philips PCVC690K (Vesta Pro Scan) */ { USB_DEVICE(0x0471, 0x0310) }, /* Philips PCVC730K (ToUCam Fun)/PCVC830 (ToUCam II) */ { USB_DEVICE(0x0471, 0x0311) }, /* Philips PCVC740K (ToUCam Pro)/PCVC840 (ToUCam II) */ { USB_DEVICE(0x0471, 0x0312) }, /* Philips PCVC750K (ToUCam Pro Scan) */ { USB_DEVICE(0x0471, 0x0313) }, /* Philips PCVC720K/40 (ToUCam XS) */ { USB_DEVICE(0x0471, 0x0329) }, /* Philips SPC 900NC webcam */ { USB_DEVICE(0x0471, 0x032C) }, /* Philips SPC 880NC webcam */ { USB_DEVICE(0x04CC, 0x8116) }, /* Sotec Afina Eye */ { USB_DEVICE(0x055D, 0x9000) }, /* Samsung MPC-C10 */ { USB_DEVICE(0x055D, 0x9001) }, /* Samsung MPC-C30 */ { USB_DEVICE(0x055D, 0x9002) }, /* Samsung SNC-35E (Ver3.0) */ { USB_DEVICE(0x069A, 0x0001) }, /* Askey VC010 type 1 */ { USB_DEVICE(0x06BE, 0x8116) }, /* AME Co. Afina Eye */ { USB_DEVICE(0x0d81, 0x1900) }, /* Visionite VCS-UC300 */ { USB_DEVICE(0x0d81, 0x1910) }, /* Visionite VCS-UM100 */ { } }; MODULE_DEVICE_TABLE(usb, pwc_device_table); static int usb_pwc_probe(struct usb_interface *intf, const struct usb_device_id *id); static void usb_pwc_disconnect(struct usb_interface *intf); static void pwc_isoc_cleanup(struct pwc_device *pdev); static struct usb_driver pwc_driver = { .name = "Philips webcam", /* name */ .id_table = pwc_device_table, .probe = usb_pwc_probe, /* probe() */ .disconnect = usb_pwc_disconnect, /* disconnect() */ }; #define MAX_DEV_HINTS 20 #define MAX_ISOC_ERRORS 20 #ifdef CONFIG_USB_PWC_DEBUG int pwc_trace = PWC_DEBUG_LEVEL; #endif static int power_save = -1; static int leds[2] = { 100, 0 }; /***/ static const struct v4l2_file_operations pwc_fops = { .owner = THIS_MODULE, .open = v4l2_fh_open, .release = vb2_fop_release, .read = vb2_fop_read, .poll = vb2_fop_poll, .mmap = vb2_fop_mmap, .unlocked_ioctl = video_ioctl2, }; static const struct video_device pwc_template = { .name = "Philips Webcam", /* Filled in later */ .release = video_device_release_empty, .fops = &pwc_fops, .ioctl_ops = &pwc_ioctl_ops, }; /***************************************************************************/ /* Private functions */ static void *pwc_alloc_urb_buffer(struct usb_device *dev, size_t size, dma_addr_t *dma_handle) { struct device *dmadev = dev->bus->sysdev; void *buffer = kmalloc(size, GFP_KERNEL); if (!buffer) return NULL; *dma_handle = dma_map_single(dmadev, buffer, size, DMA_FROM_DEVICE); if (dma_mapping_error(dmadev, *dma_handle)) { kfree(buffer); return NULL; } return buffer; } static void pwc_free_urb_buffer(struct usb_device *dev, size_t size, void *buffer, dma_addr_t dma_handle) { struct device *dmadev = dev->bus->sysdev; dma_unmap_single(dmadev, dma_handle, size, DMA_FROM_DEVICE); kfree(buffer); } static struct pwc_frame_buf *pwc_get_next_fill_buf(struct pwc_device *pdev) { unsigned long flags = 0; struct pwc_frame_buf *buf = NULL; spin_lock_irqsave(&pdev->queued_bufs_lock, flags); if (list_empty(&pdev->queued_bufs)) goto leave; buf = list_entry(pdev->queued_bufs.next, struct pwc_frame_buf, list); list_del(&buf->list); leave: spin_unlock_irqrestore(&pdev->queued_bufs_lock, flags); return buf; } static void pwc_snapshot_button(struct pwc_device *pdev, int down) { if (down) { PWC_TRACE("Snapshot button pressed.\n"); } else { PWC_TRACE("Snapshot button released.\n"); } #ifdef CONFIG_USB_PWC_INPUT_EVDEV if (pdev->button_dev) { input_report_key(pdev->button_dev, KEY_CAMERA, down); input_sync(pdev->button_dev); } #endif } static void pwc_frame_complete(struct pwc_device *pdev) { struct pwc_frame_buf *fbuf = pdev->fill_buf; /* The ToUCam Fun CMOS sensor causes the firmware to send 2 or 3 bogus frames on the USB wire after an exposure change. This conditition is however detected in the cam and a bit is set in the header. */ if (pdev->type == 730) { unsigned char *ptr = (unsigned char *)fbuf->data; if (ptr[1] == 1 && ptr[0] & 0x10) { PWC_TRACE("Hyundai CMOS sensor bug. Dropping frame.\n"); pdev->drop_frames += 2; } if ((ptr[0] ^ pdev->vmirror) & 0x01) { pwc_snapshot_button(pdev, ptr[0] & 0x01); } if ((ptr[0] ^ pdev->vmirror) & 0x02) { if (ptr[0] & 0x02) PWC_TRACE("Image is mirrored.\n"); else PWC_TRACE("Image is normal.\n"); } pdev->vmirror = ptr[0] & 0x03; /* Sometimes the trailer of the 730 is still sent as a 4 byte packet after a short frame; this condition is filtered out specifically. A 4 byte frame doesn't make sense anyway. So we get either this sequence: drop_bit set -> 4 byte frame -> short frame -> good frame Or this one: drop_bit set -> short frame -> good frame So we drop either 3 or 2 frames in all! */ if (fbuf->filled == 4) pdev->drop_frames++; } else if (pdev->type == 740 || pdev->type == 720) { unsigned char *ptr = (unsigned char *)fbuf->data; if ((ptr[0] ^ pdev->vmirror) & 0x01) { pwc_snapshot_button(pdev, ptr[0] & 0x01); } pdev->vmirror = ptr[0] & 0x03; } /* In case we were instructed to drop the frame, do so silently. */ if (pdev->drop_frames > 0) { pdev->drop_frames--; } else { /* Check for underflow first */ if (fbuf->filled < pdev->frame_total_size) { PWC_DEBUG_FLOW("Frame buffer underflow (%d bytes); discarded.\n", fbuf->filled); } else { fbuf->vb.field = V4L2_FIELD_NONE; fbuf->vb.sequence = pdev->vframe_count; vb2_buffer_done(&fbuf->vb.vb2_buf, VB2_BUF_STATE_DONE); pdev->fill_buf = NULL; pdev->vsync = 0; } } /* !drop_frames */ pdev->vframe_count++; } /* This gets called for the Isochronous pipe (video). This is done in * interrupt time, so it has to be fast, not crash, and not stall. Neat. */ static void pwc_isoc_handler(struct urb *urb) { struct pwc_device *pdev = (struct pwc_device *)urb->context; struct device *dmadev = urb->dev->bus->sysdev; int i, fst, flen; unsigned char *iso_buf = NULL; trace_pwc_handler_enter(urb, pdev); if (urb->status == -ENOENT || urb->status == -ECONNRESET || urb->status == -ESHUTDOWN) { PWC_DEBUG_OPEN("URB (%p) unlinked %ssynchronously.\n", urb, urb->status == -ENOENT ? "" : "a"); return; } if (pdev->fill_buf == NULL) pdev->fill_buf = pwc_get_next_fill_buf(pdev); if (urb->status != 0) { const char *errmsg; errmsg = "Unknown"; switch(urb->status) { case -ENOSR: errmsg = "Buffer error (overrun)"; break; case -EPIPE: errmsg = "Stalled (device not responding)"; break; case -EOVERFLOW: errmsg = "Babble (bad cable?)"; break; case -EPROTO: errmsg = "Bit-stuff error (bad cable?)"; break; case -EILSEQ: errmsg = "CRC/Timeout (could be anything)"; break; case -ETIME: errmsg = "Device does not respond"; break; } PWC_ERROR("pwc_isoc_handler() called with status %d [%s].\n", urb->status, errmsg); /* Give up after a number of contiguous errors */ if (++pdev->visoc_errors > MAX_ISOC_ERRORS) { PWC_ERROR("Too many ISOC errors, bailing out.\n"); if (pdev->fill_buf) { vb2_buffer_done(&pdev->fill_buf->vb.vb2_buf, VB2_BUF_STATE_ERROR); pdev->fill_buf = NULL; } } pdev->vsync = 0; /* Drop the current frame */ goto handler_end; } /* Reset ISOC error counter. We did get here, after all. */ pdev->visoc_errors = 0; dma_sync_single_for_cpu(dmadev, urb->transfer_dma, urb->transfer_buffer_length, DMA_FROM_DEVICE); /* vsync: 0 = don't copy data 1 = sync-hunt 2 = synched */ /* Compact data */ for (i = 0; i < urb->number_of_packets; i++) { fst = urb->iso_frame_desc[i].status; flen = urb->iso_frame_desc[i].actual_length; iso_buf = urb->transfer_buffer + urb->iso_frame_desc[i].offset; if (fst != 0) { PWC_ERROR("Iso frame %d has error %d\n", i, fst); continue; } if (flen > 0 && pdev->vsync) { struct pwc_frame_buf *fbuf = pdev->fill_buf; if (pdev->vsync == 1) { fbuf->vb.vb2_buf.timestamp = ktime_get_ns(); pdev->vsync = 2; } if (flen + fbuf->filled > pdev->frame_total_size) { PWC_ERROR("Frame overflow (%d > %d)\n", flen + fbuf->filled, pdev->frame_total_size); pdev->vsync = 0; /* Let's wait for an EOF */ } else { memcpy(fbuf->data + fbuf->filled, iso_buf, flen); fbuf->filled += flen; } } if (flen < pdev->vlast_packet_size) { /* Shorter packet... end of frame */ if (pdev->vsync == 2) pwc_frame_complete(pdev); if (pdev->fill_buf == NULL) pdev->fill_buf = pwc_get_next_fill_buf(pdev); if (pdev->fill_buf) { pdev->fill_buf->filled = 0; pdev->vsync = 1; } } pdev->vlast_packet_size = flen; } dma_sync_single_for_device(dmadev, urb->transfer_dma, urb->transfer_buffer_length, DMA_FROM_DEVICE); handler_end: trace_pwc_handler_exit(urb, pdev); i = usb_submit_urb(urb, GFP_ATOMIC); if (i != 0) PWC_ERROR("Error (%d) re-submitting urb in pwc_isoc_handler.\n", i); } /* Both v4l2_lock and vb_queue_lock should be locked when calling this */ static int pwc_isoc_init(struct pwc_device *pdev) { struct usb_device *udev; struct urb *urb; int i, j, ret; struct usb_interface *intf; struct usb_host_interface *idesc = NULL; int compression = 0; /* 0..3 = uncompressed..high */ pdev->vsync = 0; pdev->vlast_packet_size = 0; pdev->fill_buf = NULL; pdev->vframe_count = 0; pdev->visoc_errors = 0; udev = pdev->udev; retry: /* We first try with low compression and then retry with a higher compression setting if there is not enough bandwidth. */ ret = pwc_set_video_mode(pdev, pdev->width, pdev->height, pdev->pixfmt, pdev->vframes, &compression, 1); /* Get the current alternate interface, adjust packet size */ intf = usb_ifnum_to_if(udev, 0); if (intf) idesc = usb_altnum_to_altsetting(intf, pdev->valternate); if (!idesc) return -EIO; /* Search video endpoint */ pdev->vmax_packet_size = -1; for (i = 0; i < idesc->desc.bNumEndpoints; i++) { if ((idesc->endpoint[i].desc.bEndpointAddress & 0xF) == pdev->vendpoint) { pdev->vmax_packet_size = le16_to_cpu(idesc->endpoint[i].desc.wMaxPacketSize); break; } } if (pdev->vmax_packet_size < 0 || pdev->vmax_packet_size > ISO_MAX_FRAME_SIZE) { PWC_ERROR("Failed to find packet size for video endpoint in current alternate setting.\n"); return -ENFILE; /* Odd error, that should be noticeable */ } /* Set alternate interface */ PWC_DEBUG_OPEN("Setting alternate interface %d\n", pdev->valternate); ret = usb_set_interface(pdev->udev, 0, pdev->valternate); if (ret == -ENOSPC && compression < 3) { compression++; goto retry; } if (ret < 0) return ret; /* Allocate and init Isochronuous urbs */ for (i = 0; i < MAX_ISO_BUFS; i++) { urb = usb_alloc_urb(ISO_FRAMES_PER_DESC, GFP_KERNEL); if (urb == NULL) { pwc_isoc_cleanup(pdev); return -ENOMEM; } pdev->urbs[i] = urb; PWC_DEBUG_MEMORY("Allocated URB at 0x%p\n", urb); urb->interval = 1; // devik urb->dev = udev; urb->pipe = usb_rcvisocpipe(udev, pdev->vendpoint); urb->transfer_flags = URB_ISO_ASAP | URB_NO_TRANSFER_DMA_MAP; urb->transfer_buffer_length = ISO_BUFFER_SIZE; urb->transfer_buffer = pwc_alloc_urb_buffer(udev, urb->transfer_buffer_length, &urb->transfer_dma); if (urb->transfer_buffer == NULL) { PWC_ERROR("Failed to allocate urb buffer %d\n", i); pwc_isoc_cleanup(pdev); return -ENOMEM; } urb->complete = pwc_isoc_handler; urb->context = pdev; urb->start_frame = 0; urb->number_of_packets = ISO_FRAMES_PER_DESC; for (j = 0; j < ISO_FRAMES_PER_DESC; j++) { urb->iso_frame_desc[j].offset = j * ISO_MAX_FRAME_SIZE; urb->iso_frame_desc[j].length = pdev->vmax_packet_size; } } /* link */ for (i = 0; i < MAX_ISO_BUFS; i++) { ret = usb_submit_urb(pdev->urbs[i], GFP_KERNEL); if (ret == -ENOSPC && compression < 3) { compression++; pwc_isoc_cleanup(pdev); goto retry; } if (ret) { PWC_ERROR("isoc_init() submit_urb %d failed with error %d\n", i, ret); pwc_isoc_cleanup(pdev); return ret; } PWC_DEBUG_MEMORY("URB 0x%p submitted.\n", pdev->urbs[i]); } /* All is done... */ PWC_DEBUG_OPEN("<< pwc_isoc_init()\n"); return 0; } static void pwc_iso_stop(struct pwc_device *pdev) { int i; /* Unlinking ISOC buffers one by one */ for (i = 0; i < MAX_ISO_BUFS; i++) { if (pdev->urbs[i]) { PWC_DEBUG_MEMORY("Unlinking URB %p\n", pdev->urbs[i]); usb_kill_urb(pdev->urbs[i]); } } } static void pwc_iso_free(struct pwc_device *pdev) { int i; /* Freeing ISOC buffers one by one */ for (i = 0; i < MAX_ISO_BUFS; i++) { struct urb *urb = pdev->urbs[i]; if (urb) { PWC_DEBUG_MEMORY("Freeing URB\n"); if (urb->transfer_buffer) pwc_free_urb_buffer(urb->dev, urb->transfer_buffer_length, urb->transfer_buffer, urb->transfer_dma); usb_free_urb(urb); pdev->urbs[i] = NULL; } } } /* Both v4l2_lock and vb_queue_lock should be locked when calling this */ static void pwc_isoc_cleanup(struct pwc_device *pdev) { PWC_DEBUG_OPEN(">> pwc_isoc_cleanup()\n"); pwc_iso_stop(pdev); pwc_iso_free(pdev); usb_set_interface(pdev->udev, 0, 0); PWC_DEBUG_OPEN("<< pwc_isoc_cleanup()\n"); } /* Must be called with vb_queue_lock hold */ static void pwc_cleanup_queued_bufs(struct pwc_device *pdev, enum vb2_buffer_state state) { unsigned long flags = 0; spin_lock_irqsave(&pdev->queued_bufs_lock, flags); while (!list_empty(&pdev->queued_bufs)) { struct pwc_frame_buf *buf; buf = list_entry(pdev->queued_bufs.next, struct pwc_frame_buf, list); list_del(&buf->list); vb2_buffer_done(&buf->vb.vb2_buf, state); } spin_unlock_irqrestore(&pdev->queued_bufs_lock, flags); } #ifdef CONFIG_USB_PWC_DEBUG static const char *pwc_sensor_type_to_string(unsigned int sensor_type) { switch(sensor_type) { case 0x00: return "Hyundai CMOS sensor"; case 0x20: return "Sony CCD sensor + TDA8787"; case 0x2E: return "Sony CCD sensor + Exas 98L59"; case 0x2F: return "Sony CCD sensor + ADI 9804"; case 0x30: return "Sharp CCD sensor + TDA8787"; case 0x3E: return "Sharp CCD sensor + Exas 98L59"; case 0x3F: return "Sharp CCD sensor + ADI 9804"; case 0x40: return "UPA 1021 sensor"; case 0x100: return "VGA sensor"; case 0x101: return "PAL MR sensor"; default: return "unknown type of sensor"; } } #endif /***************************************************************************/ /* Video4Linux functions */ static void pwc_video_release(struct v4l2_device *v) { struct pwc_device *pdev = container_of(v, struct pwc_device, v4l2_dev); v4l2_ctrl_handler_free(&pdev->ctrl_handler); v4l2_device_unregister(&pdev->v4l2_dev); kfree(pdev->ctrl_buf); kfree(pdev); } /***************************************************************************/ /* Videobuf2 operations */ static int queue_setup(struct vb2_queue *vq, unsigned int *nbuffers, unsigned int *nplanes, unsigned int sizes[], struct device *alloc_devs[]) { struct pwc_device *pdev = vb2_get_drv_priv(vq); int size; if (*nbuffers < MIN_FRAMES) *nbuffers = MIN_FRAMES; else if (*nbuffers > MAX_FRAMES) *nbuffers = MAX_FRAMES; *nplanes = 1; size = pwc_get_size(pdev, MAX_WIDTH, MAX_HEIGHT); sizes[0] = PAGE_ALIGN(pwc_image_sizes[size][0] * pwc_image_sizes[size][1] * 3 / 2); return 0; } static int buffer_init(struct vb2_buffer *vb) { struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb); struct pwc_frame_buf *buf = container_of(vbuf, struct pwc_frame_buf, vb); /* need vmalloc since frame buffer > 128K */ buf->data = vzalloc(PWC_FRAME_SIZE); if (buf->data == NULL) return -ENOMEM; return 0; } static int buffer_prepare(struct vb2_buffer *vb) { struct pwc_device *pdev = vb2_get_drv_priv(vb->vb2_queue); /* Don't allow queueing new buffers after device disconnection */ if (!pdev->udev) return -ENODEV; return 0; } static void buffer_finish(struct vb2_buffer *vb) { struct pwc_device *pdev = vb2_get_drv_priv(vb->vb2_queue); struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb); struct pwc_frame_buf *buf = container_of(vbuf, struct pwc_frame_buf, vb); if (vb->state == VB2_BUF_STATE_DONE) { /* * Application has called dqbuf and is getting back a buffer * we've filled, take the pwc data we've stored in buf->data * and decompress it into a usable format, storing the result * in the vb2_buffer. */ pwc_decompress(pdev, buf); } } static void buffer_cleanup(struct vb2_buffer *vb) { struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb); struct pwc_frame_buf *buf = container_of(vbuf, struct pwc_frame_buf, vb); vfree(buf->data); } static void buffer_queue(struct vb2_buffer *vb) { struct pwc_device *pdev = vb2_get_drv_priv(vb->vb2_queue); struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb); struct pwc_frame_buf *buf = container_of(vbuf, struct pwc_frame_buf, vb); unsigned long flags = 0; /* Check the device has not disconnected between prep and queuing */ if (!pdev->udev) { vb2_buffer_done(vb, VB2_BUF_STATE_ERROR); return; } spin_lock_irqsave(&pdev->queued_bufs_lock, flags); list_add_tail(&buf->list, &pdev->queued_bufs); spin_unlock_irqrestore(&pdev->queued_bufs_lock, flags); } static int start_streaming(struct vb2_queue *vq, unsigned int count) { struct pwc_device *pdev = vb2_get_drv_priv(vq); int r; if (!pdev->udev) return -ENODEV; if (mutex_lock_interruptible(&pdev->v4l2_lock)) return -ERESTARTSYS; /* Turn on camera and set LEDS on */ pwc_camera_power(pdev, 1); pwc_set_leds(pdev, leds[0], leds[1]); r = pwc_isoc_init(pdev); if (r) { /* If we failed turn camera and LEDS back off */ pwc_set_leds(pdev, 0, 0); pwc_camera_power(pdev, 0); /* And cleanup any queued bufs!! */ pwc_cleanup_queued_bufs(pdev, VB2_BUF_STATE_QUEUED); } mutex_unlock(&pdev->v4l2_lock); return r; } static void stop_streaming(struct vb2_queue *vq) { struct pwc_device *pdev = vb2_get_drv_priv(vq); mutex_lock(&pdev->v4l2_lock); if (pdev->udev) { pwc_set_leds(pdev, 0, 0); pwc_camera_power(pdev, 0); pwc_isoc_cleanup(pdev); } pwc_cleanup_queued_bufs(pdev, VB2_BUF_STATE_ERROR); if (pdev->fill_buf) vb2_buffer_done(&pdev->fill_buf->vb.vb2_buf, VB2_BUF_STATE_ERROR); mutex_unlock(&pdev->v4l2_lock); } static const struct vb2_ops pwc_vb_queue_ops = { .queue_setup = queue_setup, .buf_init = buffer_init, .buf_prepare = buffer_prepare, .buf_finish = buffer_finish, .buf_cleanup = buffer_cleanup, .buf_queue = buffer_queue, .start_streaming = start_streaming, .stop_streaming = stop_streaming, .wait_prepare = vb2_ops_wait_prepare, .wait_finish = vb2_ops_wait_finish, }; /***************************************************************************/ /* USB functions */ /* This function gets called when a new device is plugged in or the usb core * is loaded. */ static int usb_pwc_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usb_device *udev = interface_to_usbdev(intf); struct pwc_device *pdev = NULL; int vendor_id, product_id, type_id; int rc; int features = 0; int compression = 0; int my_power_save = power_save; char serial_number[30], *name; vendor_id = le16_to_cpu(udev->descriptor.idVendor); product_id = le16_to_cpu(udev->descriptor.idProduct); /* Check if we can handle this device */ PWC_DEBUG_PROBE("probe() called [%04X %04X], if %d\n", vendor_id, product_id, intf->altsetting->desc.bInterfaceNumber); /* the interfaces are probed one by one. We are only interested in the video interface (0) now. Interface 1 is the Audio Control, and interface 2 Audio itself. */ if (intf->altsetting->desc.bInterfaceNumber > 0) return -ENODEV; if (vendor_id == 0x0471) { switch (product_id) { case 0x0302: PWC_INFO("Philips PCA645VC USB webcam detected.\n"); name = "Philips 645 webcam"; type_id = 645; break; case 0x0303: PWC_INFO("Philips PCA646VC USB webcam detected.\n"); name = "Philips 646 webcam"; type_id = 646; break; case 0x0304: PWC_INFO("Askey VC010 type 2 USB webcam detected.\n"); name = "Askey VC010 webcam"; type_id = 646; break; case 0x0307: PWC_INFO("Philips PCVC675K (Vesta) USB webcam detected.\n"); name = "Philips 675 webcam"; type_id = 675; break; case 0x0308: PWC_INFO("Philips PCVC680K (Vesta Pro) USB webcam detected.\n"); name = "Philips 680 webcam"; type_id = 680; break; case 0x030C: PWC_INFO("Philips PCVC690K (Vesta Pro Scan) USB webcam detected.\n"); name = "Philips 690 webcam"; type_id = 690; break; case 0x0310: PWC_INFO("Philips PCVC730K (ToUCam Fun)/PCVC830 (ToUCam II) USB webcam detected.\n"); name = "Philips 730 webcam"; type_id = 730; break; case 0x0311: PWC_INFO("Philips PCVC740K (ToUCam Pro)/PCVC840 (ToUCam II) USB webcam detected.\n"); name = "Philips 740 webcam"; type_id = 740; break; case 0x0312: PWC_INFO("Philips PCVC750K (ToUCam Pro Scan) USB webcam detected.\n"); name = "Philips 750 webcam"; type_id = 750; break; case 0x0313: PWC_INFO("Philips PCVC720K/40 (ToUCam XS) USB webcam detected.\n"); name = "Philips 720K/40 webcam"; type_id = 720; break; case 0x0329: PWC_INFO("Philips SPC 900NC USB webcam detected.\n"); name = "Philips SPC 900NC webcam"; type_id = 740; break; case 0x032C: PWC_INFO("Philips SPC 880NC USB webcam detected.\n"); name = "Philips SPC 880NC webcam"; type_id = 740; break; default: return -ENODEV; } } else if (vendor_id == 0x069A) { switch(product_id) { case 0x0001: PWC_INFO("Askey VC010 type 1 USB webcam detected.\n"); name = "Askey VC010 webcam"; type_id = 645; break; default: return -ENODEV; } } else if (vendor_id == 0x046d) { switch(product_id) { case 0x08b0: PWC_INFO("Logitech QuickCam Pro 3000 USB webcam detected.\n"); name = "Logitech QuickCam Pro 3000"; type_id = 740; /* CCD sensor */ break; case 0x08b1: PWC_INFO("Logitech QuickCam Notebook Pro USB webcam detected.\n"); name = "Logitech QuickCam Notebook Pro"; type_id = 740; /* CCD sensor */ break; case 0x08b2: PWC_INFO("Logitech QuickCam 4000 Pro USB webcam detected.\n"); name = "Logitech QuickCam Pro 4000"; type_id = 740; /* CCD sensor */ if (my_power_save == -1) my_power_save = 1; break; case 0x08b3: PWC_INFO("Logitech QuickCam Zoom USB webcam detected.\n"); name = "Logitech QuickCam Zoom"; type_id = 740; /* CCD sensor */ break; case 0x08B4: PWC_INFO("Logitech QuickCam Zoom (new model) USB webcam detected.\n"); name = "Logitech QuickCam Zoom"; type_id = 740; /* CCD sensor */ if (my_power_save == -1) my_power_save = 1; break; case 0x08b5: PWC_INFO("Logitech QuickCam Orbit/Sphere USB webcam detected.\n"); name = "Logitech QuickCam Orbit"; type_id = 740; /* CCD sensor */ if (my_power_save == -1) my_power_save = 1; features |= FEATURE_MOTOR_PANTILT; break; case 0x08b6: PWC_INFO("Logitech/Cisco VT Camera webcam detected.\n"); name = "Cisco VT Camera"; type_id = 740; /* CCD sensor */ break; case 0x08b7: PWC_INFO("Logitech ViewPort AV 100 webcam detected.\n"); name = "Logitech ViewPort AV 100"; type_id = 740; /* CCD sensor */ break; case 0x08b8: /* Where this released? */ PWC_INFO("Logitech QuickCam detected (reserved ID).\n"); name = "Logitech QuickCam (res.)"; type_id = 730; /* Assuming CMOS */ break; default: return -ENODEV; } } else if (vendor_id == 0x055d) { /* I don't know the difference between the C10 and the C30; I suppose the difference is the sensor, but both cameras work equally well with a type_id of 675 */ switch(product_id) { case 0x9000: PWC_INFO("Samsung MPC-C10 USB webcam detected.\n"); name = "Samsung MPC-C10"; type_id = 675; break; case 0x9001: PWC_INFO("Samsung MPC-C30 USB webcam detected.\n"); name = "Samsung MPC-C30"; type_id = 675; break; case 0x9002: PWC_INFO("Samsung SNC-35E (v3.0) USB webcam detected.\n"); name = "Samsung MPC-C30"; type_id = 740; break; default: return -ENODEV; } } else if (vendor_id == 0x041e) { switch(product_id) { case 0x400c: PWC_INFO("Creative Labs Webcam 5 detected.\n"); name = "Creative Labs Webcam 5"; type_id = 730; if (my_power_save == -1) my_power_save = 1; break; case 0x4011: PWC_INFO("Creative Labs Webcam Pro Ex detected.\n"); name = "Creative Labs Webcam Pro Ex"; type_id = 740; break; default: return -ENODEV; } } else if (vendor_id == 0x04cc) { switch(product_id) { case 0x8116: PWC_INFO("Sotec Afina Eye USB webcam detected.\n"); name = "Sotec Afina Eye"; type_id = 730; break; default: return -ENODEV; } } else if (vendor_id == 0x06be) { switch(product_id) { case 0x8116: /* This is essentially the same cam as the Sotec Afina Eye */ PWC_INFO("AME Co. Afina Eye USB webcam detected.\n"); name = "AME Co. Afina Eye"; type_id = 750; break; default: return -ENODEV; } } else if (vendor_id == 0x0d81) { switch(product_id) { case 0x1900: PWC_INFO("Visionite VCS-UC300 USB webcam detected.\n"); name = "Visionite VCS-UC300"; type_id = 740; /* CCD sensor */ break; case 0x1910: PWC_INFO("Visionite VCS-UM100 USB webcam detected.\n"); name = "Visionite VCS-UM100"; type_id = 730; /* CMOS sensor */ break; default: return -ENODEV; } } else return -ENODEV; /* Not any of the know types; but the list keeps growing. */ if (my_power_save == -1) my_power_save = 0; memset(serial_number, 0, 30); usb_string(udev, udev->descriptor.iSerialNumber, serial_number, 29); PWC_DEBUG_PROBE("Device serial number is %s\n", serial_number); if (udev->descriptor.bNumConfigurations > 1) PWC_WARNING("Warning: more than 1 configuration available.\n"); /* Allocate structure, initialize pointers, mutexes, etc. and link it to the usb_device */ pdev = kzalloc(sizeof(struct pwc_device), GFP_KERNEL); if (pdev == NULL) { PWC_ERROR("Oops, could not allocate memory for pwc_device.\n"); return -ENOMEM; } pdev->type = type_id; pdev->features = features; pwc_construct(pdev); /* set min/max sizes correct */ mutex_init(&pdev->v4l2_lock); mutex_init(&pdev->vb_queue_lock); spin_lock_init(&pdev->queued_bufs_lock); INIT_LIST_HEAD(&pdev->queued_bufs); pdev->udev = udev; pdev->power_save = my_power_save; /* Init videobuf2 queue structure */ pdev->vb_queue.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; pdev->vb_queue.io_modes = VB2_MMAP | VB2_USERPTR | VB2_READ; pdev->vb_queue.drv_priv = pdev; pdev->vb_queue.buf_struct_size = sizeof(struct pwc_frame_buf); pdev->vb_queue.ops = &pwc_vb_queue_ops; pdev->vb_queue.mem_ops = &vb2_vmalloc_memops; pdev->vb_queue.timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC; rc = vb2_queue_init(&pdev->vb_queue); if (rc < 0) { PWC_ERROR("Oops, could not initialize vb2 queue.\n"); goto err_free_mem; } /* Init video_device structure */ pdev->vdev = pwc_template; strscpy(pdev->vdev.name, name, sizeof(pdev->vdev.name)); pdev->vdev.queue = &pdev->vb_queue; pdev->vdev.queue->lock = &pdev->vb_queue_lock; video_set_drvdata(&pdev->vdev, pdev); pdev->release = le16_to_cpu(udev->descriptor.bcdDevice); PWC_DEBUG_PROBE("Release: %04x\n", pdev->release); /* Allocate USB command buffers */ pdev->ctrl_buf = kmalloc(sizeof(pdev->cmd_buf), GFP_KERNEL); if (!pdev->ctrl_buf) { PWC_ERROR("Oops, could not allocate memory for pwc_device.\n"); rc = -ENOMEM; goto err_free_mem; } #ifdef CONFIG_USB_PWC_DEBUG /* Query sensor type */ if (pwc_get_cmos_sensor(pdev, &rc) >= 0) { PWC_DEBUG_OPEN("This %s camera is equipped with a %s (%d).\n", pdev->vdev.name, pwc_sensor_type_to_string(rc), rc); } #endif /* Set the leds off */ pwc_set_leds(pdev, 0, 0); /* Setup initial videomode */ rc = pwc_set_video_mode(pdev, MAX_WIDTH, MAX_HEIGHT, V4L2_PIX_FMT_YUV420, 30, &compression, 1); if (rc) goto err_free_mem; /* Register controls (and read default values from camera */ rc = pwc_init_controls(pdev); if (rc) { PWC_ERROR("Failed to register v4l2 controls (%d).\n", rc); goto err_free_mem; } /* And powerdown the camera until streaming starts */ pwc_camera_power(pdev, 0); /* Register the v4l2_device structure */ pdev->v4l2_dev.release = pwc_video_release; rc = v4l2_device_register(&intf->dev, &pdev->v4l2_dev); if (rc) { PWC_ERROR("Failed to register v4l2-device (%d).\n", rc); goto err_free_controls; } pdev->v4l2_dev.ctrl_handler = &pdev->ctrl_handler; pdev->vdev.v4l2_dev = &pdev->v4l2_dev; pdev->vdev.lock = &pdev->v4l2_lock; pdev->vdev.device_caps = V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_STREAMING | V4L2_CAP_READWRITE; rc = video_register_device(&pdev->vdev, VFL_TYPE_VIDEO, -1); if (rc < 0) { PWC_ERROR("Failed to register as video device (%d).\n", rc); goto err_unregister_v4l2_dev; } PWC_INFO("Registered as %s.\n", video_device_node_name(&pdev->vdev)); #ifdef CONFIG_USB_PWC_INPUT_EVDEV /* register webcam snapshot button input device */ pdev->button_dev = input_allocate_device(); if (!pdev->button_dev) { rc = -ENOMEM; goto err_video_unreg; } usb_make_path(udev, pdev->button_phys, sizeof(pdev->button_phys)); strlcat(pdev->button_phys, "/input0", sizeof(pdev->button_phys)); pdev->button_dev->name = "PWC snapshot button"; pdev->button_dev->phys = pdev->button_phys; usb_to_input_id(pdev->udev, &pdev->button_dev->id); pdev->button_dev->dev.parent = &pdev->udev->dev; pdev->button_dev->evbit[0] = BIT_MASK(EV_KEY); pdev->button_dev->keybit[BIT_WORD(KEY_CAMERA)] = BIT_MASK(KEY_CAMERA); rc = input_register_device(pdev->button_dev); if (rc) { input_free_device(pdev->button_dev); pdev->button_dev = NULL; goto err_video_unreg; } #endif return 0; #ifdef CONFIG_USB_PWC_INPUT_EVDEV err_video_unreg: video_unregister_device(&pdev->vdev); #endif err_unregister_v4l2_dev: v4l2_device_unregister(&pdev->v4l2_dev); err_free_controls: v4l2_ctrl_handler_free(&pdev->ctrl_handler); err_free_mem: kfree(pdev->ctrl_buf); kfree(pdev); return rc; } /* The user yanked out the cable... */ static void usb_pwc_disconnect(struct usb_interface *intf) { struct v4l2_device *v = usb_get_intfdata(intf); struct pwc_device *pdev = container_of(v, struct pwc_device, v4l2_dev); mutex_lock(&pdev->vb_queue_lock); mutex_lock(&pdev->v4l2_lock); /* No need to keep the urbs around after disconnection */ if (pdev->vb_queue.streaming) pwc_isoc_cleanup(pdev); pdev->udev = NULL; v4l2_device_disconnect(&pdev->v4l2_dev); video_unregister_device(&pdev->vdev); mutex_unlock(&pdev->v4l2_lock); mutex_unlock(&pdev->vb_queue_lock); #ifdef CONFIG_USB_PWC_INPUT_EVDEV if (pdev->button_dev) input_unregister_device(pdev->button_dev); #endif v4l2_device_put(&pdev->v4l2_dev); } /* * Initialization code & module stuff */ static unsigned int leds_nargs; #ifdef CONFIG_USB_PWC_DEBUG module_param_named(trace, pwc_trace, int, 0644); #endif module_param(power_save, int, 0644); module_param_array(leds, int, &leds_nargs, 0444); #ifdef CONFIG_USB_PWC_DEBUG MODULE_PARM_DESC(trace, "For debugging purposes"); #endif MODULE_PARM_DESC(power_save, "Turn power saving for new cameras on or off"); MODULE_PARM_DESC(leds, "LED on,off time in milliseconds"); MODULE_DESCRIPTION("Philips & OEM USB webcam driver"); MODULE_AUTHOR("Luc Saillard <luc@saillard.org>"); MODULE_LICENSE("GPL"); MODULE_ALIAS("pwcx"); MODULE_VERSION( PWC_VERSION ); module_usb_driver(pwc_driver);
438 35 65 19 20 20 20 20 20 17 34 38 1 29 1 20 46 1 20 2 103 1 50 17 20 66 44 20 79 49 9 49 49 65 32 63 27 33 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 /* SPDX-License-Identifier: GPL-2.0 */ /* Multipath TCP * * Copyright (c) 2017 - 2019, Intel Corporation. */ #ifndef __MPTCP_PROTOCOL_H #define __MPTCP_PROTOCOL_H #include <linux/random.h> #include <net/tcp.h> #include <net/inet_connection_sock.h> #include <uapi/linux/mptcp.h> #include <net/genetlink.h> #include "mptcp_pm_gen.h" #define MPTCP_SUPPORTED_VERSION 1 /* MPTCP option bits */ #define OPTION_MPTCP_MPC_SYN BIT(0) #define OPTION_MPTCP_MPC_SYNACK BIT(1) #define OPTION_MPTCP_MPC_ACK BIT(2) #define OPTION_MPTCP_MPJ_SYN BIT(3) #define OPTION_MPTCP_MPJ_SYNACK BIT(4) #define OPTION_MPTCP_MPJ_ACK BIT(5) #define OPTION_MPTCP_ADD_ADDR BIT(6) #define OPTION_MPTCP_RM_ADDR BIT(7) #define OPTION_MPTCP_FASTCLOSE BIT(8) #define OPTION_MPTCP_PRIO BIT(9) #define OPTION_MPTCP_RST BIT(10) #define OPTION_MPTCP_DSS BIT(11) #define OPTION_MPTCP_FAIL BIT(12) #define OPTION_MPTCP_CSUMREQD BIT(13) #define OPTIONS_MPTCP_MPC (OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK | \ OPTION_MPTCP_MPC_ACK) #define OPTIONS_MPTCP_MPJ (OPTION_MPTCP_MPJ_SYN | OPTION_MPTCP_MPJ_SYNACK | \ OPTION_MPTCP_MPJ_ACK) /* MPTCP option subtypes */ #define MPTCPOPT_MP_CAPABLE 0 #define MPTCPOPT_MP_JOIN 1 #define MPTCPOPT_DSS 2 #define MPTCPOPT_ADD_ADDR 3 #define MPTCPOPT_RM_ADDR 4 #define MPTCPOPT_MP_PRIO 5 #define MPTCPOPT_MP_FAIL 6 #define MPTCPOPT_MP_FASTCLOSE 7 #define MPTCPOPT_RST 8 /* MPTCP suboption lengths */ #define TCPOLEN_MPTCP_MPC_SYN 4 #define TCPOLEN_MPTCP_MPC_SYNACK 12 #define TCPOLEN_MPTCP_MPC_ACK 20 #define TCPOLEN_MPTCP_MPC_ACK_DATA 22 #define TCPOLEN_MPTCP_MPJ_SYN 12 #define TCPOLEN_MPTCP_MPJ_SYNACK 16 #define TCPOLEN_MPTCP_MPJ_ACK 24 #define TCPOLEN_MPTCP_DSS_BASE 4 #define TCPOLEN_MPTCP_DSS_ACK32 4 #define TCPOLEN_MPTCP_DSS_ACK64 8 #define TCPOLEN_MPTCP_DSS_MAP32 10 #define TCPOLEN_MPTCP_DSS_MAP64 14 #define TCPOLEN_MPTCP_DSS_CHECKSUM 2 #define TCPOLEN_MPTCP_ADD_ADDR 16 #define TCPOLEN_MPTCP_ADD_ADDR_PORT 18 #define TCPOLEN_MPTCP_ADD_ADDR_BASE 8 #define TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT 10 #define TCPOLEN_MPTCP_ADD_ADDR6 28 #define TCPOLEN_MPTCP_ADD_ADDR6_PORT 30 #define TCPOLEN_MPTCP_ADD_ADDR6_BASE 20 #define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT 22 #define TCPOLEN_MPTCP_PORT_LEN 2 #define TCPOLEN_MPTCP_PORT_ALIGN 2 #define TCPOLEN_MPTCP_RM_ADDR_BASE 3 #define TCPOLEN_MPTCP_PRIO 3 #define TCPOLEN_MPTCP_PRIO_ALIGN 4 #define TCPOLEN_MPTCP_FASTCLOSE 12 #define TCPOLEN_MPTCP_RST 4 #define TCPOLEN_MPTCP_FAIL 12 #define TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM (TCPOLEN_MPTCP_DSS_CHECKSUM + TCPOLEN_MPTCP_MPC_ACK_DATA) /* MPTCP MP_JOIN flags */ #define MPTCPOPT_BACKUP BIT(0) #define MPTCPOPT_THMAC_LEN 8 /* MPTCP MP_CAPABLE flags */ #define MPTCP_VERSION_MASK (0x0F) #define MPTCP_CAP_CHECKSUM_REQD BIT(7) #define MPTCP_CAP_EXTENSIBILITY BIT(6) #define MPTCP_CAP_DENY_JOIN_ID0 BIT(5) #define MPTCP_CAP_HMAC_SHA256 BIT(0) #define MPTCP_CAP_FLAG_MASK (0x1F) /* MPTCP DSS flags */ #define MPTCP_DSS_DATA_FIN BIT(4) #define MPTCP_DSS_DSN64 BIT(3) #define MPTCP_DSS_HAS_MAP BIT(2) #define MPTCP_DSS_ACK64 BIT(1) #define MPTCP_DSS_HAS_ACK BIT(0) #define MPTCP_DSS_FLAG_MASK (0x1F) /* MPTCP ADD_ADDR flags */ #define MPTCP_ADDR_ECHO BIT(0) /* MPTCP MP_PRIO flags */ #define MPTCP_PRIO_BKUP BIT(0) /* MPTCP TCPRST flags */ #define MPTCP_RST_TRANSIENT BIT(0) /* MPTCP socket atomic flags */ #define MPTCP_NOSPACE 1 #define MPTCP_WORK_RTX 2 #define MPTCP_FALLBACK_DONE 4 #define MPTCP_WORK_CLOSE_SUBFLOW 5 /* MPTCP socket release cb flags */ #define MPTCP_PUSH_PENDING 1 #define MPTCP_CLEAN_UNA 2 #define MPTCP_ERROR_REPORT 3 #define MPTCP_RETRANSMIT 4 #define MPTCP_FLUSH_JOIN_LIST 5 #define MPTCP_SYNC_STATE 6 #define MPTCP_SYNC_SNDBUF 7 struct mptcp_skb_cb { u64 map_seq; u64 end_seq; u32 offset; u8 has_rxtstamp:1; }; #define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0])) static inline bool before64(__u64 seq1, __u64 seq2) { return (__s64)(seq1 - seq2) < 0; } #define after64(seq2, seq1) before64(seq1, seq2) struct mptcp_options_received { u64 sndr_key; u64 rcvr_key; u64 data_ack; u64 data_seq; u32 subflow_seq; u16 data_len; __sum16 csum; u16 suboptions; u32 token; u32 nonce; u16 use_map:1, dsn64:1, data_fin:1, use_ack:1, ack64:1, mpc_map:1, reset_reason:4, reset_transient:1, echo:1, backup:1, deny_join_id0:1, __unused:2; u8 join_id; u64 thmac; u8 hmac[MPTCPOPT_HMAC_LEN]; struct mptcp_addr_info addr; struct mptcp_rm_list rm_list; u64 ahmac; u64 fail_seq; }; static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field) { return htonl((TCPOPT_MPTCP << 24) | (len << 16) | (subopt << 12) | ((nib & 0xF) << 8) | field); } enum mptcp_pm_status { MPTCP_PM_ADD_ADDR_RECEIVED, MPTCP_PM_ADD_ADDR_SEND_ACK, MPTCP_PM_RM_ADDR_RECEIVED, MPTCP_PM_ESTABLISHED, MPTCP_PM_SUBFLOW_ESTABLISHED, MPTCP_PM_ALREADY_ESTABLISHED, /* persistent status, set after ESTABLISHED event */ MPTCP_PM_MPC_ENDPOINT_ACCOUNTED /* persistent status, set after MPC local address is * accounted int id_avail_bitmap */ }; enum mptcp_pm_type { MPTCP_PM_TYPE_KERNEL = 0, MPTCP_PM_TYPE_USERSPACE, __MPTCP_PM_TYPE_NR, __MPTCP_PM_TYPE_MAX = __MPTCP_PM_TYPE_NR - 1, }; /* Status bits below MPTCP_PM_ALREADY_ESTABLISHED need pm worker actions */ #define MPTCP_PM_WORK_MASK ((1 << MPTCP_PM_ALREADY_ESTABLISHED) - 1) enum mptcp_addr_signal_status { MPTCP_ADD_ADDR_SIGNAL, MPTCP_ADD_ADDR_ECHO, MPTCP_RM_ADDR_SIGNAL, }; /* max value of mptcp_addr_info.id */ #define MPTCP_PM_MAX_ADDR_ID U8_MAX struct mptcp_pm_data { struct mptcp_addr_info local; struct mptcp_addr_info remote; struct list_head anno_list; struct list_head userspace_pm_local_addr_list; spinlock_t lock; /*protects the whole PM data */ u8 addr_signal; bool server_side; bool work_pending; bool accept_addr; bool accept_subflow; bool remote_deny_join_id0; u8 add_addr_signaled; u8 add_addr_accepted; u8 local_addr_used; u8 pm_type; u8 subflows; u8 status; DECLARE_BITMAP(id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); struct mptcp_rm_list rm_list_tx; struct mptcp_rm_list rm_list_rx; }; struct mptcp_pm_addr_entry { struct list_head list; struct mptcp_addr_info addr; u8 flags; int ifindex; struct socket *lsk; }; struct mptcp_data_frag { struct list_head list; u64 data_seq; u16 data_len; u16 offset; u16 overhead; u16 already_sent; struct page *page; }; /* MPTCP connection sock */ struct mptcp_sock { /* inet_connection_sock must be the first member */ struct inet_connection_sock sk; u64 local_key; /* protected by the first subflow socket lock * lockless access read */ u64 remote_key; /* same as above */ u64 write_seq; u64 bytes_sent; u64 snd_nxt; u64 bytes_received; u64 ack_seq; atomic64_t rcv_wnd_sent; u64 rcv_data_fin_seq; u64 bytes_retrans; u64 bytes_consumed; int rmem_fwd_alloc; int snd_burst; int old_wspace; u64 recovery_snd_nxt; /* in recovery mode accept up to this seq; * recovery related fields are under data_lock * protection */ u64 bytes_acked; u64 snd_una; u64 wnd_end; unsigned long timer_ival; u32 token; int rmem_released; unsigned long flags; unsigned long cb_flags; bool recovery; /* closing subflow write queue reinjected */ bool can_ack; bool fully_established; bool rcv_data_fin; bool snd_data_fin_enable; bool rcv_fastclose; bool use_64bit_ack; /* Set when we received a 64-bit DSN */ bool csum_enabled; bool allow_infinite_fallback; u8 pending_state; /* A subflow asked to set this sk_state, * protected by the msk data lock */ u8 mpc_endpoint_id; u8 recvmsg_inq:1, cork:1, nodelay:1, fastopening:1, in_accept_queue:1, free_first:1, rcvspace_init:1; struct work_struct work; struct sk_buff *ooo_last_skb; struct rb_root out_of_order_queue; struct sk_buff_head receive_queue; struct list_head conn_list; struct list_head rtx_queue; struct mptcp_data_frag *first_pending; struct list_head join_list; struct sock *first; /* The mptcp ops can safely dereference, using suitable * ONCE annotation, the subflow outside the socket * lock as such sock is freed after close(). */ struct mptcp_pm_data pm; struct mptcp_sched_ops *sched; struct { u32 space; /* bytes copied in last measurement window */ u32 copied; /* bytes copied in this measurement window */ u64 time; /* start time of measurement window */ u64 rtt_us; /* last maximum rtt of subflows */ } rcvq_space; u8 scaling_ratio; u32 subflow_id; u32 setsockopt_seq; char ca_name[TCP_CA_NAME_MAX]; }; #define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock) #define mptcp_data_unlock(sk) spin_unlock_bh(&(sk)->sk_lock.slock) #define mptcp_for_each_subflow(__msk, __subflow) \ list_for_each_entry(__subflow, &((__msk)->conn_list), node) #define mptcp_for_each_subflow_safe(__msk, __subflow, __tmp) \ list_for_each_entry_safe(__subflow, __tmp, &((__msk)->conn_list), node) static inline void msk_owned_by_me(const struct mptcp_sock *msk) { sock_owned_by_me((const struct sock *)msk); } #define mptcp_sk(ptr) container_of_const(ptr, struct mptcp_sock, sk.icsk_inet.sk) /* the msk socket don't use the backlog, also account for the bulk * free memory */ static inline int __mptcp_rmem(const struct sock *sk) { return atomic_read(&sk->sk_rmem_alloc) - READ_ONCE(mptcp_sk(sk)->rmem_released); } static inline int mptcp_win_from_space(const struct sock *sk, int space) { return __tcp_win_from_space(mptcp_sk(sk)->scaling_ratio, space); } static inline int __mptcp_space(const struct sock *sk) { return mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk)); } static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk) { const struct mptcp_sock *msk = mptcp_sk(sk); return READ_ONCE(msk->first_pending); } static inline struct mptcp_data_frag *mptcp_send_next(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *cur; cur = msk->first_pending; return list_is_last(&cur->list, &msk->rtx_queue) ? NULL : list_next_entry(cur, list); } static inline struct mptcp_data_frag *mptcp_pending_tail(const struct sock *sk) { const struct mptcp_sock *msk = mptcp_sk(sk); if (!msk->first_pending) return NULL; if (WARN_ON_ONCE(list_empty(&msk->rtx_queue))) return NULL; return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list); } static inline struct mptcp_data_frag *mptcp_rtx_head(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); if (msk->snd_una == msk->snd_nxt) return NULL; return list_first_entry_or_null(&msk->rtx_queue, struct mptcp_data_frag, list); } struct csum_pseudo_header { __be64 data_seq; __be32 subflow_seq; __be16 data_len; __sum16 csum; }; struct mptcp_subflow_request_sock { struct tcp_request_sock sk; u16 mp_capable : 1, mp_join : 1, backup : 1, csum_reqd : 1, allow_join_id0 : 1; u8 local_id; u8 remote_id; u64 local_key; u64 idsn; u32 token; u32 ssn_offset; u64 thmac; u32 local_nonce; u32 remote_nonce; struct mptcp_sock *msk; struct hlist_nulls_node token_node; }; static inline struct mptcp_subflow_request_sock * mptcp_subflow_rsk(const struct request_sock *rsk) { return (struct mptcp_subflow_request_sock *)rsk; } struct mptcp_delegated_action { struct napi_struct napi; struct list_head head; }; DECLARE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions); #define MPTCP_DELEGATE_SCHEDULED 0 #define MPTCP_DELEGATE_SEND 1 #define MPTCP_DELEGATE_ACK 2 #define MPTCP_DELEGATE_SNDBUF 3 #define MPTCP_DELEGATE_ACTIONS_MASK (~BIT(MPTCP_DELEGATE_SCHEDULED)) /* MPTCP subflow context */ struct mptcp_subflow_context { struct list_head node;/* conn_list of subflows */ struct_group(reset, unsigned long avg_pacing_rate; /* protected by msk socket lock */ u64 local_key; u64 remote_key; u64 idsn; u64 map_seq; u32 snd_isn; u32 token; u32 rel_write_seq; u32 map_subflow_seq; u32 ssn_offset; u32 map_data_len; __wsum map_data_csum; u32 map_csum_len; u32 request_mptcp : 1, /* send MP_CAPABLE */ request_join : 1, /* send MP_JOIN */ request_bkup : 1, mp_capable : 1, /* remote is MPTCP capable */ mp_join : 1, /* remote is JOINing */ fully_established : 1, /* path validated */ pm_notified : 1, /* PM hook called for established status */ conn_finished : 1, map_valid : 1, map_csum_reqd : 1, map_data_fin : 1, mpc_map : 1, backup : 1, send_mp_prio : 1, send_mp_fail : 1, send_fastclose : 1, send_infinite_map : 1, remote_key_valid : 1, /* received the peer key from */ disposable : 1, /* ctx can be free at ulp release time */ stale : 1, /* unable to snd/rcv data, do not use for xmit */ local_id_valid : 1, /* local_id is correctly initialized */ valid_csum_seen : 1, /* at least one csum validated */ is_mptfo : 1, /* subflow is doing TFO */ __unused : 9; bool data_avail; bool scheduled; u32 remote_nonce; u64 thmac; u32 local_nonce; u32 remote_token; union { u8 hmac[MPTCPOPT_HMAC_LEN]; /* MPJ subflow only */ u64 iasn; /* initial ack sequence number, MPC subflows only */ }; u8 local_id; u8 remote_id; u8 reset_seen:1; u8 reset_transient:1; u8 reset_reason:4; u8 stale_count; u32 subflow_id; long delegated_status; unsigned long fail_tout; ); struct list_head delegated_node; /* link into delegated_action, protected by local BH */ u32 setsockopt_seq; u32 stale_rcv_tstamp; int cached_sndbuf; /* sndbuf size when last synced with the msk sndbuf, * protected by the msk socket lock */ struct sock *tcp_sock; /* tcp sk backpointer */ struct sock *conn; /* parent mptcp_sock */ const struct inet_connection_sock_af_ops *icsk_af_ops; void (*tcp_state_change)(struct sock *sk); void (*tcp_error_report)(struct sock *sk); struct rcu_head rcu; }; static inline struct mptcp_subflow_context * mptcp_subflow_ctx(const struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); /* Use RCU on icsk_ulp_data only for sock diag code */ return (__force struct mptcp_subflow_context *)icsk->icsk_ulp_data; } static inline struct sock * mptcp_subflow_tcp_sock(const struct mptcp_subflow_context *subflow) { return subflow->tcp_sock; } static inline void mptcp_subflow_ctx_reset(struct mptcp_subflow_context *subflow) { memset(&subflow->reset, 0, sizeof(subflow->reset)); subflow->request_mptcp = 1; } static inline u64 mptcp_subflow_get_map_offset(const struct mptcp_subflow_context *subflow) { return tcp_sk(mptcp_subflow_tcp_sock(subflow))->copied_seq - subflow->ssn_offset - subflow->map_subflow_seq; } static inline u64 mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow) { return subflow->map_seq + mptcp_subflow_get_map_offset(subflow); } void mptcp_subflow_process_delegated(struct sock *ssk, long actions); static inline void mptcp_subflow_delegate(struct mptcp_subflow_context *subflow, int action) { long old, set_bits = BIT(MPTCP_DELEGATE_SCHEDULED) | BIT(action); struct mptcp_delegated_action *delegated; bool schedule; /* the caller held the subflow bh socket lock */ lockdep_assert_in_softirq(); /* The implied barrier pairs with tcp_release_cb_override() * mptcp_napi_poll(), and ensures the below list check sees list * updates done prior to delegated status bits changes */ old = set_mask_bits(&subflow->delegated_status, 0, set_bits); if (!(old & BIT(MPTCP_DELEGATE_SCHEDULED))) { if (WARN_ON_ONCE(!list_empty(&subflow->delegated_node))) return; delegated = this_cpu_ptr(&mptcp_delegated_actions); schedule = list_empty(&delegated->head); list_add_tail(&subflow->delegated_node, &delegated->head); sock_hold(mptcp_subflow_tcp_sock(subflow)); if (schedule) napi_schedule(&delegated->napi); } } static inline struct mptcp_subflow_context * mptcp_subflow_delegated_next(struct mptcp_delegated_action *delegated) { struct mptcp_subflow_context *ret; if (list_empty(&delegated->head)) return NULL; ret = list_first_entry(&delegated->head, struct mptcp_subflow_context, delegated_node); list_del_init(&ret->delegated_node); return ret; } int mptcp_is_enabled(const struct net *net); unsigned int mptcp_get_add_addr_timeout(const struct net *net); int mptcp_is_checksum_enabled(const struct net *net); int mptcp_allow_join_id0(const struct net *net); unsigned int mptcp_stale_loss_cnt(const struct net *net); unsigned int mptcp_close_timeout(const struct sock *sk); int mptcp_get_pm_type(const struct net *net); const char *mptcp_get_scheduler(const struct net *net); void __mptcp_subflow_fully_established(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, const struct mptcp_options_received *mp_opt); bool __mptcp_retransmit_pending_data(struct sock *sk); void mptcp_check_and_set_pending(struct sock *sk); void __mptcp_push_pending(struct sock *sk, unsigned int flags); bool mptcp_subflow_data_available(struct sock *sk); void __init mptcp_subflow_init(void); void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how); void mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow); void __mptcp_subflow_send_ack(struct sock *ssk); void mptcp_subflow_reset(struct sock *ssk); void mptcp_subflow_queue_clean(struct sock *sk, struct sock *ssk); void mptcp_sock_graft(struct sock *sk, struct socket *parent); struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk); bool __mptcp_close(struct sock *sk, long timeout); void mptcp_cancel_work(struct sock *sk); void __mptcp_unaccepted_force_close(struct sock *sk); void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk); void mptcp_set_state(struct sock *sk, int state); bool mptcp_addresses_equal(const struct mptcp_addr_info *a, const struct mptcp_addr_info *b, bool use_port); void mptcp_local_address(const struct sock_common *skc, struct mptcp_addr_info *addr); /* called with sk socket lock held */ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, const struct mptcp_addr_info *remote); int mptcp_subflow_create_socket(struct sock *sk, unsigned short family, struct socket **new_sock); void mptcp_info2sockaddr(const struct mptcp_addr_info *info, struct sockaddr_storage *addr, unsigned short family); struct mptcp_sched_ops *mptcp_sched_find(const char *name); int mptcp_register_scheduler(struct mptcp_sched_ops *sched); void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched); void mptcp_sched_init(void); int mptcp_init_sched(struct mptcp_sock *msk, struct mptcp_sched_ops *sched); void mptcp_release_sched(struct mptcp_sock *msk); void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow, bool scheduled); struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk); struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk); int mptcp_sched_get_send(struct mptcp_sock *msk); int mptcp_sched_get_retrans(struct mptcp_sock *msk); static inline u64 mptcp_data_avail(const struct mptcp_sock *msk) { return READ_ONCE(msk->bytes_received) - READ_ONCE(msk->bytes_consumed); } static inline bool mptcp_epollin_ready(const struct sock *sk) { /* mptcp doesn't have to deal with small skbs in the receive queue, * at it can always coalesce them */ return (mptcp_data_avail(mptcp_sk(sk)) >= sk->sk_rcvlowat) || (mem_cgroup_sockets_enabled && sk->sk_memcg && mem_cgroup_under_socket_pressure(sk->sk_memcg)) || READ_ONCE(tcp_memory_pressure); } int mptcp_set_rcvlowat(struct sock *sk, int val); static inline bool __tcp_can_send(const struct sock *ssk) { /* only send if our side has not closed yet */ return ((1 << inet_sk_state_load(ssk)) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)); } static inline bool __mptcp_subflow_active(struct mptcp_subflow_context *subflow) { /* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */ if (subflow->request_join && !subflow->fully_established) return false; return __tcp_can_send(mptcp_subflow_tcp_sock(subflow)); } void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow); bool mptcp_subflow_active(struct mptcp_subflow_context *subflow); void mptcp_subflow_drop_ctx(struct sock *ssk); static inline void mptcp_subflow_tcp_fallback(struct sock *sk, struct mptcp_subflow_context *ctx) { sk->sk_data_ready = sock_def_readable; sk->sk_state_change = ctx->tcp_state_change; sk->sk_write_space = sk_stream_write_space; sk->sk_error_report = ctx->tcp_error_report; inet_csk(sk)->icsk_af_ops = ctx->icsk_af_ops; } void __init mptcp_proto_init(void); #if IS_ENABLED(CONFIG_MPTCP_IPV6) int __init mptcp_proto_v6_init(void); #endif struct sock *mptcp_sk_clone_init(const struct sock *sk, const struct mptcp_options_received *mp_opt, struct sock *ssk, struct request_sock *req); void mptcp_get_options(const struct sk_buff *skb, struct mptcp_options_received *mp_opt); void mptcp_finish_connect(struct sock *sk); void __mptcp_sync_state(struct sock *sk, int state); void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout); static inline void mptcp_stop_tout_timer(struct sock *sk) { if (!inet_csk(sk)->icsk_mtup.probe_timestamp) return; sk_stop_timer(sk, &sk->sk_timer); inet_csk(sk)->icsk_mtup.probe_timestamp = 0; } static inline void mptcp_set_close_tout(struct sock *sk, unsigned long tout) { /* avoid 0 timestamp, as that means no close timeout */ inet_csk(sk)->icsk_mtup.probe_timestamp = tout ? : 1; } static inline void mptcp_start_tout_timer(struct sock *sk) { mptcp_set_close_tout(sk, tcp_jiffies32); mptcp_reset_tout_timer(mptcp_sk(sk), 0); } static inline bool mptcp_is_fully_established(struct sock *sk) { return inet_sk_state_load(sk) == TCP_ESTABLISHED && READ_ONCE(mptcp_sk(sk)->fully_established); } void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk); void mptcp_data_ready(struct sock *sk, struct sock *ssk); bool mptcp_finish_join(struct sock *sk); bool mptcp_schedule_work(struct sock *sk); int mptcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int mptcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *option); u64 __mptcp_expand_seq(u64 old_seq, u64 cur_seq); static inline u64 mptcp_expand_seq(u64 old_seq, u64 cur_seq, bool use_64bit) { if (use_64bit) return cur_seq; return __mptcp_expand_seq(old_seq, cur_seq); } void __mptcp_check_push(struct sock *sk, struct sock *ssk); void __mptcp_data_acked(struct sock *sk); void __mptcp_error_report(struct sock *sk); bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit); static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk) { return READ_ONCE(msk->snd_data_fin_enable) && READ_ONCE(msk->write_seq) == READ_ONCE(msk->snd_nxt); } static inline void __mptcp_sync_sndbuf(struct sock *sk) { struct mptcp_subflow_context *subflow; int ssk_sndbuf, new_sndbuf; if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) return; new_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[0]; mptcp_for_each_subflow(mptcp_sk(sk), subflow) { ssk_sndbuf = READ_ONCE(mptcp_subflow_tcp_sock(subflow)->sk_sndbuf); subflow->cached_sndbuf = ssk_sndbuf; new_sndbuf += ssk_sndbuf; } /* the msk max wmem limit is <nr_subflows> * tcp wmem[2] */ WRITE_ONCE(sk->sk_sndbuf, new_sndbuf); } /* The called held both the msk socket and the subflow socket locks, * possibly under BH */ static inline void __mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); if (READ_ONCE(ssk->sk_sndbuf) != subflow->cached_sndbuf) __mptcp_sync_sndbuf(sk); } /* the caller held only the subflow socket lock, either in process or * BH context. Additionally this can be called under the msk data lock, * so we can't acquire such lock here: let the delegate action acquires * the needed locks in suitable order. */ static inline void mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); if (likely(READ_ONCE(ssk->sk_sndbuf) == subflow->cached_sndbuf)) return; local_bh_disable(); mptcp_subflow_delegate(subflow, MPTCP_DELEGATE_SNDBUF); local_bh_enable(); } static inline void mptcp_write_space(struct sock *sk) { if (sk_stream_is_writeable(sk)) { /* pairs with memory barrier in mptcp_poll */ smp_mb(); if (test_and_clear_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags)) sk_stream_write_space(sk); } } void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags); #define MPTCP_TOKEN_MAX_RETRIES 4 void __init mptcp_token_init(void); static inline void mptcp_token_init_request(struct request_sock *req) { mptcp_subflow_rsk(req)->token_node.pprev = NULL; } int mptcp_token_new_request(struct request_sock *req); void mptcp_token_destroy_request(struct request_sock *req); int mptcp_token_new_connect(struct sock *ssk); void mptcp_token_accept(struct mptcp_subflow_request_sock *r, struct mptcp_sock *msk); bool mptcp_token_exists(u32 token); struct mptcp_sock *mptcp_token_get_sock(struct net *net, u32 token); struct mptcp_sock *mptcp_token_iter_next(const struct net *net, long *s_slot, long *s_num); void mptcp_token_destroy(struct mptcp_sock *msk); void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn); void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac); __sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum); void __init mptcp_pm_init(void); void mptcp_pm_data_init(struct mptcp_sock *msk); void mptcp_pm_data_reset(struct mptcp_sock *msk); int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info, struct mptcp_addr_info *addr); int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info, bool require_family, struct mptcp_pm_addr_entry *entry); bool mptcp_pm_addr_families_match(const struct sock *sk, const struct mptcp_addr_info *loc, const struct mptcp_addr_info *rem); void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk); void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk); void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side); void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk); bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk); void mptcp_pm_connection_closed(struct mptcp_sock *msk); void mptcp_pm_subflow_established(struct mptcp_sock *msk); bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk); void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, const struct mptcp_subflow_context *subflow); void mptcp_pm_add_addr_received(const struct sock *ssk, const struct mptcp_addr_info *addr); void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk, const struct mptcp_addr_info *addr); void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk); void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk); void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup); void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq); int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, struct mptcp_addr_info *addr, struct mptcp_addr_info *rem, u8 bkup); bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, const struct mptcp_addr_info *addr); void mptcp_pm_free_anno_list(struct mptcp_sock *msk); bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk); struct mptcp_pm_add_entry * mptcp_pm_del_add_timer(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool check_id); struct mptcp_pm_add_entry * mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk, const struct mptcp_addr_info *addr); int mptcp_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id, u8 *flags, int *ifindex); int mptcp_pm_nl_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id, u8 *flags, int *ifindex); int mptcp_userspace_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id, u8 *flags, int *ifindex); int mptcp_pm_set_flags(struct net *net, struct nlattr *token, struct mptcp_pm_addr_entry *loc, struct mptcp_pm_addr_entry *rem, u8 bkup); int mptcp_pm_nl_set_flags(struct net *net, struct mptcp_pm_addr_entry *addr, u8 bkup); int mptcp_userspace_pm_set_flags(struct net *net, struct nlattr *token, struct mptcp_pm_addr_entry *loc, struct mptcp_pm_addr_entry *rem, u8 bkup); int mptcp_pm_announce_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool echo); int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); int mptcp_pm_remove_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); void mptcp_pm_remove_addrs(struct mptcp_sock *msk, struct list_head *rm_list); void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk, struct list_head *rm_list); void mptcp_free_local_addr_list(struct mptcp_sock *msk); void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp); void mptcp_event_addr_announced(const struct sock *ssk, const struct mptcp_addr_info *info); void mptcp_event_addr_removed(const struct mptcp_sock *msk, u8 id); void mptcp_event_pm_listener(const struct sock *ssk, enum mptcp_event_type event); bool mptcp_userspace_pm_active(const struct mptcp_sock *msk); void __mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, const struct mptcp_options_received *mp_opt); void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subflow, struct request_sock *req); static inline bool mptcp_pm_should_add_signal(struct mptcp_sock *msk) { return READ_ONCE(msk->pm.addr_signal) & (BIT(MPTCP_ADD_ADDR_SIGNAL) | BIT(MPTCP_ADD_ADDR_ECHO)); } static inline bool mptcp_pm_should_add_signal_addr(struct mptcp_sock *msk) { return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_SIGNAL); } static inline bool mptcp_pm_should_add_signal_echo(struct mptcp_sock *msk) { return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_ECHO); } static inline bool mptcp_pm_should_rm_signal(struct mptcp_sock *msk) { return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_RM_ADDR_SIGNAL); } static inline bool mptcp_pm_is_userspace(const struct mptcp_sock *msk) { return READ_ONCE(msk->pm.pm_type) == MPTCP_PM_TYPE_USERSPACE; } static inline bool mptcp_pm_is_kernel(const struct mptcp_sock *msk) { return READ_ONCE(msk->pm.pm_type) == MPTCP_PM_TYPE_KERNEL; } static inline unsigned int mptcp_add_addr_len(int family, bool echo, bool port) { u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE; if (family == AF_INET6) len = TCPOLEN_MPTCP_ADD_ADDR6_BASE; if (!echo) len += MPTCPOPT_THMAC_LEN; /* account for 2 trailing 'nop' options */ if (port) len += TCPOLEN_MPTCP_PORT_LEN + TCPOLEN_MPTCP_PORT_ALIGN; return len; } static inline int mptcp_rm_addr_len(const struct mptcp_rm_list *rm_list) { if (rm_list->nr == 0 || rm_list->nr > MPTCP_RM_IDS_MAX) return -EINVAL; return TCPOLEN_MPTCP_RM_ADDR_BASE + roundup(rm_list->nr - 1, 4) + 1; } bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb, unsigned int opt_size, unsigned int remaining, struct mptcp_addr_info *addr, bool *echo, bool *drop_other_suboptions); bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, struct mptcp_rm_list *rm_list); int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc); int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc); void __init mptcp_pm_nl_init(void); void mptcp_pm_nl_work(struct mptcp_sock *msk); void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk); /* called under PM lock */ static inline void __mptcp_pm_close_subflow(struct mptcp_sock *msk) { if (--msk->pm.subflows < mptcp_pm_get_subflows_max(msk)) WRITE_ONCE(msk->pm.accept_subflow, true); } static inline void mptcp_pm_close_subflow(struct mptcp_sock *msk) { spin_lock_bh(&msk->pm.lock); __mptcp_pm_close_subflow(msk); spin_unlock_bh(&msk->pm.lock); } void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk); void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk); static inline struct mptcp_ext *mptcp_get_ext(const struct sk_buff *skb) { return (struct mptcp_ext *)skb_ext_find(skb, SKB_EXT_MPTCP); } void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops); static inline bool __mptcp_check_fallback(const struct mptcp_sock *msk) { return test_bit(MPTCP_FALLBACK_DONE, &msk->flags); } static inline bool mptcp_check_fallback(const struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); return __mptcp_check_fallback(msk); } static inline void __mptcp_do_fallback(struct mptcp_sock *msk) { if (__mptcp_check_fallback(msk)) { pr_debug("TCP fallback already done (msk=%p)", msk); return; } set_bit(MPTCP_FALLBACK_DONE, &msk->flags); } static inline bool __mptcp_has_initial_subflow(const struct mptcp_sock *msk) { struct sock *ssk = READ_ONCE(msk->first); return ssk && ((1 << inet_sk_state_load(ssk)) & (TCPF_ESTABLISHED | TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_LISTEN)); } static inline void mptcp_do_fallback(struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = subflow->conn; struct mptcp_sock *msk; msk = mptcp_sk(sk); __mptcp_do_fallback(msk); if (READ_ONCE(msk->snd_data_fin_enable) && !(ssk->sk_shutdown & SEND_SHUTDOWN)) { gfp_t saved_allocation = ssk->sk_allocation; /* we are in a atomic (BH) scope, override ssk default for data * fin allocation */ ssk->sk_allocation = GFP_ATOMIC; ssk->sk_shutdown |= SEND_SHUTDOWN; tcp_shutdown(ssk, SEND_SHUTDOWN); ssk->sk_allocation = saved_allocation; } } #define pr_fallback(a) pr_debug("%s:fallback to TCP (msk=%p)", __func__, a) static inline bool mptcp_check_infinite_map(struct sk_buff *skb) { struct mptcp_ext *mpext; mpext = skb ? mptcp_get_ext(skb) : NULL; if (mpext && mpext->infinite_map) return true; return false; } static inline bool is_active_ssk(struct mptcp_subflow_context *subflow) { return (subflow->request_mptcp || subflow->request_join); } static inline bool subflow_simultaneous_connect(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING) && is_active_ssk(subflow) && !subflow->conn_finished; } #ifdef CONFIG_SYN_COOKIES void subflow_init_req_cookie_join_save(const struct mptcp_subflow_request_sock *subflow_req, struct sk_buff *skb); bool mptcp_token_join_cookie_init_state(struct mptcp_subflow_request_sock *subflow_req, struct sk_buff *skb); void __init mptcp_join_cookie_init(void); #else static inline void subflow_init_req_cookie_join_save(const struct mptcp_subflow_request_sock *subflow_req, struct sk_buff *skb) {} static inline bool mptcp_token_join_cookie_init_state(struct mptcp_subflow_request_sock *subflow_req, struct sk_buff *skb) { return false; } static inline void mptcp_join_cookie_init(void) {} #endif #endif /* __MPTCP_PROTOCOL_H */
2 1 1 1864 368 1864 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 // SPDX-License-Identifier: GPL-2.0-only #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/workqueue.h> #include <linux/spinlock.h> #include <linux/netfilter/nf_conntrack_common.h> #include <linux/netfilter/nf_tables.h> #include <net/ip.h> /* for ipv4 options. */ #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_flow_table.h> struct nft_flow_offload { struct nft_flowtable *flowtable; }; static enum flow_offload_xmit_type nft_xmit_type(struct dst_entry *dst) { if (dst_xfrm(dst)) return FLOW_OFFLOAD_XMIT_XFRM; return FLOW_OFFLOAD_XMIT_NEIGH; } static void nft_default_forward_path(struct nf_flow_route *route, struct dst_entry *dst_cache, enum ip_conntrack_dir dir) { route->tuple[!dir].in.ifindex = dst_cache->dev->ifindex; route->tuple[dir].dst = dst_cache; route->tuple[dir].xmit_type = nft_xmit_type(dst_cache); } static bool nft_is_valid_ether_device(const struct net_device *dev) { if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr)) return false; return true; } static int nft_dev_fill_forward_path(const struct nf_flow_route *route, const struct dst_entry *dst_cache, const struct nf_conn *ct, enum ip_conntrack_dir dir, u8 *ha, struct net_device_path_stack *stack) { const void *daddr = &ct->tuplehash[!dir].tuple.src.u3; struct net_device *dev = dst_cache->dev; struct neighbour *n; u8 nud_state; if (!nft_is_valid_ether_device(dev)) goto out; n = dst_neigh_lookup(dst_cache, daddr); if (!n) return -1; read_lock_bh(&n->lock); nud_state = n->nud_state; ether_addr_copy(ha, n->ha); read_unlock_bh(&n->lock); neigh_release(n); if (!(nud_state & NUD_VALID)) return -1; out: return dev_fill_forward_path(dev, ha, stack); } struct nft_forward_info { const struct net_device *indev; const struct net_device *outdev; const struct net_device *hw_outdev; struct id { __u16 id; __be16 proto; } encap[NF_FLOW_TABLE_ENCAP_MAX]; u8 num_encaps; u8 ingress_vlans; u8 h_source[ETH_ALEN]; u8 h_dest[ETH_ALEN]; enum flow_offload_xmit_type xmit_type; }; static void nft_dev_path_info(const struct net_device_path_stack *stack, struct nft_forward_info *info, unsigned char *ha, struct nf_flowtable *flowtable) { const struct net_device_path *path; int i; memcpy(info->h_dest, ha, ETH_ALEN); for (i = 0; i < stack->num_paths; i++) { path = &stack->path[i]; switch (path->type) { case DEV_PATH_ETHERNET: case DEV_PATH_DSA: case DEV_PATH_VLAN: case DEV_PATH_PPPOE: info->indev = path->dev; if (is_zero_ether_addr(info->h_source)) memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); if (path->type == DEV_PATH_ETHERNET) break; if (path->type == DEV_PATH_DSA) { i = stack->num_paths; break; } /* DEV_PATH_VLAN and DEV_PATH_PPPOE */ if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) { info->indev = NULL; break; } if (!info->outdev) info->outdev = path->dev; info->encap[info->num_encaps].id = path->encap.id; info->encap[info->num_encaps].proto = path->encap.proto; info->num_encaps++; if (path->type == DEV_PATH_PPPOE) memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN); break; case DEV_PATH_BRIDGE: if (is_zero_ether_addr(info->h_source)) memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); switch (path->bridge.vlan_mode) { case DEV_PATH_BR_VLAN_UNTAG_HW: info->ingress_vlans |= BIT(info->num_encaps - 1); break; case DEV_PATH_BR_VLAN_TAG: info->encap[info->num_encaps].id = path->bridge.vlan_id; info->encap[info->num_encaps].proto = path->bridge.vlan_proto; info->num_encaps++; break; case DEV_PATH_BR_VLAN_UNTAG: info->num_encaps--; break; case DEV_PATH_BR_VLAN_KEEP: break; } info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; break; default: info->indev = NULL; break; } } if (!info->outdev) info->outdev = info->indev; info->hw_outdev = info->indev; if (nf_flowtable_hw_offload(flowtable) && nft_is_valid_ether_device(info->indev)) info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; } static bool nft_flowtable_find_dev(const struct net_device *dev, struct nft_flowtable *ft) { struct nft_hook *hook; bool found = false; list_for_each_entry_rcu(hook, &ft->hook_list, list) { if (hook->ops.dev != dev) continue; found = true; break; } return found; } static void nft_dev_forward_path(struct nf_flow_route *route, const struct nf_conn *ct, enum ip_conntrack_dir dir, struct nft_flowtable *ft) { const struct dst_entry *dst = route->tuple[dir].dst; struct net_device_path_stack stack; struct nft_forward_info info = {}; unsigned char ha[ETH_ALEN]; int i; if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0) nft_dev_path_info(&stack, &info, ha, &ft->data); if (!info.indev || !nft_flowtable_find_dev(info.indev, ft)) return; route->tuple[!dir].in.ifindex = info.indev->ifindex; for (i = 0; i < info.num_encaps; i++) { route->tuple[!dir].in.encap[i].id = info.encap[i].id; route->tuple[!dir].in.encap[i].proto = info.encap[i].proto; } route->tuple[!dir].in.num_encaps = info.num_encaps; route->tuple[!dir].in.ingress_vlans = info.ingress_vlans; if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) { memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN); memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN); route->tuple[dir].out.ifindex = info.outdev->ifindex; route->tuple[dir].out.hw_ifindex = info.hw_outdev->ifindex; route->tuple[dir].xmit_type = info.xmit_type; } } static int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct, struct nf_flow_route *route, enum ip_conntrack_dir dir, struct nft_flowtable *ft) { struct dst_entry *this_dst = skb_dst(pkt->skb); struct dst_entry *other_dst = NULL; struct flowi fl; memset(&fl, 0, sizeof(fl)); switch (nft_pf(pkt)) { case NFPROTO_IPV4: fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip; fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip; fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex; fl.u.ip4.flowi4_iif = this_dst->dev->ifindex; fl.u.ip4.flowi4_tos = RT_TOS(ip_hdr(pkt->skb)->tos); fl.u.ip4.flowi4_mark = pkt->skb->mark; fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC; break; case NFPROTO_IPV6: fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6; fl.u.ip6.saddr = ct->tuplehash[!dir].tuple.src.u3.in6; fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex; fl.u.ip6.flowi6_iif = this_dst->dev->ifindex; fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb)); fl.u.ip6.flowi6_mark = pkt->skb->mark; fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC; break; } if (!dst_hold_safe(this_dst)) return -ENOENT; nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt)); if (!other_dst) { dst_release(this_dst); return -ENOENT; } nft_default_forward_path(route, this_dst, dir); nft_default_forward_path(route, other_dst, !dir); if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH && route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) { nft_dev_forward_path(route, ct, dir, ft); nft_dev_forward_path(route, ct, !dir, ft); } return 0; } static bool nft_flow_offload_skip(struct sk_buff *skb, int family) { if (skb_sec_path(skb)) return true; if (family == NFPROTO_IPV4) { const struct ip_options *opt; opt = &(IPCB(skb)->opt); if (unlikely(opt->optlen)) return true; } return false; } static void nft_flow_offload_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_flow_offload *priv = nft_expr_priv(expr); struct nf_flowtable *flowtable = &priv->flowtable->data; struct tcphdr _tcph, *tcph = NULL; struct nf_flow_route route = {}; enum ip_conntrack_info ctinfo; struct flow_offload *flow; enum ip_conntrack_dir dir; struct nf_conn *ct; int ret; if (nft_flow_offload_skip(pkt->skb, nft_pf(pkt))) goto out; ct = nf_ct_get(pkt->skb, &ctinfo); if (!ct) goto out; switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) { case IPPROTO_TCP: tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt), sizeof(_tcph), &_tcph); if (unlikely(!tcph || tcph->fin || tcph->rst || !nf_conntrack_tcp_established(ct))) goto out; break; case IPPROTO_UDP: break; #ifdef CONFIG_NF_CT_PROTO_GRE case IPPROTO_GRE: { struct nf_conntrack_tuple *tuple; if (ct->status & IPS_NAT_MASK) goto out; tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; /* No support for GRE v1 */ if (tuple->src.u.gre.key || tuple->dst.u.gre.key) goto out; break; } #endif default: goto out; } if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) || ct->status & (IPS_SEQ_ADJUST | IPS_NAT_CLASH)) goto out; if (!nf_ct_is_confirmed(ct)) goto out; if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status)) goto out; dir = CTINFO2DIR(ctinfo); if (nft_flow_route(pkt, ct, &route, dir, priv->flowtable) < 0) goto err_flow_route; flow = flow_offload_alloc(ct); if (!flow) goto err_flow_alloc; flow_offload_route_init(flow, &route); if (tcph) { ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; } ret = flow_offload_add(flowtable, flow); if (ret < 0) goto err_flow_add; return; err_flow_add: flow_offload_free(flow); err_flow_alloc: dst_release(route.tuple[dir].dst); dst_release(route.tuple[!dir].dst); err_flow_route: clear_bit(IPS_OFFLOAD_BIT, &ct->status); out: regs->verdict.code = NFT_BREAK; } static int nft_flow_offload_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data) { unsigned int hook_mask = (1 << NF_INET_FORWARD); if (ctx->family != NFPROTO_IPV4 && ctx->family != NFPROTO_IPV6 && ctx->family != NFPROTO_INET) return -EOPNOTSUPP; return nft_chain_validate_hooks(ctx->chain, hook_mask); } static const struct nla_policy nft_flow_offload_policy[NFTA_FLOW_MAX + 1] = { [NFTA_FLOW_TABLE_NAME] = { .type = NLA_STRING, .len = NFT_NAME_MAXLEN - 1 }, }; static int nft_flow_offload_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_flow_offload *priv = nft_expr_priv(expr); u8 genmask = nft_genmask_next(ctx->net); struct nft_flowtable *flowtable; if (!tb[NFTA_FLOW_TABLE_NAME]) return -EINVAL; flowtable = nft_flowtable_lookup(ctx->table, tb[NFTA_FLOW_TABLE_NAME], genmask); if (IS_ERR(flowtable)) return PTR_ERR(flowtable); if (!nft_use_inc(&flowtable->use)) return -EMFILE; priv->flowtable = flowtable; return nf_ct_netns_get(ctx->net, ctx->family); } static void nft_flow_offload_deactivate(const struct nft_ctx *ctx, const struct nft_expr *expr, enum nft_trans_phase phase) { struct nft_flow_offload *priv = nft_expr_priv(expr); nf_tables_deactivate_flowtable(ctx, priv->flowtable, phase); } static void nft_flow_offload_activate(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_flow_offload *priv = nft_expr_priv(expr); nft_use_inc_restore(&priv->flowtable->use); } static void nft_flow_offload_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { nf_ct_netns_put(ctx->net, ctx->family); } static int nft_flow_offload_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { struct nft_flow_offload *priv = nft_expr_priv(expr); if (nla_put_string(skb, NFTA_FLOW_TABLE_NAME, priv->flowtable->name)) goto nla_put_failure; return 0; nla_put_failure: return -1; } static struct nft_expr_type nft_flow_offload_type; static const struct nft_expr_ops nft_flow_offload_ops = { .type = &nft_flow_offload_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_flow_offload)), .eval = nft_flow_offload_eval, .init = nft_flow_offload_init, .activate = nft_flow_offload_activate, .deactivate = nft_flow_offload_deactivate, .destroy = nft_flow_offload_destroy, .validate = nft_flow_offload_validate, .dump = nft_flow_offload_dump, .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_flow_offload_type __read_mostly = { .name = "flow_offload", .ops = &nft_flow_offload_ops, .policy = nft_flow_offload_policy, .maxattr = NFTA_FLOW_MAX, .owner = THIS_MODULE, }; static int flow_offload_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (event != NETDEV_DOWN) return NOTIFY_DONE; nf_flow_table_cleanup(dev); return NOTIFY_DONE; } static struct notifier_block flow_offload_netdev_notifier = { .notifier_call = flow_offload_netdev_event, }; static int __init nft_flow_offload_module_init(void) { int err; err = register_netdevice_notifier(&flow_offload_netdev_notifier); if (err) goto err; err = nft_register_expr(&nft_flow_offload_type); if (err < 0) goto register_expr; return 0; register_expr: unregister_netdevice_notifier(&flow_offload_netdev_notifier); err: return err; } static void __exit nft_flow_offload_module_exit(void) { nft_unregister_expr(&nft_flow_offload_type); unregister_netdevice_notifier(&flow_offload_netdev_notifier); } module_init(nft_flow_offload_module_init); module_exit(nft_flow_offload_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_ALIAS_NFT_EXPR("flow_offload"); MODULE_DESCRIPTION("nftables hardware flow offload module");
8 686 688 70 686 682 681 759 733 5 1 1 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_DST_METADATA_H #define __NET_DST_METADATA_H 1 #include <linux/skbuff.h> #include <net/ip_tunnels.h> #include <net/macsec.h> #include <net/dst.h> enum metadata_type { METADATA_IP_TUNNEL, METADATA_HW_PORT_MUX, METADATA_MACSEC, METADATA_XFRM, }; struct hw_port_info { struct net_device *lower_dev; u32 port_id; }; struct macsec_info { sci_t sci; }; struct xfrm_md_info { u32 if_id; int link; struct dst_entry *dst_orig; }; struct metadata_dst { struct dst_entry dst; enum metadata_type type; union { struct ip_tunnel_info tun_info; struct hw_port_info port_info; struct macsec_info macsec_info; struct xfrm_md_info xfrm_info; } u; }; static inline struct metadata_dst *skb_metadata_dst(const struct sk_buff *skb) { struct metadata_dst *md_dst = (struct metadata_dst *) skb_dst(skb); if (md_dst && md_dst->dst.flags & DST_METADATA) return md_dst; return NULL; } static inline struct ip_tunnel_info * skb_tunnel_info(const struct sk_buff *skb) { struct metadata_dst *md_dst = skb_metadata_dst(skb); struct dst_entry *dst; if (md_dst && md_dst->type == METADATA_IP_TUNNEL) return &md_dst->u.tun_info; dst = skb_dst(skb); if (dst && dst->lwtstate && (dst->lwtstate->type == LWTUNNEL_ENCAP_IP || dst->lwtstate->type == LWTUNNEL_ENCAP_IP6)) return lwt_tun_info(dst->lwtstate); return NULL; } static inline struct xfrm_md_info *lwt_xfrm_info(struct lwtunnel_state *lwt) { return (struct xfrm_md_info *)lwt->data; } static inline struct xfrm_md_info *skb_xfrm_md_info(const struct sk_buff *skb) { struct metadata_dst *md_dst = skb_metadata_dst(skb); struct dst_entry *dst; if (md_dst && md_dst->type == METADATA_XFRM) return &md_dst->u.xfrm_info; dst = skb_dst(skb); if (dst && dst->lwtstate && dst->lwtstate->type == LWTUNNEL_ENCAP_XFRM) return lwt_xfrm_info(dst->lwtstate); return NULL; } static inline bool skb_valid_dst(const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); return dst && !(dst->flags & DST_METADATA); } static inline int skb_metadata_dst_cmp(const struct sk_buff *skb_a, const struct sk_buff *skb_b) { const struct metadata_dst *a, *b; if (!(skb_a->_skb_refdst | skb_b->_skb_refdst)) return 0; a = (const struct metadata_dst *) skb_dst(skb_a); b = (const struct metadata_dst *) skb_dst(skb_b); if (!a != !b || a->type != b->type) return 1; switch (a->type) { case METADATA_HW_PORT_MUX: return memcmp(&a->u.port_info, &b->u.port_info, sizeof(a->u.port_info)); case METADATA_IP_TUNNEL: return memcmp(&a->u.tun_info, &b->u.tun_info, sizeof(a->u.tun_info) + a->u.tun_info.options_len); case METADATA_MACSEC: return memcmp(&a->u.macsec_info, &b->u.macsec_info, sizeof(a->u.macsec_info)); case METADATA_XFRM: return memcmp(&a->u.xfrm_info, &b->u.xfrm_info, sizeof(a->u.xfrm_info)); default: return 1; } } void metadata_dst_free(struct metadata_dst *); struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type, gfp_t flags); void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst); struct metadata_dst __percpu * metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags); static inline struct metadata_dst *tun_rx_dst(int md_size) { struct metadata_dst *tun_dst; tun_dst = metadata_dst_alloc(md_size, METADATA_IP_TUNNEL, GFP_ATOMIC); if (!tun_dst) return NULL; tun_dst->u.tun_info.options_len = 0; tun_dst->u.tun_info.mode = 0; return tun_dst; } static inline struct metadata_dst *tun_dst_unclone(struct sk_buff *skb) { struct metadata_dst *md_dst = skb_metadata_dst(skb); int md_size; struct metadata_dst *new_md; if (!md_dst || md_dst->type != METADATA_IP_TUNNEL) return ERR_PTR(-EINVAL); md_size = md_dst->u.tun_info.options_len; new_md = metadata_dst_alloc(md_size, METADATA_IP_TUNNEL, GFP_ATOMIC); if (!new_md) return ERR_PTR(-ENOMEM); memcpy(&new_md->u.tun_info, &md_dst->u.tun_info, sizeof(struct ip_tunnel_info) + md_size); #ifdef CONFIG_DST_CACHE /* Unclone the dst cache if there is one */ if (new_md->u.tun_info.dst_cache.cache) { int ret; ret = dst_cache_init(&new_md->u.tun_info.dst_cache, GFP_ATOMIC); if (ret) { metadata_dst_free(new_md); return ERR_PTR(ret); } } #endif skb_dst_drop(skb); skb_dst_set(skb, &new_md->dst); return new_md; } static inline struct ip_tunnel_info *skb_tunnel_info_unclone(struct sk_buff *skb) { struct metadata_dst *dst; dst = tun_dst_unclone(skb); if (IS_ERR(dst)) return NULL; return &dst->u.tun_info; } static inline struct metadata_dst *__ip_tun_set_dst(__be32 saddr, __be32 daddr, __u8 tos, __u8 ttl, __be16 tp_dst, __be16 flags, __be64 tunnel_id, int md_size) { struct metadata_dst *tun_dst; tun_dst = tun_rx_dst(md_size); if (!tun_dst) return NULL; ip_tunnel_key_init(&tun_dst->u.tun_info.key, saddr, daddr, tos, ttl, 0, 0, tp_dst, tunnel_id, flags); return tun_dst; } static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb, __be16 flags, __be64 tunnel_id, int md_size) { const struct iphdr *iph = ip_hdr(skb); return __ip_tun_set_dst(iph->saddr, iph->daddr, iph->tos, iph->ttl, 0, flags, tunnel_id, md_size); } static inline struct metadata_dst *__ipv6_tun_set_dst(const struct in6_addr *saddr, const struct in6_addr *daddr, __u8 tos, __u8 ttl, __be16 tp_dst, __be32 label, __be16 flags, __be64 tunnel_id, int md_size) { struct metadata_dst *tun_dst; struct ip_tunnel_info *info; tun_dst = tun_rx_dst(md_size); if (!tun_dst) return NULL; info = &tun_dst->u.tun_info; info->mode = IP_TUNNEL_INFO_IPV6; info->key.tun_flags = flags; info->key.tun_id = tunnel_id; info->key.tp_src = 0; info->key.tp_dst = tp_dst; info->key.u.ipv6.src = *saddr; info->key.u.ipv6.dst = *daddr; info->key.tos = tos; info->key.ttl = ttl; info->key.label = label; return tun_dst; } static inline struct metadata_dst *ipv6_tun_rx_dst(struct sk_buff *skb, __be16 flags, __be64 tunnel_id, int md_size) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); return __ipv6_tun_set_dst(&ip6h->saddr, &ip6h->daddr, ipv6_get_dsfield(ip6h), ip6h->hop_limit, 0, ip6_flowlabel(ip6h), flags, tunnel_id, md_size); } #endif /* __NET_DST_METADATA_H */
4 20 20 20 6 10 16 16 16 13 13 3 3 3 3 3 3 95 96 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 // SPDX-License-Identifier: GPL-2.0-only /* Helper handling for netfilter. */ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> * (C) 2006-2012 Patrick McHardy <kaber@trash.net> */ #include <linux/types.h> #include <linux/netfilter.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> #include <linux/stddef.h> #include <linux/random.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/rculist.h> #include <linux/rtnetlink.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_log.h> #include <net/ip.h> static DEFINE_MUTEX(nf_ct_helper_mutex); struct hlist_head *nf_ct_helper_hash __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_helper_hash); unsigned int nf_ct_helper_hsize __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_helper_hsize); static unsigned int nf_ct_helper_count __read_mostly; static DEFINE_MUTEX(nf_ct_nat_helpers_mutex); static struct list_head nf_ct_nat_helpers __read_mostly; /* Stupid hash, but collision free for the default registrations of the * helpers currently in the kernel. */ static unsigned int helper_hash(const struct nf_conntrack_tuple *tuple) { return (((tuple->src.l3num << 8) | tuple->dst.protonum) ^ (__force __u16)tuple->src.u.all) % nf_ct_helper_hsize; } struct nf_conntrack_helper * __nf_conntrack_helper_find(const char *name, u16 l3num, u8 protonum) { struct nf_conntrack_helper *h; unsigned int i; for (i = 0; i < nf_ct_helper_hsize; i++) { hlist_for_each_entry_rcu(h, &nf_ct_helper_hash[i], hnode) { if (strcmp(h->name, name)) continue; if (h->tuple.src.l3num != NFPROTO_UNSPEC && h->tuple.src.l3num != l3num) continue; if (h->tuple.dst.protonum == protonum) return h; } } return NULL; } EXPORT_SYMBOL_GPL(__nf_conntrack_helper_find); struct nf_conntrack_helper * nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum) { struct nf_conntrack_helper *h; rcu_read_lock(); h = __nf_conntrack_helper_find(name, l3num, protonum); #ifdef CONFIG_MODULES if (h == NULL) { rcu_read_unlock(); if (request_module("nfct-helper-%s", name) == 0) { rcu_read_lock(); h = __nf_conntrack_helper_find(name, l3num, protonum); } else { return h; } } #endif if (h != NULL && !try_module_get(h->me)) h = NULL; if (h != NULL && !refcount_inc_not_zero(&h->refcnt)) { module_put(h->me); h = NULL; } rcu_read_unlock(); return h; } EXPORT_SYMBOL_GPL(nf_conntrack_helper_try_module_get); void nf_conntrack_helper_put(struct nf_conntrack_helper *helper) { refcount_dec(&helper->refcnt); module_put(helper->me); } EXPORT_SYMBOL_GPL(nf_conntrack_helper_put); static struct nf_conntrack_nat_helper * nf_conntrack_nat_helper_find(const char *mod_name) { struct nf_conntrack_nat_helper *cur; bool found = false; list_for_each_entry_rcu(cur, &nf_ct_nat_helpers, list) { if (!strcmp(cur->mod_name, mod_name)) { found = true; break; } } return found ? cur : NULL; } int nf_nat_helper_try_module_get(const char *name, u16 l3num, u8 protonum) { struct nf_conntrack_helper *h; struct nf_conntrack_nat_helper *nat; char mod_name[NF_CT_HELPER_NAME_LEN]; int ret = 0; rcu_read_lock(); h = __nf_conntrack_helper_find(name, l3num, protonum); if (!h) { rcu_read_unlock(); return -ENOENT; } nat = nf_conntrack_nat_helper_find(h->nat_mod_name); if (!nat) { snprintf(mod_name, sizeof(mod_name), "%s", h->nat_mod_name); rcu_read_unlock(); request_module("%s", mod_name); rcu_read_lock(); nat = nf_conntrack_nat_helper_find(mod_name); if (!nat) { rcu_read_unlock(); return -ENOENT; } } if (!try_module_get(nat->module)) ret = -ENOENT; rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(nf_nat_helper_try_module_get); void nf_nat_helper_put(struct nf_conntrack_helper *helper) { struct nf_conntrack_nat_helper *nat; nat = nf_conntrack_nat_helper_find(helper->nat_mod_name); if (WARN_ON_ONCE(!nat)) return; module_put(nat->module); } EXPORT_SYMBOL_GPL(nf_nat_helper_put); struct nf_conn_help * nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp) { struct nf_conn_help *help; help = nf_ct_ext_add(ct, NF_CT_EXT_HELPER, gfp); if (help) INIT_HLIST_HEAD(&help->expectations); else pr_debug("failed to add helper extension area"); return help; } EXPORT_SYMBOL_GPL(nf_ct_helper_ext_add); int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, gfp_t flags) { struct nf_conntrack_helper *helper = NULL; struct nf_conn_help *help; /* We already got a helper explicitly attached (e.g. nft_ct) */ if (test_bit(IPS_HELPER_BIT, &ct->status)) return 0; if (WARN_ON_ONCE(!tmpl)) return 0; help = nfct_help(tmpl); if (help != NULL) { helper = rcu_dereference(help->helper); set_bit(IPS_HELPER_BIT, &ct->status); } help = nfct_help(ct); if (helper == NULL) { if (help) RCU_INIT_POINTER(help->helper, NULL); return 0; } if (help == NULL) { help = nf_ct_helper_ext_add(ct, flags); if (help == NULL) return -ENOMEM; } else { /* We only allow helper re-assignment of the same sort since * we cannot reallocate the helper extension area. */ struct nf_conntrack_helper *tmp = rcu_dereference(help->helper); if (tmp && tmp->help != helper->help) { RCU_INIT_POINTER(help->helper, NULL); return 0; } } rcu_assign_pointer(help->helper, helper); return 0; } EXPORT_SYMBOL_GPL(__nf_ct_try_assign_helper); /* appropriate ct lock protecting must be taken by caller */ static int unhelp(struct nf_conn *ct, void *me) { struct nf_conn_help *help = nfct_help(ct); if (help && rcu_dereference_raw(help->helper) == me) { nf_conntrack_event(IPCT_HELPER, ct); RCU_INIT_POINTER(help->helper, NULL); } /* We are not intended to delete this conntrack. */ return 0; } void nf_ct_helper_destroy(struct nf_conn *ct) { struct nf_conn_help *help = nfct_help(ct); struct nf_conntrack_helper *helper; if (help) { rcu_read_lock(); helper = rcu_dereference(help->helper); if (helper && helper->destroy) helper->destroy(ct); rcu_read_unlock(); } } static LIST_HEAD(nf_ct_helper_expectfn_list); void nf_ct_helper_expectfn_register(struct nf_ct_helper_expectfn *n) { spin_lock_bh(&nf_conntrack_expect_lock); list_add_rcu(&n->head, &nf_ct_helper_expectfn_list); spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_register); void nf_ct_helper_expectfn_unregister(struct nf_ct_helper_expectfn *n) { spin_lock_bh(&nf_conntrack_expect_lock); list_del_rcu(&n->head); spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_unregister); /* Caller should hold the rcu lock */ struct nf_ct_helper_expectfn * nf_ct_helper_expectfn_find_by_name(const char *name) { struct nf_ct_helper_expectfn *cur; bool found = false; list_for_each_entry_rcu(cur, &nf_ct_helper_expectfn_list, head) { if (!strcmp(cur->name, name)) { found = true; break; } } return found ? cur : NULL; } EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_find_by_name); /* Caller should hold the rcu lock */ struct nf_ct_helper_expectfn * nf_ct_helper_expectfn_find_by_symbol(const void *symbol) { struct nf_ct_helper_expectfn *cur; bool found = false; list_for_each_entry_rcu(cur, &nf_ct_helper_expectfn_list, head) { if (cur->expectfn == symbol) { found = true; break; } } return found ? cur : NULL; } EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_find_by_symbol); __printf(3, 4) void nf_ct_helper_log(struct sk_buff *skb, const struct nf_conn *ct, const char *fmt, ...) { const struct nf_conn_help *help; const struct nf_conntrack_helper *helper; struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; /* Called from the helper function, this call never fails */ help = nfct_help(ct); /* rcu_read_lock()ed by nf_hook_thresh */ helper = rcu_dereference(help->helper); nf_log_packet(nf_ct_net(ct), nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL, "nf_ct_%s: dropping packet: %pV ", helper->name, &vaf); va_end(args); } EXPORT_SYMBOL_GPL(nf_ct_helper_log); int nf_conntrack_helper_register(struct nf_conntrack_helper *me) { struct nf_conntrack_tuple_mask mask = { .src.u.all = htons(0xFFFF) }; unsigned int h = helper_hash(&me->tuple); struct nf_conntrack_helper *cur; int ret = 0, i; BUG_ON(me->expect_policy == NULL); BUG_ON(me->expect_class_max >= NF_CT_MAX_EXPECT_CLASSES); BUG_ON(strlen(me->name) > NF_CT_HELPER_NAME_LEN - 1); if (!nf_ct_helper_hash) return -ENOENT; if (me->expect_policy->max_expected > NF_CT_EXPECT_MAX_CNT) return -EINVAL; mutex_lock(&nf_ct_helper_mutex); for (i = 0; i < nf_ct_helper_hsize; i++) { hlist_for_each_entry(cur, &nf_ct_helper_hash[i], hnode) { if (!strcmp(cur->name, me->name) && (cur->tuple.src.l3num == NFPROTO_UNSPEC || cur->tuple.src.l3num == me->tuple.src.l3num) && cur->tuple.dst.protonum == me->tuple.dst.protonum) { ret = -EEXIST; goto out; } } } /* avoid unpredictable behaviour for auto_assign_helper */ if (!(me->flags & NF_CT_HELPER_F_USERSPACE)) { hlist_for_each_entry(cur, &nf_ct_helper_hash[h], hnode) { if (nf_ct_tuple_src_mask_cmp(&cur->tuple, &me->tuple, &mask)) { ret = -EEXIST; goto out; } } } refcount_set(&me->refcnt, 1); hlist_add_head_rcu(&me->hnode, &nf_ct_helper_hash[h]); nf_ct_helper_count++; out: mutex_unlock(&nf_ct_helper_mutex); return ret; } EXPORT_SYMBOL_GPL(nf_conntrack_helper_register); static bool expect_iter_me(struct nf_conntrack_expect *exp, void *data) { struct nf_conn_help *help = nfct_help(exp->master); const struct nf_conntrack_helper *me = data; const struct nf_conntrack_helper *this; if (exp->helper == me) return true; this = rcu_dereference_protected(help->helper, lockdep_is_held(&nf_conntrack_expect_lock)); return this == me; } void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) { mutex_lock(&nf_ct_helper_mutex); hlist_del_rcu(&me->hnode); nf_ct_helper_count--; mutex_unlock(&nf_ct_helper_mutex); /* Make sure every nothing is still using the helper unless its a * connection in the hash. */ synchronize_rcu(); nf_ct_expect_iterate_destroy(expect_iter_me, NULL); nf_ct_iterate_destroy(unhelp, me); } EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister); void nf_ct_helper_init(struct nf_conntrack_helper *helper, u16 l3num, u16 protonum, const char *name, u16 default_port, u16 spec_port, u32 id, const struct nf_conntrack_expect_policy *exp_pol, u32 expect_class_max, int (*help)(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo), int (*from_nlattr)(struct nlattr *attr, struct nf_conn *ct), struct module *module) { helper->tuple.src.l3num = l3num; helper->tuple.dst.protonum = protonum; helper->tuple.src.u.all = htons(spec_port); helper->expect_policy = exp_pol; helper->expect_class_max = expect_class_max; helper->help = help; helper->from_nlattr = from_nlattr; helper->me = module; snprintf(helper->nat_mod_name, sizeof(helper->nat_mod_name), NF_NAT_HELPER_PREFIX "%s", name); if (spec_port == default_port) snprintf(helper->name, sizeof(helper->name), "%s", name); else snprintf(helper->name, sizeof(helper->name), "%s-%u", name, id); } EXPORT_SYMBOL_GPL(nf_ct_helper_init); int nf_conntrack_helpers_register(struct nf_conntrack_helper *helper, unsigned int n) { unsigned int i; int err = 0; for (i = 0; i < n; i++) { err = nf_conntrack_helper_register(&helper[i]); if (err < 0) goto err; } return err; err: if (i > 0) nf_conntrack_helpers_unregister(helper, i); return err; } EXPORT_SYMBOL_GPL(nf_conntrack_helpers_register); void nf_conntrack_helpers_unregister(struct nf_conntrack_helper *helper, unsigned int n) { while (n-- > 0) nf_conntrack_helper_unregister(&helper[n]); } EXPORT_SYMBOL_GPL(nf_conntrack_helpers_unregister); void nf_nat_helper_register(struct nf_conntrack_nat_helper *nat) { mutex_lock(&nf_ct_nat_helpers_mutex); list_add_rcu(&nat->list, &nf_ct_nat_helpers); mutex_unlock(&nf_ct_nat_helpers_mutex); } EXPORT_SYMBOL_GPL(nf_nat_helper_register); void nf_nat_helper_unregister(struct nf_conntrack_nat_helper *nat) { mutex_lock(&nf_ct_nat_helpers_mutex); list_del_rcu(&nat->list); mutex_unlock(&nf_ct_nat_helpers_mutex); } EXPORT_SYMBOL_GPL(nf_nat_helper_unregister); int nf_conntrack_helper_init(void) { nf_ct_helper_hsize = 1; /* gets rounded up to use one page */ nf_ct_helper_hash = nf_ct_alloc_hashtable(&nf_ct_helper_hsize, 0); if (!nf_ct_helper_hash) return -ENOMEM; INIT_LIST_HEAD(&nf_ct_nat_helpers); return 0; } void nf_conntrack_helper_fini(void) { kvfree(nf_ct_helper_hash); nf_ct_helper_hash = NULL; }
2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 // SPDX-License-Identifier: GPL-2.0 /* * super.c * * Copyright (c) 1999 Al Smith * * Portions derived from work (c) 1995,1996 Christian Vogelgsang. */ #include <linux/init.h> #include <linux/module.h> #include <linux/exportfs.h> #include <linux/slab.h> #include <linux/buffer_head.h> #include <linux/vfs.h> #include <linux/blkdev.h> #include "efs.h" #include <linux/efs_vh.h> #include <linux/efs_fs_sb.h> static int efs_statfs(struct dentry *dentry, struct kstatfs *buf); static int efs_fill_super(struct super_block *s, void *d, int silent); static struct dentry *efs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { return mount_bdev(fs_type, flags, dev_name, data, efs_fill_super); } static void efs_kill_sb(struct super_block *s) { struct efs_sb_info *sbi = SUPER_INFO(s); kill_block_super(s); kfree(sbi); } static struct file_system_type efs_fs_type = { .owner = THIS_MODULE, .name = "efs", .mount = efs_mount, .kill_sb = efs_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("efs"); static struct pt_types sgi_pt_types[] = { {0x00, "SGI vh"}, {0x01, "SGI trkrepl"}, {0x02, "SGI secrepl"}, {0x03, "SGI raw"}, {0x04, "SGI bsd"}, {SGI_SYSV, "SGI sysv"}, {0x06, "SGI vol"}, {SGI_EFS, "SGI efs"}, {0x08, "SGI lv"}, {0x09, "SGI rlv"}, {0x0A, "SGI xfs"}, {0x0B, "SGI xfslog"}, {0x0C, "SGI xlv"}, {0x82, "Linux swap"}, {0x83, "Linux native"}, {0, NULL} }; static struct kmem_cache * efs_inode_cachep; static struct inode *efs_alloc_inode(struct super_block *sb) { struct efs_inode_info *ei; ei = alloc_inode_sb(sb, efs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; } static void efs_free_inode(struct inode *inode) { kmem_cache_free(efs_inode_cachep, INODE_INFO(inode)); } static void init_once(void *foo) { struct efs_inode_info *ei = (struct efs_inode_info *) foo; inode_init_once(&ei->vfs_inode); } static int __init init_inodecache(void) { efs_inode_cachep = kmem_cache_create("efs_inode_cache", sizeof(struct efs_inode_info), 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| SLAB_ACCOUNT, init_once); if (efs_inode_cachep == NULL) return -ENOMEM; return 0; } static void destroy_inodecache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(efs_inode_cachep); } static int efs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); *flags |= SB_RDONLY; return 0; } static const struct super_operations efs_superblock_operations = { .alloc_inode = efs_alloc_inode, .free_inode = efs_free_inode, .statfs = efs_statfs, .remount_fs = efs_remount, }; static const struct export_operations efs_export_ops = { .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = efs_fh_to_dentry, .fh_to_parent = efs_fh_to_parent, .get_parent = efs_get_parent, }; static int __init init_efs_fs(void) { int err; pr_info(EFS_VERSION" - http://aeschi.ch.eu.org/efs/\n"); err = init_inodecache(); if (err) goto out1; err = register_filesystem(&efs_fs_type); if (err) goto out; return 0; out: destroy_inodecache(); out1: return err; } static void __exit exit_efs_fs(void) { unregister_filesystem(&efs_fs_type); destroy_inodecache(); } module_init(init_efs_fs) module_exit(exit_efs_fs) static efs_block_t efs_validate_vh(struct volume_header *vh) { int i; __be32 cs, *ui; int csum; efs_block_t sblock = 0; /* shuts up gcc */ struct pt_types *pt_entry; int pt_type, slice = -1; if (be32_to_cpu(vh->vh_magic) != VHMAGIC) { /* * assume that we're dealing with a partition and allow * read_super() to try and detect a valid superblock * on the next block. */ return 0; } ui = ((__be32 *) (vh + 1)) - 1; for(csum = 0; ui >= ((__be32 *) vh);) { cs = *ui--; csum += be32_to_cpu(cs); } if (csum) { pr_warn("SGI disklabel: checksum bad, label corrupted\n"); return 0; } #ifdef DEBUG pr_debug("bf: \"%16s\"\n", vh->vh_bootfile); for(i = 0; i < NVDIR; i++) { int j; char name[VDNAMESIZE+1]; for(j = 0; j < VDNAMESIZE; j++) { name[j] = vh->vh_vd[i].vd_name[j]; } name[j] = (char) 0; if (name[0]) { pr_debug("vh: %8s block: 0x%08x size: 0x%08x\n", name, (int) be32_to_cpu(vh->vh_vd[i].vd_lbn), (int) be32_to_cpu(vh->vh_vd[i].vd_nbytes)); } } #endif for(i = 0; i < NPARTAB; i++) { pt_type = (int) be32_to_cpu(vh->vh_pt[i].pt_type); for(pt_entry = sgi_pt_types; pt_entry->pt_name; pt_entry++) { if (pt_type == pt_entry->pt_type) break; } #ifdef DEBUG if (be32_to_cpu(vh->vh_pt[i].pt_nblks)) { pr_debug("pt %2d: start: %08d size: %08d type: 0x%02x (%s)\n", i, (int)be32_to_cpu(vh->vh_pt[i].pt_firstlbn), (int)be32_to_cpu(vh->vh_pt[i].pt_nblks), pt_type, (pt_entry->pt_name) ? pt_entry->pt_name : "unknown"); } #endif if (IS_EFS(pt_type)) { sblock = be32_to_cpu(vh->vh_pt[i].pt_firstlbn); slice = i; } } if (slice == -1) { pr_notice("partition table contained no EFS partitions\n"); #ifdef DEBUG } else { pr_info("using slice %d (type %s, offset 0x%x)\n", slice, (pt_entry->pt_name) ? pt_entry->pt_name : "unknown", sblock); #endif } return sblock; } static int efs_validate_super(struct efs_sb_info *sb, struct efs_super *super) { if (!IS_EFS_MAGIC(be32_to_cpu(super->fs_magic))) return -1; sb->fs_magic = be32_to_cpu(super->fs_magic); sb->total_blocks = be32_to_cpu(super->fs_size); sb->first_block = be32_to_cpu(super->fs_firstcg); sb->group_size = be32_to_cpu(super->fs_cgfsize); sb->data_free = be32_to_cpu(super->fs_tfree); sb->inode_free = be32_to_cpu(super->fs_tinode); sb->inode_blocks = be16_to_cpu(super->fs_cgisize); sb->total_groups = be16_to_cpu(super->fs_ncg); return 0; } static int efs_fill_super(struct super_block *s, void *d, int silent) { struct efs_sb_info *sb; struct buffer_head *bh; struct inode *root; sb = kzalloc(sizeof(struct efs_sb_info), GFP_KERNEL); if (!sb) return -ENOMEM; s->s_fs_info = sb; s->s_time_min = 0; s->s_time_max = U32_MAX; s->s_magic = EFS_SUPER_MAGIC; if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) { pr_err("device does not support %d byte blocks\n", EFS_BLOCKSIZE); return -EINVAL; } /* read the vh (volume header) block */ bh = sb_bread(s, 0); if (!bh) { pr_err("cannot read volume header\n"); return -EIO; } /* * if this returns zero then we didn't find any partition table. * this isn't (yet) an error - just assume for the moment that * the device is valid and go on to search for a superblock. */ sb->fs_start = efs_validate_vh((struct volume_header *) bh->b_data); brelse(bh); if (sb->fs_start == -1) { return -EINVAL; } bh = sb_bread(s, sb->fs_start + EFS_SUPER); if (!bh) { pr_err("cannot read superblock\n"); return -EIO; } if (efs_validate_super(sb, (struct efs_super *) bh->b_data)) { #ifdef DEBUG pr_warn("invalid superblock at block %u\n", sb->fs_start + EFS_SUPER); #endif brelse(bh); return -EINVAL; } brelse(bh); if (!sb_rdonly(s)) { #ifdef DEBUG pr_info("forcing read-only mode\n"); #endif s->s_flags |= SB_RDONLY; } s->s_op = &efs_superblock_operations; s->s_export_op = &efs_export_ops; root = efs_iget(s, EFS_ROOTINODE); if (IS_ERR(root)) { pr_err("get root inode failed\n"); return PTR_ERR(root); } s->s_root = d_make_root(root); if (!(s->s_root)) { pr_err("get root dentry failed\n"); return -ENOMEM; } return 0; } static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct efs_sb_info *sbi = SUPER_INFO(sb); u64 id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = EFS_SUPER_MAGIC; /* efs magic number */ buf->f_bsize = EFS_BLOCKSIZE; /* blocksize */ buf->f_blocks = sbi->total_groups * /* total data blocks */ (sbi->group_size - sbi->inode_blocks); buf->f_bfree = sbi->data_free; /* free data blocks */ buf->f_bavail = sbi->data_free; /* free blocks for non-root */ buf->f_files = sbi->total_groups * /* total inodes */ sbi->inode_blocks * (EFS_BLOCKSIZE / sizeof(struct efs_dinode)); buf->f_ffree = sbi->inode_free; /* free inodes */ buf->f_fsid = u64_to_fsid(id); buf->f_namelen = EFS_MAXNAMELEN; /* max filename length */ return 0; }
81 76 81 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 // SPDX-License-Identifier: GPL-2.0-or-later /* mpihelp-mul_1.c - MPI helper functions * Copyright (C) 1994, 1996, 1997, 1998, 2001 Free Software Foundation, Inc. * * This file is part of GnuPG. * * Note: This code is heavily based on the GNU MP Library. * Actually it's the same code with only minor changes in the * way the data is stored; this is to support the abstraction * of an optional secure memory allocation which may be used * to avoid revealing of sensitive data due to paging etc. * The GNU MP Library itself is published under the LGPL; * however I decided to publish this code under the plain GPL. */ #include "mpi-internal.h" #include "longlong.h" mpi_limb_t mpihelp_mul_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size, mpi_limb_t s2_limb) { mpi_limb_t cy_limb; mpi_size_t j; mpi_limb_t prod_high, prod_low; /* The loop counter and index J goes from -S1_SIZE to -1. This way * the loop becomes faster. */ j = -s1_size; /* Offset the base pointers to compensate for the negative indices. */ s1_ptr -= j; res_ptr -= j; cy_limb = 0; do { umul_ppmm(prod_high, prod_low, s1_ptr[j], s2_limb); prod_low += cy_limb; cy_limb = (prod_low < cy_limb ? 1 : 0) + prod_high; res_ptr[j] = prod_low; } while (++j); return cy_limb; }
2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 // SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2003-2008 Takahiro Hirofuchi * Copyright (C) 2015 Nobuo Iwata */ #include <linux/kthread.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/workqueue.h> #include "usbip_common.h" struct usbip_event { struct list_head node; struct usbip_device *ud; }; static DEFINE_SPINLOCK(event_lock); static LIST_HEAD(event_list); static void set_event(struct usbip_device *ud, unsigned long event) { unsigned long flags; spin_lock_irqsave(&ud->lock, flags); ud->event |= event; spin_unlock_irqrestore(&ud->lock, flags); } static void unset_event(struct usbip_device *ud, unsigned long event) { unsigned long flags; spin_lock_irqsave(&ud->lock, flags); ud->event &= ~event; spin_unlock_irqrestore(&ud->lock, flags); } static struct usbip_device *get_event(void) { struct usbip_event *ue = NULL; struct usbip_device *ud = NULL; unsigned long flags; spin_lock_irqsave(&event_lock, flags); if (!list_empty(&event_list)) { ue = list_first_entry(&event_list, struct usbip_event, node); list_del(&ue->node); } spin_unlock_irqrestore(&event_lock, flags); if (ue) { ud = ue->ud; kfree(ue); } return ud; } static struct task_struct *worker_context; static void event_handler(struct work_struct *work) { struct usbip_device *ud; if (worker_context == NULL) { worker_context = current; } while ((ud = get_event()) != NULL) { usbip_dbg_eh("pending event %lx\n", ud->event); mutex_lock(&ud->sysfs_lock); /* * NOTE: shutdown must come first. * Shutdown the device. */ if (ud->event & USBIP_EH_SHUTDOWN) { ud->eh_ops.shutdown(ud); unset_event(ud, USBIP_EH_SHUTDOWN); } /* Reset the device. */ if (ud->event & USBIP_EH_RESET) { ud->eh_ops.reset(ud); unset_event(ud, USBIP_EH_RESET); } /* Mark the device as unusable. */ if (ud->event & USBIP_EH_UNUSABLE) { ud->eh_ops.unusable(ud); unset_event(ud, USBIP_EH_UNUSABLE); } mutex_unlock(&ud->sysfs_lock); wake_up(&ud->eh_waitq); } } int usbip_start_eh(struct usbip_device *ud) { init_waitqueue_head(&ud->eh_waitq); ud->event = 0; return 0; } EXPORT_SYMBOL_GPL(usbip_start_eh); void usbip_stop_eh(struct usbip_device *ud) { unsigned long pending = ud->event & ~USBIP_EH_BYE; if (!(ud->event & USBIP_EH_BYE)) usbip_dbg_eh("usbip_eh stopping but not removed\n"); if (pending) usbip_dbg_eh("usbip_eh waiting completion %lx\n", pending); wait_event_interruptible(ud->eh_waitq, !(ud->event & ~USBIP_EH_BYE)); usbip_dbg_eh("usbip_eh has stopped\n"); } EXPORT_SYMBOL_GPL(usbip_stop_eh); #define WORK_QUEUE_NAME "usbip_event" static struct workqueue_struct *usbip_queue; static DECLARE_WORK(usbip_work, event_handler); int usbip_init_eh(void) { usbip_queue = create_singlethread_workqueue(WORK_QUEUE_NAME); if (usbip_queue == NULL) { pr_err("failed to create usbip_event\n"); return -ENOMEM; } return 0; } void usbip_finish_eh(void) { destroy_workqueue(usbip_queue); usbip_queue = NULL; } void usbip_event_add(struct usbip_device *ud, unsigned long event) { struct usbip_event *ue; unsigned long flags; if (ud->event & USBIP_EH_BYE) return; set_event(ud, event); spin_lock_irqsave(&event_lock, flags); list_for_each_entry_reverse(ue, &event_list, node) { if (ue->ud == ud) goto out; } ue = kmalloc(sizeof(struct usbip_event), GFP_ATOMIC); if (ue == NULL) goto out; ue->ud = ud; list_add_tail(&ue->node, &event_list); queue_work(usbip_queue, &usbip_work); out: spin_unlock_irqrestore(&event_lock, flags); } EXPORT_SYMBOL_GPL(usbip_event_add); int usbip_event_happened(struct usbip_device *ud) { int happened = 0; unsigned long flags; spin_lock_irqsave(&ud->lock, flags); if (ud->event != 0) happened = 1; spin_unlock_irqrestore(&ud->lock, flags); return happened; } EXPORT_SYMBOL_GPL(usbip_event_happened); int usbip_in_eh(struct task_struct *task) { if (task == worker_context) return 1; return 0; } EXPORT_SYMBOL_GPL(usbip_in_eh);
11 3 3 13 2 2 3 2 1 1 13 4 4 4 6 6 4 10 3 1 1 1 1 1 27 11 11 16 16 4 6 6 3 1 1 1 3 250 250 76 200 9 9 1 1 7 5 5 1 1 2 1 8 8 1 7 4 4 1 2 1 16 12 12 3 1 1 2 10 4 4 4 14 1 15 4 10 7 1 6 6 6 4 6 3 3 6 6 1 1 1 1 2 2 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 2 1 1 1 1 3 1 2 4 1 1 1 2 3 1 2 8 1 4 1 1 1 4 1 3 17 1 10 4 2 5 2 3 3 1 1 3 1 1 1 1 1 13 13 3 3 8 8 23 1 1 6 1 5 3 1 3 4 4 4 3 1 2 2 2 3 1 2 1 1 3 4 4 1 1 1 1 1 1 4 4 2 2 1 1 1 1 1 1 1 1 1 1 1 1 33 1 30 1 1 1 1 1 1 1 1 2 1 1 3 1 1 1 4 150 100 183 64 247 37 8 4 16 2 1 1 3 1 2 1 1 1 3 1 1 3 5 3 8 1 3 1 13 1 6 3 1 3 1 1 4 1 7 1 1 2 1 2 3 4 17 1 1 1 4 1 1 1 1 4 33 9 5 1 1 2 3 248 246 183 64 254 2 248 4 252 136 92 2 2 14 1 247 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/core/ethtool.c - Ethtool ioctl handler * Copyright (c) 2003 Matthew Wilcox <matthew@wil.cx> * * This file is where we call all the ethtool_ops commands to get * the information ethtool needs. */ #include <linux/compat.h> #include <linux/etherdevice.h> #include <linux/module.h> #include <linux/types.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/ethtool.h> #include <linux/netdevice.h> #include <linux/net_tstamp.h> #include <linux/phy.h> #include <linux/bitops.h> #include <linux/uaccess.h> #include <linux/vmalloc.h> #include <linux/sfp.h> #include <linux/slab.h> #include <linux/rtnetlink.h> #include <linux/sched/signal.h> #include <linux/net.h> #include <linux/pm_runtime.h> #include <net/devlink.h> #include <net/ipv6.h> #include <net/xdp_sock_drv.h> #include <net/flow_offload.h> #include <linux/ethtool_netlink.h> #include <generated/utsrelease.h> #include "common.h" /* State held across locks and calls for commands which have devlink fallback */ struct ethtool_devlink_compat { struct devlink *devlink; union { struct ethtool_flash efl; struct ethtool_drvinfo info; }; }; static struct devlink *netdev_to_devlink_get(struct net_device *dev) { if (!dev->devlink_port) return NULL; return devlink_try_get(dev->devlink_port->devlink); } /* * Some useful ethtool_ops methods that're device independent. * If we find that all drivers want to do the same thing here, * we can turn these into dev_() function calls. */ u32 ethtool_op_get_link(struct net_device *dev) { /* Synchronize carrier state with link watch, see also rtnl_getlink() */ linkwatch_sync_dev(dev); return netif_carrier_ok(dev) ? 1 : 0; } EXPORT_SYMBOL(ethtool_op_get_link); int ethtool_op_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info) { info->so_timestamping = SOF_TIMESTAMPING_TX_SOFTWARE | SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE; info->phc_index = -1; return 0; } EXPORT_SYMBOL(ethtool_op_get_ts_info); /* Handlers for each ethtool command */ static int ethtool_get_features(struct net_device *dev, void __user *useraddr) { struct ethtool_gfeatures cmd = { .cmd = ETHTOOL_GFEATURES, .size = ETHTOOL_DEV_FEATURE_WORDS, }; struct ethtool_get_features_block features[ETHTOOL_DEV_FEATURE_WORDS]; u32 __user *sizeaddr; u32 copy_size; int i; /* in case feature bits run out again */ BUILD_BUG_ON(ETHTOOL_DEV_FEATURE_WORDS * sizeof(u32) > sizeof(netdev_features_t)); for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) { features[i].available = (u32)(dev->hw_features >> (32 * i)); features[i].requested = (u32)(dev->wanted_features >> (32 * i)); features[i].active = (u32)(dev->features >> (32 * i)); features[i].never_changed = (u32)(NETIF_F_NEVER_CHANGE >> (32 * i)); } sizeaddr = useraddr + offsetof(struct ethtool_gfeatures, size); if (get_user(copy_size, sizeaddr)) return -EFAULT; if (copy_size > ETHTOOL_DEV_FEATURE_WORDS) copy_size = ETHTOOL_DEV_FEATURE_WORDS; if (copy_to_user(useraddr, &cmd, sizeof(cmd))) return -EFAULT; useraddr += sizeof(cmd); if (copy_to_user(useraddr, features, array_size(copy_size, sizeof(*features)))) return -EFAULT; return 0; } static int ethtool_set_features(struct net_device *dev, void __user *useraddr) { struct ethtool_sfeatures cmd; struct ethtool_set_features_block features[ETHTOOL_DEV_FEATURE_WORDS]; netdev_features_t wanted = 0, valid = 0; int i, ret = 0; if (copy_from_user(&cmd, useraddr, sizeof(cmd))) return -EFAULT; useraddr += sizeof(cmd); if (cmd.size != ETHTOOL_DEV_FEATURE_WORDS) return -EINVAL; if (copy_from_user(features, useraddr, sizeof(features))) return -EFAULT; for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) { valid |= (netdev_features_t)features[i].valid << (32 * i); wanted |= (netdev_features_t)features[i].requested << (32 * i); } if (valid & ~NETIF_F_ETHTOOL_BITS) return -EINVAL; if (valid & ~dev->hw_features) { valid &= dev->hw_features; ret |= ETHTOOL_F_UNSUPPORTED; } dev->wanted_features &= ~valid; dev->wanted_features |= wanted & valid; __netdev_update_features(dev); if ((dev->wanted_features ^ dev->features) & valid) ret |= ETHTOOL_F_WISH; return ret; } static int __ethtool_get_sset_count(struct net_device *dev, int sset) { const struct ethtool_phy_ops *phy_ops = ethtool_phy_ops; const struct ethtool_ops *ops = dev->ethtool_ops; if (sset == ETH_SS_FEATURES) return ARRAY_SIZE(netdev_features_strings); if (sset == ETH_SS_RSS_HASH_FUNCS) return ARRAY_SIZE(rss_hash_func_strings); if (sset == ETH_SS_TUNABLES) return ARRAY_SIZE(tunable_strings); if (sset == ETH_SS_PHY_TUNABLES) return ARRAY_SIZE(phy_tunable_strings); if (sset == ETH_SS_PHY_STATS && dev->phydev && !ops->get_ethtool_phy_stats && phy_ops && phy_ops->get_sset_count) return phy_ops->get_sset_count(dev->phydev); if (sset == ETH_SS_LINK_MODES) return __ETHTOOL_LINK_MODE_MASK_NBITS; if (ops->get_sset_count && ops->get_strings) return ops->get_sset_count(dev, sset); else return -EOPNOTSUPP; } static void __ethtool_get_strings(struct net_device *dev, u32 stringset, u8 *data) { const struct ethtool_phy_ops *phy_ops = ethtool_phy_ops; const struct ethtool_ops *ops = dev->ethtool_ops; if (stringset == ETH_SS_FEATURES) memcpy(data, netdev_features_strings, sizeof(netdev_features_strings)); else if (stringset == ETH_SS_RSS_HASH_FUNCS) memcpy(data, rss_hash_func_strings, sizeof(rss_hash_func_strings)); else if (stringset == ETH_SS_TUNABLES) memcpy(data, tunable_strings, sizeof(tunable_strings)); else if (stringset == ETH_SS_PHY_TUNABLES) memcpy(data, phy_tunable_strings, sizeof(phy_tunable_strings)); else if (stringset == ETH_SS_PHY_STATS && dev->phydev && !ops->get_ethtool_phy_stats && phy_ops && phy_ops->get_strings) phy_ops->get_strings(dev->phydev, data); else if (stringset == ETH_SS_LINK_MODES) memcpy(data, link_mode_names, __ETHTOOL_LINK_MODE_MASK_NBITS * ETH_GSTRING_LEN); else /* ops->get_strings is valid because checked earlier */ ops->get_strings(dev, stringset, data); } static netdev_features_t ethtool_get_feature_mask(u32 eth_cmd) { /* feature masks of legacy discrete ethtool ops */ switch (eth_cmd) { case ETHTOOL_GTXCSUM: case ETHTOOL_STXCSUM: return NETIF_F_CSUM_MASK | NETIF_F_FCOE_CRC | NETIF_F_SCTP_CRC; case ETHTOOL_GRXCSUM: case ETHTOOL_SRXCSUM: return NETIF_F_RXCSUM; case ETHTOOL_GSG: case ETHTOOL_SSG: return NETIF_F_SG | NETIF_F_FRAGLIST; case ETHTOOL_GTSO: case ETHTOOL_STSO: return NETIF_F_ALL_TSO; case ETHTOOL_GGSO: case ETHTOOL_SGSO: return NETIF_F_GSO; case ETHTOOL_GGRO: case ETHTOOL_SGRO: return NETIF_F_GRO; default: BUG(); } } static int ethtool_get_one_feature(struct net_device *dev, char __user *useraddr, u32 ethcmd) { netdev_features_t mask = ethtool_get_feature_mask(ethcmd); struct ethtool_value edata = { .cmd = ethcmd, .data = !!(dev->features & mask), }; if (copy_to_user(useraddr, &edata, sizeof(edata))) return -EFAULT; return 0; } static int ethtool_set_one_feature(struct net_device *dev, void __user *useraddr, u32 ethcmd) { struct ethtool_value edata; netdev_features_t mask; if (copy_from_user(&edata, useraddr, sizeof(edata))) return -EFAULT; mask = ethtool_get_feature_mask(ethcmd); mask &= dev->hw_features; if (!mask) return -EOPNOTSUPP; if (edata.data) dev->wanted_features |= mask; else dev->wanted_features &= ~mask; __netdev_update_features(dev); return 0; } #define ETH_ALL_FLAGS (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | \ ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH) #define ETH_ALL_FEATURES (NETIF_F_LRO | NETIF_F_HW_VLAN_CTAG_RX | \ NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_NTUPLE | \ NETIF_F_RXHASH) static u32 __ethtool_get_flags(struct net_device *dev) { u32 flags = 0; if (dev->features & NETIF_F_LRO) flags |= ETH_FLAG_LRO; if (dev->features & NETIF_F_HW_VLAN_CTAG_RX) flags |= ETH_FLAG_RXVLAN; if (dev->features & NETIF_F_HW_VLAN_CTAG_TX) flags |= ETH_FLAG_TXVLAN; if (dev->features & NETIF_F_NTUPLE) flags |= ETH_FLAG_NTUPLE; if (dev->features & NETIF_F_RXHASH) flags |= ETH_FLAG_RXHASH; return flags; } static int __ethtool_set_flags(struct net_device *dev, u32 data) { netdev_features_t features = 0, changed; if (data & ~ETH_ALL_FLAGS) return -EINVAL; if (data & ETH_FLAG_LRO) features |= NETIF_F_LRO; if (data & ETH_FLAG_RXVLAN) features |= NETIF_F_HW_VLAN_CTAG_RX; if (data & ETH_FLAG_TXVLAN) features |= NETIF_F_HW_VLAN_CTAG_TX; if (data & ETH_FLAG_NTUPLE) features |= NETIF_F_NTUPLE; if (data & ETH_FLAG_RXHASH) features |= NETIF_F_RXHASH; /* allow changing only bits set in hw_features */ changed = (features ^ dev->features) & ETH_ALL_FEATURES; if (changed & ~dev->hw_features) return (changed & dev->hw_features) ? -EINVAL : -EOPNOTSUPP; dev->wanted_features = (dev->wanted_features & ~changed) | (features & changed); __netdev_update_features(dev); return 0; } /* Given two link masks, AND them together and save the result in dst. */ void ethtool_intersect_link_masks(struct ethtool_link_ksettings *dst, struct ethtool_link_ksettings *src) { unsigned int size = BITS_TO_LONGS(__ETHTOOL_LINK_MODE_MASK_NBITS); unsigned int idx = 0; for (; idx < size; idx++) { dst->link_modes.supported[idx] &= src->link_modes.supported[idx]; dst->link_modes.advertising[idx] &= src->link_modes.advertising[idx]; } } EXPORT_SYMBOL(ethtool_intersect_link_masks); void ethtool_convert_legacy_u32_to_link_mode(unsigned long *dst, u32 legacy_u32) { linkmode_zero(dst); dst[0] = legacy_u32; } EXPORT_SYMBOL(ethtool_convert_legacy_u32_to_link_mode); /* return false if src had higher bits set. lower bits always updated. */ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, const unsigned long *src) { *legacy_u32 = src[0]; return find_next_bit(src, __ETHTOOL_LINK_MODE_MASK_NBITS, 32) == __ETHTOOL_LINK_MODE_MASK_NBITS; } EXPORT_SYMBOL(ethtool_convert_link_mode_to_legacy_u32); /* return false if ksettings link modes had higher bits * set. legacy_settings always updated (best effort) */ static bool convert_link_ksettings_to_legacy_settings( struct ethtool_cmd *legacy_settings, const struct ethtool_link_ksettings *link_ksettings) { bool retval = true; memset(legacy_settings, 0, sizeof(*legacy_settings)); /* this also clears the deprecated fields in legacy structure: * __u8 transceiver; * __u32 maxtxpkt; * __u32 maxrxpkt; */ retval &= ethtool_convert_link_mode_to_legacy_u32( &legacy_settings->supported, link_ksettings->link_modes.supported); retval &= ethtool_convert_link_mode_to_legacy_u32( &legacy_settings->advertising, link_ksettings->link_modes.advertising); retval &= ethtool_convert_link_mode_to_legacy_u32( &legacy_settings->lp_advertising, link_ksettings->link_modes.lp_advertising); ethtool_cmd_speed_set(legacy_settings, link_ksettings->base.speed); legacy_settings->duplex = link_ksettings->base.duplex; legacy_settings->port = link_ksettings->base.port; legacy_settings->phy_address = link_ksettings->base.phy_address; legacy_settings->autoneg = link_ksettings->base.autoneg; legacy_settings->mdio_support = link_ksettings->base.mdio_support; legacy_settings->eth_tp_mdix = link_ksettings->base.eth_tp_mdix; legacy_settings->eth_tp_mdix_ctrl = link_ksettings->base.eth_tp_mdix_ctrl; legacy_settings->transceiver = link_ksettings->base.transceiver; return retval; } /* number of 32-bit words to store the user's link mode bitmaps */ #define __ETHTOOL_LINK_MODE_MASK_NU32 \ DIV_ROUND_UP(__ETHTOOL_LINK_MODE_MASK_NBITS, 32) /* layout of the struct passed from/to userland */ struct ethtool_link_usettings { struct ethtool_link_settings base; struct { __u32 supported[__ETHTOOL_LINK_MODE_MASK_NU32]; __u32 advertising[__ETHTOOL_LINK_MODE_MASK_NU32]; __u32 lp_advertising[__ETHTOOL_LINK_MODE_MASK_NU32]; } link_modes; }; /* Internal kernel helper to query a device ethtool_link_settings. */ int __ethtool_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *link_ksettings) { ASSERT_RTNL(); if (!dev->ethtool_ops->get_link_ksettings) return -EOPNOTSUPP; memset(link_ksettings, 0, sizeof(*link_ksettings)); return dev->ethtool_ops->get_link_ksettings(dev, link_ksettings); } EXPORT_SYMBOL(__ethtool_get_link_ksettings); /* convert ethtool_link_usettings in user space to a kernel internal * ethtool_link_ksettings. return 0 on success, errno on error. */ static int load_link_ksettings_from_user(struct ethtool_link_ksettings *to, const void __user *from) { struct ethtool_link_usettings link_usettings; if (copy_from_user(&link_usettings, from, sizeof(link_usettings))) return -EFAULT; memcpy(&to->base, &link_usettings.base, sizeof(to->base)); bitmap_from_arr32(to->link_modes.supported, link_usettings.link_modes.supported, __ETHTOOL_LINK_MODE_MASK_NBITS); bitmap_from_arr32(to->link_modes.advertising, link_usettings.link_modes.advertising, __ETHTOOL_LINK_MODE_MASK_NBITS); bitmap_from_arr32(to->link_modes.lp_advertising, link_usettings.link_modes.lp_advertising, __ETHTOOL_LINK_MODE_MASK_NBITS); return 0; } /* Check if the user is trying to change anything besides speed/duplex */ bool ethtool_virtdev_validate_cmd(const struct ethtool_link_ksettings *cmd) { struct ethtool_link_settings base2 = {}; base2.speed = cmd->base.speed; base2.port = PORT_OTHER; base2.duplex = cmd->base.duplex; base2.cmd = cmd->base.cmd; base2.link_mode_masks_nwords = cmd->base.link_mode_masks_nwords; return !memcmp(&base2, &cmd->base, sizeof(base2)) && bitmap_empty(cmd->link_modes.supported, __ETHTOOL_LINK_MODE_MASK_NBITS) && bitmap_empty(cmd->link_modes.lp_advertising, __ETHTOOL_LINK_MODE_MASK_NBITS); } /* convert a kernel internal ethtool_link_ksettings to * ethtool_link_usettings in user space. return 0 on success, errno on * error. */ static int store_link_ksettings_for_user(void __user *to, const struct ethtool_link_ksettings *from) { struct ethtool_link_usettings link_usettings; memcpy(&link_usettings, from, sizeof(link_usettings)); bitmap_to_arr32(link_usettings.link_modes.supported, from->link_modes.supported, __ETHTOOL_LINK_MODE_MASK_NBITS); bitmap_to_arr32(link_usettings.link_modes.advertising, from->link_modes.advertising, __ETHTOOL_LINK_MODE_MASK_NBITS); bitmap_to_arr32(link_usettings.link_modes.lp_advertising, from->link_modes.lp_advertising, __ETHTOOL_LINK_MODE_MASK_NBITS); if (copy_to_user(to, &link_usettings, sizeof(link_usettings))) return -EFAULT; return 0; } /* Query device for its ethtool_link_settings. */ static int ethtool_get_link_ksettings(struct net_device *dev, void __user *useraddr) { int err = 0; struct ethtool_link_ksettings link_ksettings; ASSERT_RTNL(); if (!dev->ethtool_ops->get_link_ksettings) return -EOPNOTSUPP; /* handle bitmap nbits handshake */ if (copy_from_user(&link_ksettings.base, useraddr, sizeof(link_ksettings.base))) return -EFAULT; if (__ETHTOOL_LINK_MODE_MASK_NU32 != link_ksettings.base.link_mode_masks_nwords) { /* wrong link mode nbits requested */ memset(&link_ksettings, 0, sizeof(link_ksettings)); link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS; /* send back number of words required as negative val */ compiletime_assert(__ETHTOOL_LINK_MODE_MASK_NU32 <= S8_MAX, "need too many bits for link modes!"); link_ksettings.base.link_mode_masks_nwords = -((s8)__ETHTOOL_LINK_MODE_MASK_NU32); /* copy the base fields back to user, not the link * mode bitmaps */ if (copy_to_user(useraddr, &link_ksettings.base, sizeof(link_ksettings.base))) return -EFAULT; return 0; } /* handshake successful: user/kernel agree on * link_mode_masks_nwords */ memset(&link_ksettings, 0, sizeof(link_ksettings)); err = dev->ethtool_ops->get_link_ksettings(dev, &link_ksettings); if (err < 0) return err; /* make sure we tell the right values to user */ link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS; link_ksettings.base.link_mode_masks_nwords = __ETHTOOL_LINK_MODE_MASK_NU32; link_ksettings.base.master_slave_cfg = MASTER_SLAVE_CFG_UNSUPPORTED; link_ksettings.base.master_slave_state = MASTER_SLAVE_STATE_UNSUPPORTED; link_ksettings.base.rate_matching = RATE_MATCH_NONE; return store_link_ksettings_for_user(useraddr, &link_ksettings); } /* Update device ethtool_link_settings. */ static int ethtool_set_link_ksettings(struct net_device *dev, void __user *useraddr) { struct ethtool_link_ksettings link_ksettings = {}; int err; ASSERT_RTNL(); if (!dev->ethtool_ops->set_link_ksettings) return -EOPNOTSUPP; /* make sure nbits field has expected value */ if (copy_from_user(&link_ksettings.base, useraddr, sizeof(link_ksettings.base))) return -EFAULT; if (__ETHTOOL_LINK_MODE_MASK_NU32 != link_ksettings.base.link_mode_masks_nwords) return -EINVAL; /* copy the whole structure, now that we know it has expected * format */ err = load_link_ksettings_from_user(&link_ksettings, useraddr); if (err) return err; /* re-check nwords field, just in case */ if (__ETHTOOL_LINK_MODE_MASK_NU32 != link_ksettings.base.link_mode_masks_nwords) return -EINVAL; if (link_ksettings.base.master_slave_cfg || link_ksettings.base.master_slave_state) return -EINVAL; err = dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); if (err >= 0) { ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF, NULL); ethtool_notify(dev, ETHTOOL_MSG_LINKMODES_NTF, NULL); } return err; } int ethtool_virtdev_set_link_ksettings(struct net_device *dev, const struct ethtool_link_ksettings *cmd, u32 *dev_speed, u8 *dev_duplex) { u32 speed; u8 duplex; speed = cmd->base.speed; duplex = cmd->base.duplex; /* don't allow custom speed and duplex */ if (!ethtool_validate_speed(speed) || !ethtool_validate_duplex(duplex) || !ethtool_virtdev_validate_cmd(cmd)) return -EINVAL; *dev_speed = speed; *dev_duplex = duplex; return 0; } EXPORT_SYMBOL(ethtool_virtdev_set_link_ksettings); /* Query device for its ethtool_cmd settings. * * Backward compatibility note: for compatibility with legacy ethtool, this is * now implemented via get_link_ksettings. When driver reports higher link mode * bits, a kernel warning is logged once (with name of 1st driver/device) to * recommend user to upgrade ethtool, but the command is successful (only the * lower link mode bits reported back to user). Deprecated fields from * ethtool_cmd (transceiver/maxrxpkt/maxtxpkt) are always set to zero. */ static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) { struct ethtool_link_ksettings link_ksettings; struct ethtool_cmd cmd; int err; ASSERT_RTNL(); if (!dev->ethtool_ops->get_link_ksettings) return -EOPNOTSUPP; memset(&link_ksettings, 0, sizeof(link_ksettings)); err = dev->ethtool_ops->get_link_ksettings(dev, &link_ksettings); if (err < 0) return err; convert_link_ksettings_to_legacy_settings(&cmd, &link_ksettings); /* send a sensible cmd tag back to user */ cmd.cmd = ETHTOOL_GSET; if (copy_to_user(useraddr, &cmd, sizeof(cmd))) return -EFAULT; return 0; } /* Update device link settings with given ethtool_cmd. * * Backward compatibility note: for compatibility with legacy ethtool, this is * now always implemented via set_link_settings. When user's request updates * deprecated ethtool_cmd fields (transceiver/maxrxpkt/maxtxpkt), a kernel * warning is logged once (with name of 1st driver/device) to recommend user to * upgrade ethtool, and the request is rejected. */ static int ethtool_set_settings(struct net_device *dev, void __user *useraddr) { struct ethtool_link_ksettings link_ksettings; struct ethtool_cmd cmd; int ret; ASSERT_RTNL(); if (copy_from_user(&cmd, useraddr, sizeof(cmd))) return -EFAULT; if (!dev->ethtool_ops->set_link_ksettings) return -EOPNOTSUPP; if (!convert_legacy_settings_to_link_ksettings(&link_ksettings, &cmd)) return -EINVAL; link_ksettings.base.link_mode_masks_nwords = __ETHTOOL_LINK_MODE_MASK_NU32; ret = dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); if (ret >= 0) { ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF, NULL); ethtool_notify(dev, ETHTOOL_MSG_LINKMODES_NTF, NULL); } return ret; } static int ethtool_get_drvinfo(struct net_device *dev, struct ethtool_devlink_compat *rsp) { const struct ethtool_ops *ops = dev->ethtool_ops; struct device *parent = dev->dev.parent; rsp->info.cmd = ETHTOOL_GDRVINFO; strscpy(rsp->info.version, UTS_RELEASE, sizeof(rsp->info.version)); if (ops->get_drvinfo) { ops->get_drvinfo(dev, &rsp->info); if (!rsp->info.bus_info[0] && parent) strscpy(rsp->info.bus_info, dev_name(parent), sizeof(rsp->info.bus_info)); if (!rsp->info.driver[0] && parent && parent->driver) strscpy(rsp->info.driver, parent->driver->name, sizeof(rsp->info.driver)); } else if (parent && parent->driver) { strscpy(rsp->info.bus_info, dev_name(parent), sizeof(rsp->info.bus_info)); strscpy(rsp->info.driver, parent->driver->name, sizeof(rsp->info.driver)); } else if (dev->rtnl_link_ops) { strscpy(rsp->info.driver, dev->rtnl_link_ops->kind, sizeof(rsp->info.driver)); } else { return -EOPNOTSUPP; } /* * this method of obtaining string set info is deprecated; * Use ETHTOOL_GSSET_INFO instead. */ if (ops->get_sset_count) { int rc; rc = ops->get_sset_count(dev, ETH_SS_TEST); if (rc >= 0) rsp->info.testinfo_len = rc; rc = ops->get_sset_count(dev, ETH_SS_STATS); if (rc >= 0) rsp->info.n_stats = rc; rc = ops->get_sset_count(dev, ETH_SS_PRIV_FLAGS); if (rc >= 0) rsp->info.n_priv_flags = rc; } if (ops->get_regs_len) { int ret = ops->get_regs_len(dev); if (ret > 0) rsp->info.regdump_len = ret; } if (ops->get_eeprom_len) rsp->info.eedump_len = ops->get_eeprom_len(dev); if (!rsp->info.fw_version[0]) rsp->devlink = netdev_to_devlink_get(dev); return 0; } static noinline_for_stack int ethtool_get_sset_info(struct net_device *dev, void __user *useraddr) { struct ethtool_sset_info info; u64 sset_mask; int i, idx = 0, n_bits = 0, ret, rc; u32 *info_buf = NULL; if (copy_from_user(&info, useraddr, sizeof(info))) return -EFAULT; /* store copy of mask, because we zero struct later on */ sset_mask = info.sset_mask; if (!sset_mask) return 0; /* calculate size of return buffer */ n_bits = hweight64(sset_mask); memset(&info, 0, sizeof(info)); info.cmd = ETHTOOL_GSSET_INFO; info_buf = kcalloc(n_bits, sizeof(u32), GFP_USER); if (!info_buf) return -ENOMEM; /* * fill return buffer based on input bitmask and successful * get_sset_count return */ for (i = 0; i < 64; i++) { if (!(sset_mask & (1ULL << i))) continue; rc = __ethtool_get_sset_count(dev, i); if (rc >= 0) { info.sset_mask |= (1ULL << i); info_buf[idx++] = rc; } } ret = -EFAULT; if (copy_to_user(useraddr, &info, sizeof(info))) goto out; useraddr += offsetof(struct ethtool_sset_info, data); if (copy_to_user(useraddr, info_buf, array_size(idx, sizeof(u32)))) goto out; ret = 0; out: kfree(info_buf); return ret; } static noinline_for_stack int ethtool_rxnfc_copy_from_compat(struct ethtool_rxnfc *rxnfc, const struct compat_ethtool_rxnfc __user *useraddr, size_t size) { struct compat_ethtool_rxnfc crxnfc = {}; /* We expect there to be holes between fs.m_ext and * fs.ring_cookie and at the end of fs, but nowhere else. * On non-x86, no conversion should be needed. */ BUILD_BUG_ON(!IS_ENABLED(CONFIG_X86_64) && sizeof(struct compat_ethtool_rxnfc) != sizeof(struct ethtool_rxnfc)); BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.m_ext) + sizeof(useraddr->fs.m_ext) != offsetof(struct ethtool_rxnfc, fs.m_ext) + sizeof(rxnfc->fs.m_ext)); BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.location) - offsetof(struct compat_ethtool_rxnfc, fs.ring_cookie) != offsetof(struct ethtool_rxnfc, fs.location) - offsetof(struct ethtool_rxnfc, fs.ring_cookie)); if (copy_from_user(&crxnfc, useraddr, min(size, sizeof(crxnfc)))) return -EFAULT; *rxnfc = (struct ethtool_rxnfc) { .cmd = crxnfc.cmd, .flow_type = crxnfc.flow_type, .data = crxnfc.data, .fs = { .flow_type = crxnfc.fs.flow_type, .h_u = crxnfc.fs.h_u, .h_ext = crxnfc.fs.h_ext, .m_u = crxnfc.fs.m_u, .m_ext = crxnfc.fs.m_ext, .ring_cookie = crxnfc.fs.ring_cookie, .location = crxnfc.fs.location, }, .rule_cnt = crxnfc.rule_cnt, }; return 0; } static int ethtool_rxnfc_copy_from_user(struct ethtool_rxnfc *rxnfc, const void __user *useraddr, size_t size) { if (compat_need_64bit_alignment_fixup()) return ethtool_rxnfc_copy_from_compat(rxnfc, useraddr, size); if (copy_from_user(rxnfc, useraddr, size)) return -EFAULT; return 0; } static int ethtool_rxnfc_copy_to_compat(void __user *useraddr, const struct ethtool_rxnfc *rxnfc, size_t size, const u32 *rule_buf) { struct compat_ethtool_rxnfc crxnfc; memset(&crxnfc, 0, sizeof(crxnfc)); crxnfc = (struct compat_ethtool_rxnfc) { .cmd = rxnfc->cmd, .flow_type = rxnfc->flow_type, .data = rxnfc->data, .fs = { .flow_type = rxnfc->fs.flow_type, .h_u = rxnfc->fs.h_u, .h_ext = rxnfc->fs.h_ext, .m_u = rxnfc->fs.m_u, .m_ext = rxnfc->fs.m_ext, .ring_cookie = rxnfc->fs.ring_cookie, .location = rxnfc->fs.location, }, .rule_cnt = rxnfc->rule_cnt, }; if (copy_to_user(useraddr, &crxnfc, min(size, sizeof(crxnfc)))) return -EFAULT; return 0; } static int ethtool_rxnfc_copy_struct(u32 cmd, struct ethtool_rxnfc *info, size_t *info_size, void __user *useraddr) { /* struct ethtool_rxnfc was originally defined for * ETHTOOL_{G,S}RXFH with only the cmd, flow_type and data * members. User-space might still be using that * definition. */ if (cmd == ETHTOOL_GRXFH || cmd == ETHTOOL_SRXFH) *info_size = (offsetof(struct ethtool_rxnfc, data) + sizeof(info->data)); if (ethtool_rxnfc_copy_from_user(info, useraddr, *info_size)) return -EFAULT; if ((cmd == ETHTOOL_GRXFH || cmd == ETHTOOL_SRXFH) && info->flow_type & FLOW_RSS) { *info_size = sizeof(*info); if (ethtool_rxnfc_copy_from_user(info, useraddr, *info_size)) return -EFAULT; /* Since malicious users may modify the original data, * we need to check whether FLOW_RSS is still requested. */ if (!(info->flow_type & FLOW_RSS)) return -EINVAL; } if (info->cmd != cmd) return -EINVAL; return 0; } static int ethtool_rxnfc_copy_to_user(void __user *useraddr, const struct ethtool_rxnfc *rxnfc, size_t size, const u32 *rule_buf) { int ret; if (compat_need_64bit_alignment_fixup()) { ret = ethtool_rxnfc_copy_to_compat(useraddr, rxnfc, size, rule_buf); useraddr += offsetof(struct compat_ethtool_rxnfc, rule_locs); } else { ret = copy_to_user(useraddr, rxnfc, size); useraddr += offsetof(struct ethtool_rxnfc, rule_locs); } if (ret) return -EFAULT; if (rule_buf) { if (copy_to_user(useraddr, rule_buf, rxnfc->rule_cnt * sizeof(u32))) return -EFAULT; } return 0; } static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, u32 cmd, void __user *useraddr) { const struct ethtool_ops *ops = dev->ethtool_ops; struct ethtool_rxnfc info; size_t info_size = sizeof(info); int rc; if (!ops->set_rxnfc) return -EOPNOTSUPP; rc = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr); if (rc) return rc; if (ops->get_rxfh) { struct ethtool_rxfh_param rxfh = {}; rc = ops->get_rxfh(dev, &rxfh); if (rc) return rc; /* Sanity check: if symmetric-xor is set, then: * 1 - no other fields besides IP src/dst and/or L4 src/dst * 2 - If src is set, dst must also be set */ if ((rxfh.input_xfrm & RXH_XFRM_SYM_XOR) && ((info.data & ~(RXH_IP_SRC | RXH_IP_DST | RXH_L4_B_0_1 | RXH_L4_B_2_3)) || (!!(info.data & RXH_IP_SRC) ^ !!(info.data & RXH_IP_DST)) || (!!(info.data & RXH_L4_B_0_1) ^ !!(info.data & RXH_L4_B_2_3)))) return -EINVAL; } rc = ops->set_rxnfc(dev, &info); if (rc) return rc; if (cmd == ETHTOOL_SRXCLSRLINS && ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, NULL)) return -EFAULT; return 0; } static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, u32 cmd, void __user *useraddr) { struct ethtool_rxnfc info; size_t info_size = sizeof(info); const struct ethtool_ops *ops = dev->ethtool_ops; int ret; void *rule_buf = NULL; if (!ops->get_rxnfc) return -EOPNOTSUPP; ret = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr); if (ret) return ret; if (info.cmd == ETHTOOL_GRXCLSRLALL) { if (info.rule_cnt > 0) { if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32)) rule_buf = kcalloc(info.rule_cnt, sizeof(u32), GFP_USER); if (!rule_buf) return -ENOMEM; } } ret = ops->get_rxnfc(dev, &info, rule_buf); if (ret < 0) goto err_out; ret = ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, rule_buf); err_out: kfree(rule_buf); return ret; } static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr, struct ethtool_rxnfc *rx_rings, u32 size) { int i; if (copy_from_user(indir, useraddr, array_size(size, sizeof(indir[0])))) return -EFAULT; /* Validate ring indices */ for (i = 0; i < size; i++) if (indir[i] >= rx_rings->data) return -EINVAL; return 0; } u8 netdev_rss_key[NETDEV_RSS_KEY_LEN] __read_mostly; void netdev_rss_key_fill(void *buffer, size_t len) { BUG_ON(len > sizeof(netdev_rss_key)); net_get_random_once(netdev_rss_key, sizeof(netdev_rss_key)); memcpy(buffer, netdev_rss_key, len); } EXPORT_SYMBOL(netdev_rss_key_fill); static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, void __user *useraddr) { struct ethtool_rxfh_param rxfh = {}; u32 user_size; int ret; if (!dev->ethtool_ops->get_rxfh_indir_size || !dev->ethtool_ops->get_rxfh) return -EOPNOTSUPP; rxfh.indir_size = dev->ethtool_ops->get_rxfh_indir_size(dev); if (rxfh.indir_size == 0) return -EOPNOTSUPP; if (copy_from_user(&user_size, useraddr + offsetof(struct ethtool_rxfh_indir, size), sizeof(user_size))) return -EFAULT; if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh_indir, size), &rxfh.indir_size, sizeof(rxfh.indir_size))) return -EFAULT; /* If the user buffer size is 0, this is just a query for the * device table size. Otherwise, if it's smaller than the * device table size it's an error. */ if (user_size < rxfh.indir_size) return user_size == 0 ? 0 : -EINVAL; rxfh.indir = kcalloc(rxfh.indir_size, sizeof(rxfh.indir[0]), GFP_USER); if (!rxfh.indir) return -ENOMEM; ret = dev->ethtool_ops->get_rxfh(dev, &rxfh); if (ret) goto out; if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh_indir, ring_index[0]), rxfh.indir, rxfh.indir_size * sizeof(*rxfh.indir))) ret = -EFAULT; out: kfree(rxfh.indir); return ret; } static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, void __user *useraddr) { const struct ethtool_ops *ops = dev->ethtool_ops; struct ethtool_rxfh_param rxfh_dev = {}; struct netlink_ext_ack *extack = NULL; struct ethtool_rxnfc rx_rings; u32 user_size, i; int ret; u32 ringidx_offset = offsetof(struct ethtool_rxfh_indir, ring_index[0]); if (!ops->get_rxfh_indir_size || !ops->set_rxfh || !ops->get_rxnfc) return -EOPNOTSUPP; rxfh_dev.indir_size = ops->get_rxfh_indir_size(dev); if (rxfh_dev.indir_size == 0) return -EOPNOTSUPP; if (copy_from_user(&user_size, useraddr + offsetof(struct ethtool_rxfh_indir, size), sizeof(user_size))) return -EFAULT; if (user_size != 0 && user_size != rxfh_dev.indir_size) return -EINVAL; rxfh_dev.indir = kcalloc(rxfh_dev.indir_size, sizeof(rxfh_dev.indir[0]), GFP_USER); if (!rxfh_dev.indir) return -ENOMEM; rx_rings.cmd = ETHTOOL_GRXRINGS; ret = ops->get_rxnfc(dev, &rx_rings, NULL); if (ret) goto out; if (user_size == 0) { u32 *indir = rxfh_dev.indir; for (i = 0; i < rxfh_dev.indir_size; i++) indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); } else { ret = ethtool_copy_validate_indir(rxfh_dev.indir, useraddr + ringidx_offset, &rx_rings, rxfh_dev.indir_size); if (ret) goto out; } rxfh_dev.hfunc = ETH_RSS_HASH_NO_CHANGE; ret = ops->set_rxfh(dev, &rxfh_dev, extack); if (ret) goto out; /* indicate whether rxfh was set to default */ if (user_size == 0) dev->priv_flags &= ~IFF_RXFH_CONFIGURED; else dev->priv_flags |= IFF_RXFH_CONFIGURED; out: kfree(rxfh_dev.indir); return ret; } static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, void __user *useraddr) { const struct ethtool_ops *ops = dev->ethtool_ops; struct ethtool_rxfh_param rxfh_dev = {}; u32 user_indir_size, user_key_size; struct ethtool_rxfh rxfh; u32 indir_bytes; u8 *rss_config; u32 total_size; int ret; if (!ops->get_rxfh) return -EOPNOTSUPP; if (ops->get_rxfh_indir_size) rxfh_dev.indir_size = ops->get_rxfh_indir_size(dev); if (ops->get_rxfh_key_size) rxfh_dev.key_size = ops->get_rxfh_key_size(dev); if (copy_from_user(&rxfh, useraddr, sizeof(rxfh))) return -EFAULT; user_indir_size = rxfh.indir_size; user_key_size = rxfh.key_size; /* Check that reserved fields are 0 for now */ if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd32) return -EINVAL; /* Most drivers don't handle rss_context, check it's 0 as well */ if (rxfh.rss_context && !ops->cap_rss_ctx_supported) return -EOPNOTSUPP; rxfh.indir_size = rxfh_dev.indir_size; rxfh.key_size = rxfh_dev.key_size; if (copy_to_user(useraddr, &rxfh, sizeof(rxfh))) return -EFAULT; if ((user_indir_size && user_indir_size != rxfh_dev.indir_size) || (user_key_size && user_key_size != rxfh_dev.key_size)) return -EINVAL; indir_bytes = user_indir_size * sizeof(rxfh_dev.indir[0]); total_size = indir_bytes + user_key_size; rss_config = kzalloc(total_size, GFP_USER); if (!rss_config) return -ENOMEM; if (user_indir_size) rxfh_dev.indir = (u32 *)rss_config; if (user_key_size) rxfh_dev.key = rss_config + indir_bytes; rxfh_dev.rss_context = rxfh.rss_context; ret = dev->ethtool_ops->get_rxfh(dev, &rxfh_dev); if (ret) goto out; if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, hfunc), &rxfh_dev.hfunc, sizeof(rxfh.hfunc))) { ret = -EFAULT; } else if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, input_xfrm), &rxfh_dev.input_xfrm, sizeof(rxfh.input_xfrm))) { ret = -EFAULT; } else if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, rss_config[0]), rss_config, total_size)) { ret = -EFAULT; } out: kfree(rss_config); return ret; } static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, void __user *useraddr) { u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]); const struct ethtool_ops *ops = dev->ethtool_ops; u32 dev_indir_size = 0, dev_key_size = 0, i; struct ethtool_rxfh_param rxfh_dev = {}; struct netlink_ext_ack *extack = NULL; struct ethtool_rxnfc rx_rings; struct ethtool_rxfh rxfh; u32 indir_bytes = 0; u8 *rss_config; int ret; if (!ops->get_rxnfc || !ops->set_rxfh) return -EOPNOTSUPP; if (ops->get_rxfh_indir_size) dev_indir_size = ops->get_rxfh_indir_size(dev); if (ops->get_rxfh_key_size) dev_key_size = ops->get_rxfh_key_size(dev); if (copy_from_user(&rxfh, useraddr, sizeof(rxfh))) return -EFAULT; /* Check that reserved fields are 0 for now */ if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd32) return -EINVAL; /* Most drivers don't handle rss_context, check it's 0 as well */ if (rxfh.rss_context && !ops->cap_rss_ctx_supported) return -EOPNOTSUPP; /* Check input data transformation capabilities */ if (rxfh.input_xfrm && rxfh.input_xfrm != RXH_XFRM_SYM_XOR && rxfh.input_xfrm != RXH_XFRM_NO_CHANGE) return -EINVAL; if ((rxfh.input_xfrm & RXH_XFRM_SYM_XOR) && !ops->cap_rss_sym_xor_supported) return -EOPNOTSUPP; /* If either indir, hash key or function is valid, proceed further. * Must request at least one change: indir size, hash key, function * or input transformation. */ if ((rxfh.indir_size && rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE && rxfh.indir_size != dev_indir_size) || (rxfh.key_size && (rxfh.key_size != dev_key_size)) || (rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE && rxfh.key_size == 0 && rxfh.hfunc == ETH_RSS_HASH_NO_CHANGE && rxfh.input_xfrm == RXH_XFRM_NO_CHANGE)) return -EINVAL; if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) indir_bytes = dev_indir_size * sizeof(rxfh_dev.indir[0]); rss_config = kzalloc(indir_bytes + rxfh.key_size, GFP_USER); if (!rss_config) return -ENOMEM; rx_rings.cmd = ETHTOOL_GRXRINGS; ret = ops->get_rxnfc(dev, &rx_rings, NULL); if (ret) goto out; /* rxfh.indir_size == 0 means reset the indir table to default (master * context) or delete the context (other RSS contexts). * rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE means leave it unchanged. */ if (rxfh.indir_size && rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) { rxfh_dev.indir = (u32 *)rss_config; rxfh_dev.indir_size = dev_indir_size; ret = ethtool_copy_validate_indir(rxfh_dev.indir, useraddr + rss_cfg_offset, &rx_rings, rxfh.indir_size); if (ret) goto out; } else if (rxfh.indir_size == 0) { if (rxfh.rss_context == 0) { u32 *indir; rxfh_dev.indir = (u32 *)rss_config; rxfh_dev.indir_size = dev_indir_size; indir = rxfh_dev.indir; for (i = 0; i < dev_indir_size; i++) indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); } else { rxfh_dev.rss_delete = true; } } if (rxfh.key_size) { rxfh_dev.key_size = dev_key_size; rxfh_dev.key = rss_config + indir_bytes; if (copy_from_user(rxfh_dev.key, useraddr + rss_cfg_offset + indir_bytes, rxfh.key_size)) { ret = -EFAULT; goto out; } } rxfh_dev.hfunc = rxfh.hfunc; rxfh_dev.rss_context = rxfh.rss_context; rxfh_dev.input_xfrm = rxfh.input_xfrm; ret = ops->set_rxfh(dev, &rxfh_dev, extack); if (ret) goto out; if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, rss_context), &rxfh_dev.rss_context, sizeof(rxfh_dev.rss_context))) ret = -EFAULT; if (!rxfh_dev.rss_context) { /* indicate whether rxfh was set to default */ if (rxfh.indir_size == 0) dev->priv_flags &= ~IFF_RXFH_CONFIGURED; else if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) dev->priv_flags |= IFF_RXFH_CONFIGURED; } out: kfree(rss_config); return ret; } static int ethtool_get_regs(struct net_device *dev, char __user *useraddr) { struct ethtool_regs regs; const struct ethtool_ops *ops = dev->ethtool_ops; void *regbuf; int reglen, ret; if (!ops->get_regs || !ops->get_regs_len) return -EOPNOTSUPP; if (copy_from_user(&regs, useraddr, sizeof(regs))) return -EFAULT; reglen = ops->get_regs_len(dev); if (reglen <= 0) return reglen; if (regs.len > reglen) regs.len = reglen; regbuf = vzalloc(reglen); if (!regbuf) return -ENOMEM; if (regs.len < reglen) reglen = regs.len; ops->get_regs(dev, &regs, regbuf); ret = -EFAULT; if (copy_to_user(useraddr, &regs, sizeof(regs))) goto out; useraddr += offsetof(struct ethtool_regs, data); if (copy_to_user(useraddr, regbuf, reglen)) goto out; ret = 0; out: vfree(regbuf); return ret; } static int ethtool_reset(struct net_device *dev, char __user *useraddr) { struct ethtool_value reset; int ret; if (!dev->ethtool_ops->reset) return -EOPNOTSUPP; if (copy_from_user(&reset, useraddr, sizeof(reset))) return -EFAULT; ret = dev->ethtool_ops->reset(dev, &reset.data); if (ret) return ret; if (copy_to_user(useraddr, &reset, sizeof(reset))) return -EFAULT; return 0; } static int ethtool_get_wol(struct net_device *dev, char __user *useraddr) { struct ethtool_wolinfo wol; if (!dev->ethtool_ops->get_wol) return -EOPNOTSUPP; memset(&wol, 0, sizeof(struct ethtool_wolinfo)); wol.cmd = ETHTOOL_GWOL; dev->ethtool_ops->get_wol(dev, &wol); if (copy_to_user(useraddr, &wol, sizeof(wol))) return -EFAULT; return 0; } static int ethtool_set_wol(struct net_device *dev, char __user *useraddr) { struct ethtool_wolinfo wol, cur_wol; int ret; if (!dev->ethtool_ops->get_wol || !dev->ethtool_ops->set_wol) return -EOPNOTSUPP; memset(&cur_wol, 0, sizeof(struct ethtool_wolinfo)); cur_wol.cmd = ETHTOOL_GWOL; dev->ethtool_ops->get_wol(dev, &cur_wol); if (copy_from_user(&wol, useraddr, sizeof(wol))) return -EFAULT; if (wol.wolopts & ~cur_wol.supported) return -EINVAL; if (wol.wolopts == cur_wol.wolopts && !memcmp(wol.sopass, cur_wol.sopass, sizeof(wol.sopass))) return 0; ret = dev->ethtool_ops->set_wol(dev, &wol); if (ret) return ret; dev->wol_enabled = !!wol.wolopts; ethtool_notify(dev, ETHTOOL_MSG_WOL_NTF, NULL); return 0; } static void eee_to_keee(struct ethtool_keee *keee, const struct ethtool_eee *eee) { memset(keee, 0, sizeof(*keee)); keee->supported_u32 = eee->supported; keee->advertised_u32 = eee->advertised; keee->lp_advertised_u32 = eee->lp_advertised; keee->eee_active = eee->eee_active; keee->eee_enabled = eee->eee_enabled; keee->tx_lpi_enabled = eee->tx_lpi_enabled; keee->tx_lpi_timer = eee->tx_lpi_timer; ethtool_convert_legacy_u32_to_link_mode(keee->supported, eee->supported); ethtool_convert_legacy_u32_to_link_mode(keee->advertised, eee->advertised); ethtool_convert_legacy_u32_to_link_mode(keee->lp_advertised, eee->lp_advertised); } static void keee_to_eee(struct ethtool_eee *eee, const struct ethtool_keee *keee) { memset(eee, 0, sizeof(*eee)); eee->eee_active = keee->eee_active; eee->eee_enabled = keee->eee_enabled; eee->tx_lpi_enabled = keee->tx_lpi_enabled; eee->tx_lpi_timer = keee->tx_lpi_timer; if (ethtool_eee_use_linkmodes(keee)) { bool overflow; overflow = !ethtool_convert_link_mode_to_legacy_u32(&eee->supported, keee->supported); ethtool_convert_link_mode_to_legacy_u32(&eee->advertised, keee->advertised); ethtool_convert_link_mode_to_legacy_u32(&eee->lp_advertised, keee->lp_advertised); if (overflow) pr_warn("Ethtool ioctl interface doesn't support passing EEE linkmodes beyond bit 32\n"); } else { eee->supported = keee->supported_u32; eee->advertised = keee->advertised_u32; eee->lp_advertised = keee->lp_advertised_u32; } } static int ethtool_get_eee(struct net_device *dev, char __user *useraddr) { struct ethtool_keee keee; struct ethtool_eee eee; int rc; if (!dev->ethtool_ops->get_eee) return -EOPNOTSUPP; memset(&keee, 0, sizeof(keee)); rc = dev->ethtool_ops->get_eee(dev, &keee); if (rc) return rc; keee_to_eee(&eee, &keee); if (copy_to_user(useraddr, &eee, sizeof(eee))) return -EFAULT; return 0; } static int ethtool_set_eee(struct net_device *dev, char __user *useraddr) { struct ethtool_keee keee; struct ethtool_eee eee; int ret; if (!dev->ethtool_ops->set_eee) return -EOPNOTSUPP; if (copy_from_user(&eee, useraddr, sizeof(eee))) return -EFAULT; eee_to_keee(&keee, &eee); ret = dev->ethtool_ops->set_eee(dev, &keee); if (!ret) ethtool_notify(dev, ETHTOOL_MSG_EEE_NTF, NULL); return ret; } static int ethtool_nway_reset(struct net_device *dev) { if (!dev->ethtool_ops->nway_reset) return -EOPNOTSUPP; return dev->ethtool_ops->nway_reset(dev); } static int ethtool_get_link(struct net_device *dev, char __user *useraddr) { struct ethtool_value edata = { .cmd = ETHTOOL_GLINK }; int link = __ethtool_get_link(dev); if (link < 0) return link; edata.data = link; if (copy_to_user(useraddr, &edata, sizeof(edata))) return -EFAULT; return 0; } static int ethtool_get_any_eeprom(struct net_device *dev, void __user *useraddr, int (*getter)(struct net_device *, struct ethtool_eeprom *, u8 *), u32 total_len) { struct ethtool_eeprom eeprom; void __user *userbuf = useraddr + sizeof(eeprom); u32 bytes_remaining; u8 *data; int ret = 0; if (copy_from_user(&eeprom, useraddr, sizeof(eeprom))) return -EFAULT; /* Check for wrap and zero */ if (eeprom.offset + eeprom.len <= eeprom.offset) return -EINVAL; /* Check for exceeding total eeprom len */ if (eeprom.offset + eeprom.len > total_len) return -EINVAL; data = kzalloc(PAGE_SIZE, GFP_USER); if (!data) return -ENOMEM; bytes_remaining = eeprom.len; while (bytes_remaining > 0) { eeprom.len = min(bytes_remaining, (u32)PAGE_SIZE); ret = getter(dev, &eeprom, data); if (ret) break; if (!eeprom.len) { ret = -EIO; break; } if (copy_to_user(userbuf, data, eeprom.len)) { ret = -EFAULT; break; } userbuf += eeprom.len; eeprom.offset += eeprom.len; bytes_remaining -= eeprom.len; } eeprom.len = userbuf - (useraddr + sizeof(eeprom)); eeprom.offset -= eeprom.len; if (copy_to_user(useraddr, &eeprom, sizeof(eeprom))) ret = -EFAULT; kfree(data); return ret; } static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr) { const struct ethtool_ops *ops = dev->ethtool_ops; if (!ops->get_eeprom || !ops->get_eeprom_len || !ops->get_eeprom_len(dev)) return -EOPNOTSUPP; return ethtool_get_any_eeprom(dev, useraddr, ops->get_eeprom, ops->get_eeprom_len(dev)); } static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr) { struct ethtool_eeprom eeprom; const struct ethtool_ops *ops = dev->ethtool_ops; void __user *userbuf = useraddr + sizeof(eeprom); u32 bytes_remaining; u8 *data; int ret = 0; if (!ops->set_eeprom || !ops->get_eeprom_len || !ops->get_eeprom_len(dev)) return -EOPNOTSUPP; if (copy_from_user(&eeprom, useraddr, sizeof(eeprom))) return -EFAULT; /* Check for wrap and zero */ if (eeprom.offset + eeprom.len <= eeprom.offset) return -EINVAL; /* Check for exceeding total eeprom len */ if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev)) return -EINVAL; data = kzalloc(PAGE_SIZE, GFP_USER); if (!data) return -ENOMEM; bytes_remaining = eeprom.len; while (bytes_remaining > 0) { eeprom.len = min(bytes_remaining, (u32)PAGE_SIZE); if (copy_from_user(data, userbuf, eeprom.len)) { ret = -EFAULT; break; } ret = ops->set_eeprom(dev, &eeprom, data); if (ret) break; userbuf += eeprom.len; eeprom.offset += eeprom.len; bytes_remaining -= eeprom.len; } kfree(data); return ret; } static noinline_for_stack int ethtool_get_coalesce(struct net_device *dev, void __user *useraddr) { struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE }; struct kernel_ethtool_coalesce kernel_coalesce = {}; int ret; if (!dev->ethtool_ops->get_coalesce) return -EOPNOTSUPP; ret = dev->ethtool_ops->get_coalesce(dev, &coalesce, &kernel_coalesce, NULL); if (ret) return ret; if (copy_to_user(useraddr, &coalesce, sizeof(coalesce))) return -EFAULT; return 0; } static bool ethtool_set_coalesce_supported(struct net_device *dev, struct ethtool_coalesce *coalesce) { u32 supported_params = dev->ethtool_ops->supported_coalesce_params; u32 nonzero_params = 0; if (coalesce->rx_coalesce_usecs) nonzero_params |= ETHTOOL_COALESCE_RX_USECS; if (coalesce->rx_max_coalesced_frames) nonzero_params |= ETHTOOL_COALESCE_RX_MAX_FRAMES; if (coalesce->rx_coalesce_usecs_irq) nonzero_params |= ETHTOOL_COALESCE_RX_USECS_IRQ; if (coalesce->rx_max_coalesced_frames_irq) nonzero_params |= ETHTOOL_COALESCE_RX_MAX_FRAMES_IRQ; if (coalesce->tx_coalesce_usecs) nonzero_params |= ETHTOOL_COALESCE_TX_USECS; if (coalesce->tx_max_coalesced_frames) nonzero_params |= ETHTOOL_COALESCE_TX_MAX_FRAMES; if (coalesce->tx_coalesce_usecs_irq) nonzero_params |= ETHTOOL_COALESCE_TX_USECS_IRQ; if (coalesce->tx_max_coalesced_frames_irq) nonzero_params |= ETHTOOL_COALESCE_TX_MAX_FRAMES_IRQ; if (coalesce->stats_block_coalesce_usecs) nonzero_params |= ETHTOOL_COALESCE_STATS_BLOCK_USECS; if (coalesce->use_adaptive_rx_coalesce) nonzero_params |= ETHTOOL_COALESCE_USE_ADAPTIVE_RX; if (coalesce->use_adaptive_tx_coalesce) nonzero_params |= ETHTOOL_COALESCE_USE_ADAPTIVE_TX; if (coalesce->pkt_rate_low) nonzero_params |= ETHTOOL_COALESCE_PKT_RATE_LOW; if (coalesce->rx_coalesce_usecs_low) nonzero_params |= ETHTOOL_COALESCE_RX_USECS_LOW; if (coalesce->rx_max_coalesced_frames_low) nonzero_params |= ETHTOOL_COALESCE_RX_MAX_FRAMES_LOW; if (coalesce->tx_coalesce_usecs_low) nonzero_params |= ETHTOOL_COALESCE_TX_USECS_LOW; if (coalesce->tx_max_coalesced_frames_low) nonzero_params |= ETHTOOL_COALESCE_TX_MAX_FRAMES_LOW; if (coalesce->pkt_rate_high) nonzero_params |= ETHTOOL_COALESCE_PKT_RATE_HIGH; if (coalesce->rx_coalesce_usecs_high) nonzero_params |= ETHTOOL_COALESCE_RX_USECS_HIGH; if (coalesce->rx_max_coalesced_frames_high) nonzero_params |= ETHTOOL_COALESCE_RX_MAX_FRAMES_HIGH; if (coalesce->tx_coalesce_usecs_high) nonzero_params |= ETHTOOL_COALESCE_TX_USECS_HIGH; if (coalesce->tx_max_coalesced_frames_high) nonzero_params |= ETHTOOL_COALESCE_TX_MAX_FRAMES_HIGH; if (coalesce->rate_sample_interval) nonzero_params |= ETHTOOL_COALESCE_RATE_SAMPLE_INTERVAL; return (supported_params & nonzero_params) == nonzero_params; } static noinline_for_stack int ethtool_set_coalesce(struct net_device *dev, void __user *useraddr) { struct kernel_ethtool_coalesce kernel_coalesce = {}; struct ethtool_coalesce coalesce; int ret; if (!dev->ethtool_ops->set_coalesce || !dev->ethtool_ops->get_coalesce) return -EOPNOTSUPP; ret = dev->ethtool_ops->get_coalesce(dev, &coalesce, &kernel_coalesce, NULL); if (ret) return ret; if (copy_from_user(&coalesce, useraddr, sizeof(coalesce))) return -EFAULT; if (!ethtool_set_coalesce_supported(dev, &coalesce)) return -EOPNOTSUPP; ret = dev->ethtool_ops->set_coalesce(dev, &coalesce, &kernel_coalesce, NULL); if (!ret) ethtool_notify(dev, ETHTOOL_MSG_COALESCE_NTF, NULL); return ret; } static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr) { struct ethtool_ringparam ringparam = { .cmd = ETHTOOL_GRINGPARAM }; struct kernel_ethtool_ringparam kernel_ringparam = {}; if (!dev->ethtool_ops->get_ringparam) return -EOPNOTSUPP; dev->ethtool_ops->get_ringparam(dev, &ringparam, &kernel_ringparam, NULL); if (copy_to_user(useraddr, &ringparam, sizeof(ringparam))) return -EFAULT; return 0; } static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr) { struct ethtool_ringparam ringparam, max = { .cmd = ETHTOOL_GRINGPARAM }; struct kernel_ethtool_ringparam kernel_ringparam; int ret; if (!dev->ethtool_ops->set_ringparam || !dev->ethtool_ops->get_ringparam) return -EOPNOTSUPP; if (copy_from_user(&ringparam, useraddr, sizeof(ringparam))) return -EFAULT; dev->ethtool_ops->get_ringparam(dev, &max, &kernel_ringparam, NULL); /* ensure new ring parameters are within the maximums */ if (ringparam.rx_pending > max.rx_max_pending || ringparam.rx_mini_pending > max.rx_mini_max_pending || ringparam.rx_jumbo_pending > max.rx_jumbo_max_pending || ringparam.tx_pending > max.tx_max_pending) return -EINVAL; ret = dev->ethtool_ops->set_ringparam(dev, &ringparam, &kernel_ringparam, NULL); if (!ret) ethtool_notify(dev, ETHTOOL_MSG_RINGS_NTF, NULL); return ret; } static noinline_for_stack int ethtool_get_channels(struct net_device *dev, void __user *useraddr) { struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS }; if (!dev->ethtool_ops->get_channels) return -EOPNOTSUPP; dev->ethtool_ops->get_channels(dev, &channels); if (copy_to_user(useraddr, &channels, sizeof(channels))) return -EFAULT; return 0; } static noinline_for_stack int ethtool_set_channels(struct net_device *dev, void __user *useraddr) { struct ethtool_channels channels, curr = { .cmd = ETHTOOL_GCHANNELS }; u16 from_channel, to_channel; u64 max_rxnfc_in_use; u32 max_rxfh_in_use; unsigned int i; int ret; if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels) return -EOPNOTSUPP; if (copy_from_user(&channels, useraddr, sizeof(channels))) return -EFAULT; dev->ethtool_ops->get_channels(dev, &curr); if (channels.rx_count == curr.rx_count && channels.tx_count == curr.tx_count && channels.combined_count == curr.combined_count && channels.other_count == curr.other_count) return 0; /* ensure new counts are within the maximums */ if (channels.rx_count > curr.max_rx || channels.tx_count > curr.max_tx || channels.combined_count > curr.max_combined || channels.other_count > curr.max_other) return -EINVAL; /* ensure there is at least one RX and one TX channel */ if (!channels.combined_count && (!channels.rx_count || !channels.tx_count)) return -EINVAL; /* ensure the new Rx count fits within the configured Rx flow * indirection table/rxnfc settings */ if (ethtool_get_max_rxnfc_channel(dev, &max_rxnfc_in_use)) max_rxnfc_in_use = 0; if (!netif_is_rxfh_configured(dev) || ethtool_get_max_rxfh_channel(dev, &max_rxfh_in_use)) max_rxfh_in_use = 0; if (channels.combined_count + channels.rx_count <= max_t(u64, max_rxnfc_in_use, max_rxfh_in_use)) return -EINVAL; /* Disabling channels, query zero-copy AF_XDP sockets */ from_channel = channels.combined_count + min(channels.rx_count, channels.tx_count); to_channel = curr.combined_count + max(curr.rx_count, curr.tx_count); for (i = from_channel; i < to_channel; i++) if (xsk_get_pool_from_qid(dev, i)) return -EINVAL; ret = dev->ethtool_ops->set_channels(dev, &channels); if (!ret) ethtool_notify(dev, ETHTOOL_MSG_CHANNELS_NTF, NULL); return ret; } static int ethtool_get_pauseparam(struct net_device *dev, void __user *useraddr) { struct ethtool_pauseparam pauseparam = { .cmd = ETHTOOL_GPAUSEPARAM }; if (!dev->ethtool_ops->get_pauseparam) return -EOPNOTSUPP; dev->ethtool_ops->get_pauseparam(dev, &pauseparam); if (copy_to_user(useraddr, &pauseparam, sizeof(pauseparam))) return -EFAULT; return 0; } static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr) { struct ethtool_pauseparam pauseparam; int ret; if (!dev->ethtool_ops->set_pauseparam) return -EOPNOTSUPP; if (copy_from_user(&pauseparam, useraddr, sizeof(pauseparam))) return -EFAULT; ret = dev->ethtool_ops->set_pauseparam(dev, &pauseparam); if (!ret) ethtool_notify(dev, ETHTOOL_MSG_PAUSE_NTF, NULL); return ret; } static int ethtool_self_test(struct net_device *dev, char __user *useraddr) { struct ethtool_test test; const struct ethtool_ops *ops = dev->ethtool_ops; u64 *data; int ret, test_len; if (!ops->self_test || !ops->get_sset_count) return -EOPNOTSUPP; test_len = ops->get_sset_count(dev, ETH_SS_TEST); if (test_len < 0) return test_len; WARN_ON(test_len == 0); if (copy_from_user(&test, useraddr, sizeof(test))) return -EFAULT; test.len = test_len; data = kcalloc(test_len, sizeof(u64), GFP_USER); if (!data) return -ENOMEM; netif_testing_on(dev); ops->self_test(dev, &test, data); netif_testing_off(dev); ret = -EFAULT; if (copy_to_user(useraddr, &test, sizeof(test))) goto out; useraddr += sizeof(test); if (copy_to_user(useraddr, data, array_size(test.len, sizeof(u64)))) goto out; ret = 0; out: kfree(data); return ret; } static int ethtool_get_strings(struct net_device *dev, void __user *useraddr) { struct ethtool_gstrings gstrings; u8 *data; int ret; if (copy_from_user(&gstrings, useraddr, sizeof(gstrings))) return -EFAULT; ret = __ethtool_get_sset_count(dev, gstrings.string_set); if (ret < 0) return ret; if (ret > S32_MAX / ETH_GSTRING_LEN) return -ENOMEM; WARN_ON_ONCE(!ret); gstrings.len = ret; if (gstrings.len) { data = vzalloc(array_size(gstrings.len, ETH_GSTRING_LEN)); if (!data) return -ENOMEM; __ethtool_get_strings(dev, gstrings.string_set, data); } else { data = NULL; } ret = -EFAULT; if (copy_to_user(useraddr, &gstrings, sizeof(gstrings))) goto out; useraddr += sizeof(gstrings); if (gstrings.len && copy_to_user(useraddr, data, array_size(gstrings.len, ETH_GSTRING_LEN))) goto out; ret = 0; out: vfree(data); return ret; } __printf(2, 3) void ethtool_sprintf(u8 **data, const char *fmt, ...) { va_list args; va_start(args, fmt); vsnprintf(*data, ETH_GSTRING_LEN, fmt, args); va_end(args); *data += ETH_GSTRING_LEN; } EXPORT_SYMBOL(ethtool_sprintf); void ethtool_puts(u8 **data, const char *str) { strscpy(*data, str, ETH_GSTRING_LEN); *data += ETH_GSTRING_LEN; } EXPORT_SYMBOL(ethtool_puts); static int ethtool_phys_id(struct net_device *dev, void __user *useraddr) { struct ethtool_value id; static bool busy; const struct ethtool_ops *ops = dev->ethtool_ops; netdevice_tracker dev_tracker; int rc; if (!ops->set_phys_id) return -EOPNOTSUPP; if (busy) return -EBUSY; if (copy_from_user(&id, useraddr, sizeof(id))) return -EFAULT; rc = ops->set_phys_id(dev, ETHTOOL_ID_ACTIVE); if (rc < 0) return rc; /* Drop the RTNL lock while waiting, but prevent reentry or * removal of the device. */ busy = true; netdev_hold(dev, &dev_tracker, GFP_KERNEL); rtnl_unlock(); if (rc == 0) { /* Driver will handle this itself */ schedule_timeout_interruptible( id.data ? (id.data * HZ) : MAX_SCHEDULE_TIMEOUT); } else { /* Driver expects to be called at twice the frequency in rc */ int n = rc * 2, interval = HZ / n; u64 count = mul_u32_u32(n, id.data); u64 i = 0; do { rtnl_lock(); rc = ops->set_phys_id(dev, (i++ & 1) ? ETHTOOL_ID_OFF : ETHTOOL_ID_ON); rtnl_unlock(); if (rc) break; schedule_timeout_interruptible(interval); } while (!signal_pending(current) && (!id.data || i < count)); } rtnl_lock(); netdev_put(dev, &dev_tracker); busy = false; (void) ops->set_phys_id(dev, ETHTOOL_ID_INACTIVE); return rc; } static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) { struct ethtool_stats stats; const struct ethtool_ops *ops = dev->ethtool_ops; u64 *data; int ret, n_stats; if (!ops->get_ethtool_stats || !ops->get_sset_count) return -EOPNOTSUPP; n_stats = ops->get_sset_count(dev, ETH_SS_STATS); if (n_stats < 0) return n_stats; if (n_stats > S32_MAX / sizeof(u64)) return -ENOMEM; WARN_ON_ONCE(!n_stats); if (copy_from_user(&stats, useraddr, sizeof(stats))) return -EFAULT; stats.n_stats = n_stats; if (n_stats) { data = vzalloc(array_size(n_stats, sizeof(u64))); if (!data) return -ENOMEM; ops->get_ethtool_stats(dev, &stats, data); } else { data = NULL; } ret = -EFAULT; if (copy_to_user(useraddr, &stats, sizeof(stats))) goto out; useraddr += sizeof(stats); if (n_stats && copy_to_user(useraddr, data, array_size(n_stats, sizeof(u64)))) goto out; ret = 0; out: vfree(data); return ret; } static int ethtool_vzalloc_stats_array(int n_stats, u64 **data) { if (n_stats < 0) return n_stats; if (n_stats > S32_MAX / sizeof(u64)) return -ENOMEM; if (WARN_ON_ONCE(!n_stats)) return -EOPNOTSUPP; *data = vzalloc(array_size(n_stats, sizeof(u64))); if (!*data) return -ENOMEM; return 0; } static int ethtool_get_phy_stats_phydev(struct phy_device *phydev, struct ethtool_stats *stats, u64 **data) { const struct ethtool_phy_ops *phy_ops = ethtool_phy_ops; int n_stats, ret; if (!phy_ops || !phy_ops->get_sset_count || !phy_ops->get_stats) return -EOPNOTSUPP; n_stats = phy_ops->get_sset_count(phydev); ret = ethtool_vzalloc_stats_array(n_stats, data); if (ret) return ret; stats->n_stats = n_stats; return phy_ops->get_stats(phydev, stats, *data); } static int ethtool_get_phy_stats_ethtool(struct net_device *dev, struct ethtool_stats *stats, u64 **data) { const struct ethtool_ops *ops = dev->ethtool_ops; int n_stats, ret; if (!ops || !ops->get_sset_count || ops->get_ethtool_phy_stats) return -EOPNOTSUPP; n_stats = ops->get_sset_count(dev, ETH_SS_PHY_STATS); ret = ethtool_vzalloc_stats_array(n_stats, data); if (ret) return ret; stats->n_stats = n_stats; ops->get_ethtool_phy_stats(dev, stats, *data); return 0; } static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) { struct phy_device *phydev = dev->phydev; struct ethtool_stats stats; u64 *data = NULL; int ret = -EOPNOTSUPP; if (copy_from_user(&stats, useraddr, sizeof(stats))) return -EFAULT; if (phydev) ret = ethtool_get_phy_stats_phydev(phydev, &stats, &data); if (ret == -EOPNOTSUPP) ret = ethtool_get_phy_stats_ethtool(dev, &stats, &data); if (ret) goto out; if (copy_to_user(useraddr, &stats, sizeof(stats))) { ret = -EFAULT; goto out; } useraddr += sizeof(stats); if (copy_to_user(useraddr, data, array_size(stats.n_stats, sizeof(u64)))) ret = -EFAULT; out: vfree(data); return ret; } static int ethtool_get_perm_addr(struct net_device *dev, void __user *useraddr) { struct ethtool_perm_addr epaddr; if (copy_from_user(&epaddr, useraddr, sizeof(epaddr))) return -EFAULT; if (epaddr.size < dev->addr_len) return -ETOOSMALL; epaddr.size = dev->addr_len; if (copy_to_user(useraddr, &epaddr, sizeof(epaddr))) return -EFAULT; useraddr += sizeof(epaddr); if (copy_to_user(useraddr, dev->perm_addr, epaddr.size)) return -EFAULT; return 0; } static int ethtool_get_value(struct net_device *dev, char __user *useraddr, u32 cmd, u32 (*actor)(struct net_device *)) { struct ethtool_value edata = { .cmd = cmd }; if (!actor) return -EOPNOTSUPP; edata.data = actor(dev); if (copy_to_user(useraddr, &edata, sizeof(edata))) return -EFAULT; return 0; } static int ethtool_set_value_void(struct net_device *dev, char __user *useraddr, void (*actor)(struct net_device *, u32)) { struct ethtool_value edata; if (!actor) return -EOPNOTSUPP; if (copy_from_user(&edata, useraddr, sizeof(edata))) return -EFAULT; actor(dev, edata.data); return 0; } static int ethtool_set_value(struct net_device *dev, char __user *useraddr, int (*actor)(struct net_device *, u32)) { struct ethtool_value edata; if (!actor) return -EOPNOTSUPP; if (copy_from_user(&edata, useraddr, sizeof(edata))) return -EFAULT; return actor(dev, edata.data); } static int ethtool_flash_device(struct net_device *dev, struct ethtool_devlink_compat *req) { if (!dev->ethtool_ops->flash_device) { req->devlink = netdev_to_devlink_get(dev); return 0; } return dev->ethtool_ops->flash_device(dev, &req->efl); } static int ethtool_set_dump(struct net_device *dev, void __user *useraddr) { struct ethtool_dump dump; if (!dev->ethtool_ops->set_dump) return -EOPNOTSUPP; if (copy_from_user(&dump, useraddr, sizeof(dump))) return -EFAULT; return dev->ethtool_ops->set_dump(dev, &dump); } static int ethtool_get_dump_flag(struct net_device *dev, void __user *useraddr) { int ret; struct ethtool_dump dump; const struct ethtool_ops *ops = dev->ethtool_ops; if (!ops->get_dump_flag) return -EOPNOTSUPP; if (copy_from_user(&dump, useraddr, sizeof(dump))) return -EFAULT; ret = ops->get_dump_flag(dev, &dump); if (ret) return ret; if (copy_to_user(useraddr, &dump, sizeof(dump))) return -EFAULT; return 0; } static int ethtool_get_dump_data(struct net_device *dev, void __user *useraddr) { int ret; __u32 len; struct ethtool_dump dump, tmp; const struct ethtool_ops *ops = dev->ethtool_ops; void *data = NULL; if (!ops->get_dump_data || !ops->get_dump_flag) return -EOPNOTSUPP; if (copy_from_user(&dump, useraddr, sizeof(dump))) return -EFAULT; memset(&tmp, 0, sizeof(tmp)); tmp.cmd = ETHTOOL_GET_DUMP_FLAG; ret = ops->get_dump_flag(dev, &tmp); if (ret) return ret; len = min(tmp.len, dump.len); if (!len) return -EFAULT; /* Don't ever let the driver think there's more space available * than it requested with .get_dump_flag(). */ dump.len = len; /* Always allocate enough space to hold the whole thing so that the * driver does not need to check the length and bother with partial * dumping. */ data = vzalloc(tmp.len); if (!data) return -ENOMEM; ret = ops->get_dump_data(dev, &dump, data); if (ret) goto out; /* There are two sane possibilities: * 1. The driver's .get_dump_data() does not touch dump.len. * 2. Or it may set dump.len to how much it really writes, which * should be tmp.len (or len if it can do a partial dump). * In any case respond to userspace with the actual length of data * it's receiving. */ WARN_ON(dump.len != len && dump.len != tmp.len); dump.len = len; if (copy_to_user(useraddr, &dump, sizeof(dump))) { ret = -EFAULT; goto out; } useraddr += offsetof(struct ethtool_dump, data); if (copy_to_user(useraddr, data, len)) ret = -EFAULT; out: vfree(data); return ret; } static int ethtool_get_ts_info(struct net_device *dev, void __user *useraddr) { struct ethtool_ts_info info; int err; err = __ethtool_get_ts_info(dev, &info); if (err) return err; if (copy_to_user(useraddr, &info, sizeof(info))) return -EFAULT; return 0; } int ethtool_get_module_info_call(struct net_device *dev, struct ethtool_modinfo *modinfo) { const struct ethtool_ops *ops = dev->ethtool_ops; struct phy_device *phydev = dev->phydev; if (dev->sfp_bus) return sfp_get_module_info(dev->sfp_bus, modinfo); if (phydev && phydev->drv && phydev->drv->module_info) return phydev->drv->module_info(phydev, modinfo); if (ops->get_module_info) return ops->get_module_info(dev, modinfo); return -EOPNOTSUPP; } static int ethtool_get_module_info(struct net_device *dev, void __user *useraddr) { int ret; struct ethtool_modinfo modinfo; if (copy_from_user(&modinfo, useraddr, sizeof(modinfo))) return -EFAULT; ret = ethtool_get_module_info_call(dev, &modinfo); if (ret) return ret; if (copy_to_user(useraddr, &modinfo, sizeof(modinfo))) return -EFAULT; return 0; } int ethtool_get_module_eeprom_call(struct net_device *dev, struct ethtool_eeprom *ee, u8 *data) { const struct ethtool_ops *ops = dev->ethtool_ops; struct phy_device *phydev = dev->phydev; if (dev->sfp_bus) return sfp_get_module_eeprom(dev->sfp_bus, ee, data); if (phydev && phydev->drv && phydev->drv->module_eeprom) return phydev->drv->module_eeprom(phydev, ee, data); if (ops->get_module_eeprom) return ops->get_module_eeprom(dev, ee, data); return -EOPNOTSUPP; } static int ethtool_get_module_eeprom(struct net_device *dev, void __user *useraddr) { int ret; struct ethtool_modinfo modinfo; ret = ethtool_get_module_info_call(dev, &modinfo); if (ret) return ret; return ethtool_get_any_eeprom(dev, useraddr, ethtool_get_module_eeprom_call, modinfo.eeprom_len); } static int ethtool_tunable_valid(const struct ethtool_tunable *tuna) { switch (tuna->id) { case ETHTOOL_RX_COPYBREAK: case ETHTOOL_TX_COPYBREAK: case ETHTOOL_TX_COPYBREAK_BUF_SIZE: if (tuna->len != sizeof(u32) || tuna->type_id != ETHTOOL_TUNABLE_U32) return -EINVAL; break; case ETHTOOL_PFC_PREVENTION_TOUT: if (tuna->len != sizeof(u16) || tuna->type_id != ETHTOOL_TUNABLE_U16) return -EINVAL; break; default: return -EINVAL; } return 0; } static int ethtool_get_tunable(struct net_device *dev, void __user *useraddr) { int ret; struct ethtool_tunable tuna; const struct ethtool_ops *ops = dev->ethtool_ops; void *data; if (!ops->get_tunable) return -EOPNOTSUPP; if (copy_from_user(&tuna, useraddr, sizeof(tuna))) return -EFAULT; ret = ethtool_tunable_valid(&tuna); if (ret) return ret; data = kzalloc(tuna.len, GFP_USER); if (!data) return -ENOMEM; ret = ops->get_tunable(dev, &tuna, data); if (ret) goto out; useraddr += sizeof(tuna); ret = -EFAULT; if (copy_to_user(useraddr, data, tuna.len)) goto out; ret = 0; out: kfree(data); return ret; } static int ethtool_set_tunable(struct net_device *dev, void __user *useraddr) { int ret; struct ethtool_tunable tuna; const struct ethtool_ops *ops = dev->ethtool_ops; void *data; if (!ops->set_tunable) return -EOPNOTSUPP; if (copy_from_user(&tuna, useraddr, sizeof(tuna))) return -EFAULT; ret = ethtool_tunable_valid(&tuna); if (ret) return ret; useraddr += sizeof(tuna); data = memdup_user(useraddr, tuna.len); if (IS_ERR(data)) return PTR_ERR(data); ret = ops->set_tunable(dev, &tuna, data); kfree(data); return ret; } static noinline_for_stack int ethtool_get_per_queue_coalesce(struct net_device *dev, void __user *useraddr, struct ethtool_per_queue_op *per_queue_opt) { u32 bit; int ret; DECLARE_BITMAP(queue_mask, MAX_NUM_QUEUE); if (!dev->ethtool_ops->get_per_queue_coalesce) return -EOPNOTSUPP; useraddr += sizeof(*per_queue_opt); bitmap_from_arr32(queue_mask, per_queue_opt->queue_mask, MAX_NUM_QUEUE); for_each_set_bit(bit, queue_mask, MAX_NUM_QUEUE) { struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE }; ret = dev->ethtool_ops->get_per_queue_coalesce(dev, bit, &coalesce); if (ret != 0) return ret; if (copy_to_user(useraddr, &coalesce, sizeof(coalesce))) return -EFAULT; useraddr += sizeof(coalesce); } return 0; } static noinline_for_stack int ethtool_set_per_queue_coalesce(struct net_device *dev, void __user *useraddr, struct ethtool_per_queue_op *per_queue_opt) { u32 bit; int i, ret = 0; int n_queue; struct ethtool_coalesce *backup = NULL, *tmp = NULL; DECLARE_BITMAP(queue_mask, MAX_NUM_QUEUE); if ((!dev->ethtool_ops->set_per_queue_coalesce) || (!dev->ethtool_ops->get_per_queue_coalesce)) return -EOPNOTSUPP; useraddr += sizeof(*per_queue_opt); bitmap_from_arr32(queue_mask, per_queue_opt->queue_mask, MAX_NUM_QUEUE); n_queue = bitmap_weight(queue_mask, MAX_NUM_QUEUE); tmp = backup = kmalloc_array(n_queue, sizeof(*backup), GFP_KERNEL); if (!backup) return -ENOMEM; for_each_set_bit(bit, queue_mask, MAX_NUM_QUEUE) { struct ethtool_coalesce coalesce; ret = dev->ethtool_ops->get_per_queue_coalesce(dev, bit, tmp); if (ret != 0) goto roll_back; tmp++; if (copy_from_user(&coalesce, useraddr, sizeof(coalesce))) { ret = -EFAULT; goto roll_back; } if (!ethtool_set_coalesce_supported(dev, &coalesce)) { ret = -EOPNOTSUPP; goto roll_back; } ret = dev->ethtool_ops->set_per_queue_coalesce(dev, bit, &coalesce); if (ret != 0) goto roll_back; useraddr += sizeof(coalesce); } roll_back: if (ret != 0) { tmp = backup; for_each_set_bit(i, queue_mask, bit) { dev->ethtool_ops->set_per_queue_coalesce(dev, i, tmp); tmp++; } } kfree(backup); return ret; } static int noinline_for_stack ethtool_set_per_queue(struct net_device *dev, void __user *useraddr, u32 sub_cmd) { struct ethtool_per_queue_op per_queue_opt; if (copy_from_user(&per_queue_opt, useraddr, sizeof(per_queue_opt))) return -EFAULT; if (per_queue_opt.sub_command != sub_cmd) return -EINVAL; switch (per_queue_opt.sub_command) { case ETHTOOL_GCOALESCE: return ethtool_get_per_queue_coalesce(dev, useraddr, &per_queue_opt); case ETHTOOL_SCOALESCE: return ethtool_set_per_queue_coalesce(dev, useraddr, &per_queue_opt); default: return -EOPNOTSUPP; } } static int ethtool_phy_tunable_valid(const struct ethtool_tunable *tuna) { switch (tuna->id) { case ETHTOOL_PHY_DOWNSHIFT: case ETHTOOL_PHY_FAST_LINK_DOWN: if (tuna->len != sizeof(u8) || tuna->type_id != ETHTOOL_TUNABLE_U8) return -EINVAL; break; case ETHTOOL_PHY_EDPD: if (tuna->len != sizeof(u16) || tuna->type_id != ETHTOOL_TUNABLE_U16) return -EINVAL; break; default: return -EINVAL; } return 0; } static int get_phy_tunable(struct net_device *dev, void __user *useraddr) { struct phy_device *phydev = dev->phydev; struct ethtool_tunable tuna; bool phy_drv_tunable; void *data; int ret; phy_drv_tunable = phydev && phydev->drv && phydev->drv->get_tunable; if (!phy_drv_tunable && !dev->ethtool_ops->get_phy_tunable) return -EOPNOTSUPP; if (copy_from_user(&tuna, useraddr, sizeof(tuna))) return -EFAULT; ret = ethtool_phy_tunable_valid(&tuna); if (ret) return ret; data = kzalloc(tuna.len, GFP_USER); if (!data) return -ENOMEM; if (phy_drv_tunable) { mutex_lock(&phydev->lock); ret = phydev->drv->get_tunable(phydev, &tuna, data); mutex_unlock(&phydev->lock); } else { ret = dev->ethtool_ops->get_phy_tunable(dev, &tuna, data); } if (ret) goto out; useraddr += sizeof(tuna); ret = -EFAULT; if (copy_to_user(useraddr, data, tuna.len)) goto out; ret = 0; out: kfree(data); return ret; } static int set_phy_tunable(struct net_device *dev, void __user *useraddr) { struct phy_device *phydev = dev->phydev; struct ethtool_tunable tuna; bool phy_drv_tunable; void *data; int ret; phy_drv_tunable = phydev && phydev->drv && phydev->drv->get_tunable; if (!phy_drv_tunable && !dev->ethtool_ops->set_phy_tunable) return -EOPNOTSUPP; if (copy_from_user(&tuna, useraddr, sizeof(tuna))) return -EFAULT; ret = ethtool_phy_tunable_valid(&tuna); if (ret) return ret; useraddr += sizeof(tuna); data = memdup_user(useraddr, tuna.len); if (IS_ERR(data)) return PTR_ERR(data); if (phy_drv_tunable) { mutex_lock(&phydev->lock); ret = phydev->drv->set_tunable(phydev, &tuna, data); mutex_unlock(&phydev->lock); } else { ret = dev->ethtool_ops->set_phy_tunable(dev, &tuna, data); } kfree(data); return ret; } static int ethtool_get_fecparam(struct net_device *dev, void __user *useraddr) { struct ethtool_fecparam fecparam = { .cmd = ETHTOOL_GFECPARAM }; int rc; if (!dev->ethtool_ops->get_fecparam) return -EOPNOTSUPP; rc = dev->ethtool_ops->get_fecparam(dev, &fecparam); if (rc) return rc; if (WARN_ON_ONCE(fecparam.reserved)) fecparam.reserved = 0; if (copy_to_user(useraddr, &fecparam, sizeof(fecparam))) return -EFAULT; return 0; } static int ethtool_set_fecparam(struct net_device *dev, void __user *useraddr) { struct ethtool_fecparam fecparam; if (!dev->ethtool_ops->set_fecparam) return -EOPNOTSUPP; if (copy_from_user(&fecparam, useraddr, sizeof(fecparam))) return -EFAULT; if (!fecparam.fec || fecparam.fec & ETHTOOL_FEC_NONE) return -EINVAL; fecparam.active_fec = 0; fecparam.reserved = 0; return dev->ethtool_ops->set_fecparam(dev, &fecparam); } /* The main entry point in this file. Called from net/core/dev_ioctl.c */ static int __dev_ethtool(struct net *net, struct ifreq *ifr, void __user *useraddr, u32 ethcmd, struct ethtool_devlink_compat *devlink_state) { struct net_device *dev; u32 sub_cmd; int rc; netdev_features_t old_features; dev = __dev_get_by_name(net, ifr->ifr_name); if (!dev) return -ENODEV; if (ethcmd == ETHTOOL_PERQUEUE) { if (copy_from_user(&sub_cmd, useraddr + sizeof(ethcmd), sizeof(sub_cmd))) return -EFAULT; } else { sub_cmd = ethcmd; } /* Allow some commands to be done by anyone */ switch (sub_cmd) { case ETHTOOL_GSET: case ETHTOOL_GDRVINFO: case ETHTOOL_GMSGLVL: case ETHTOOL_GLINK: case ETHTOOL_GCOALESCE: case ETHTOOL_GRINGPARAM: case ETHTOOL_GPAUSEPARAM: case ETHTOOL_GRXCSUM: case ETHTOOL_GTXCSUM: case ETHTOOL_GSG: case ETHTOOL_GSSET_INFO: case ETHTOOL_GSTRINGS: case ETHTOOL_GSTATS: case ETHTOOL_GPHYSTATS: case ETHTOOL_GTSO: case ETHTOOL_GPERMADDR: case ETHTOOL_GUFO: case ETHTOOL_GGSO: case ETHTOOL_GGRO: case ETHTOOL_GFLAGS: case ETHTOOL_GPFLAGS: case ETHTOOL_GRXFH: case ETHTOOL_GRXRINGS: case ETHTOOL_GRXCLSRLCNT: case ETHTOOL_GRXCLSRULE: case ETHTOOL_GRXCLSRLALL: case ETHTOOL_GRXFHINDIR: case ETHTOOL_GRSSH: case ETHTOOL_GFEATURES: case ETHTOOL_GCHANNELS: case ETHTOOL_GET_TS_INFO: case ETHTOOL_GEEE: case ETHTOOL_GTUNABLE: case ETHTOOL_PHY_GTUNABLE: case ETHTOOL_GLINKSETTINGS: case ETHTOOL_GFECPARAM: break; default: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; } if (dev->dev.parent) pm_runtime_get_sync(dev->dev.parent); if (!netif_device_present(dev)) { rc = -ENODEV; goto out; } if (dev->ethtool_ops->begin) { rc = dev->ethtool_ops->begin(dev); if (rc < 0) goto out; } old_features = dev->features; switch (ethcmd) { case ETHTOOL_GSET: rc = ethtool_get_settings(dev, useraddr); break; case ETHTOOL_SSET: rc = ethtool_set_settings(dev, useraddr); break; case ETHTOOL_GDRVINFO: rc = ethtool_get_drvinfo(dev, devlink_state); break; case ETHTOOL_GREGS: rc = ethtool_get_regs(dev, useraddr); break; case ETHTOOL_GWOL: rc = ethtool_get_wol(dev, useraddr); break; case ETHTOOL_SWOL: rc = ethtool_set_wol(dev, useraddr); break; case ETHTOOL_GMSGLVL: rc = ethtool_get_value(dev, useraddr, ethcmd, dev->ethtool_ops->get_msglevel); break; case ETHTOOL_SMSGLVL: rc = ethtool_set_value_void(dev, useraddr, dev->ethtool_ops->set_msglevel); if (!rc) ethtool_notify(dev, ETHTOOL_MSG_DEBUG_NTF, NULL); break; case ETHTOOL_GEEE: rc = ethtool_get_eee(dev, useraddr); break; case ETHTOOL_SEEE: rc = ethtool_set_eee(dev, useraddr); break; case ETHTOOL_NWAY_RST: rc = ethtool_nway_reset(dev); break; case ETHTOOL_GLINK: rc = ethtool_get_link(dev, useraddr); break; case ETHTOOL_GEEPROM: rc = ethtool_get_eeprom(dev, useraddr); break; case ETHTOOL_SEEPROM: rc = ethtool_set_eeprom(dev, useraddr); break; case ETHTOOL_GCOALESCE: rc = ethtool_get_coalesce(dev, useraddr); break; case ETHTOOL_SCOALESCE: rc = ethtool_set_coalesce(dev, useraddr); break; case ETHTOOL_GRINGPARAM: rc = ethtool_get_ringparam(dev, useraddr); break; case ETHTOOL_SRINGPARAM: rc = ethtool_set_ringparam(dev, useraddr); break; case ETHTOOL_GPAUSEPARAM: rc = ethtool_get_pauseparam(dev, useraddr); break; case ETHTOOL_SPAUSEPARAM: rc = ethtool_set_pauseparam(dev, useraddr); break; case ETHTOOL_TEST: rc = ethtool_self_test(dev, useraddr); break; case ETHTOOL_GSTRINGS: rc = ethtool_get_strings(dev, useraddr); break; case ETHTOOL_PHYS_ID: rc = ethtool_phys_id(dev, useraddr); break; case ETHTOOL_GSTATS: rc = ethtool_get_stats(dev, useraddr); break; case ETHTOOL_GPERMADDR: rc = ethtool_get_perm_addr(dev, useraddr); break; case ETHTOOL_GFLAGS: rc = ethtool_get_value(dev, useraddr, ethcmd, __ethtool_get_flags); break; case ETHTOOL_SFLAGS: rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags); break; case ETHTOOL_GPFLAGS: rc = ethtool_get_value(dev, useraddr, ethcmd, dev->ethtool_ops->get_priv_flags); if (!rc) ethtool_notify(dev, ETHTOOL_MSG_PRIVFLAGS_NTF, NULL); break; case ETHTOOL_SPFLAGS: rc = ethtool_set_value(dev, useraddr, dev->ethtool_ops->set_priv_flags); break; case ETHTOOL_GRXFH: case ETHTOOL_GRXRINGS: case ETHTOOL_GRXCLSRLCNT: case ETHTOOL_GRXCLSRULE: case ETHTOOL_GRXCLSRLALL: rc = ethtool_get_rxnfc(dev, ethcmd, useraddr); break; case ETHTOOL_SRXFH: case ETHTOOL_SRXCLSRLDEL: case ETHTOOL_SRXCLSRLINS: rc = ethtool_set_rxnfc(dev, ethcmd, useraddr); break; case ETHTOOL_FLASHDEV: rc = ethtool_flash_device(dev, devlink_state); break; case ETHTOOL_RESET: rc = ethtool_reset(dev, useraddr); break; case ETHTOOL_GSSET_INFO: rc = ethtool_get_sset_info(dev, useraddr); break; case ETHTOOL_GRXFHINDIR: rc = ethtool_get_rxfh_indir(dev, useraddr); break; case ETHTOOL_SRXFHINDIR: rc = ethtool_set_rxfh_indir(dev, useraddr); break; case ETHTOOL_GRSSH: rc = ethtool_get_rxfh(dev, useraddr); break; case ETHTOOL_SRSSH: rc = ethtool_set_rxfh(dev, useraddr); break; case ETHTOOL_GFEATURES: rc = ethtool_get_features(dev, useraddr); break; case ETHTOOL_SFEATURES: rc = ethtool_set_features(dev, useraddr); break; case ETHTOOL_GTXCSUM: case ETHTOOL_GRXCSUM: case ETHTOOL_GSG: case ETHTOOL_GTSO: case ETHTOOL_GGSO: case ETHTOOL_GGRO: rc = ethtool_get_one_feature(dev, useraddr, ethcmd); break; case ETHTOOL_STXCSUM: case ETHTOOL_SRXCSUM: case ETHTOOL_SSG: case ETHTOOL_STSO: case ETHTOOL_SGSO: case ETHTOOL_SGRO: rc = ethtool_set_one_feature(dev, useraddr, ethcmd); break; case ETHTOOL_GCHANNELS: rc = ethtool_get_channels(dev, useraddr); break; case ETHTOOL_SCHANNELS: rc = ethtool_set_channels(dev, useraddr); break; case ETHTOOL_SET_DUMP: rc = ethtool_set_dump(dev, useraddr); break; case ETHTOOL_GET_DUMP_FLAG: rc = ethtool_get_dump_flag(dev, useraddr); break; case ETHTOOL_GET_DUMP_DATA: rc = ethtool_get_dump_data(dev, useraddr); break; case ETHTOOL_GET_TS_INFO: rc = ethtool_get_ts_info(dev, useraddr); break; case ETHTOOL_GMODULEINFO: rc = ethtool_get_module_info(dev, useraddr); break; case ETHTOOL_GMODULEEEPROM: rc = ethtool_get_module_eeprom(dev, useraddr); break; case ETHTOOL_GTUNABLE: rc = ethtool_get_tunable(dev, useraddr); break; case ETHTOOL_STUNABLE: rc = ethtool_set_tunable(dev, useraddr); break; case ETHTOOL_GPHYSTATS: rc = ethtool_get_phy_stats(dev, useraddr); break; case ETHTOOL_PERQUEUE: rc = ethtool_set_per_queue(dev, useraddr, sub_cmd); break; case ETHTOOL_GLINKSETTINGS: rc = ethtool_get_link_ksettings(dev, useraddr); break; case ETHTOOL_SLINKSETTINGS: rc = ethtool_set_link_ksettings(dev, useraddr); break; case ETHTOOL_PHY_GTUNABLE: rc = get_phy_tunable(dev, useraddr); break; case ETHTOOL_PHY_STUNABLE: rc = set_phy_tunable(dev, useraddr); break; case ETHTOOL_GFECPARAM: rc = ethtool_get_fecparam(dev, useraddr); break; case ETHTOOL_SFECPARAM: rc = ethtool_set_fecparam(dev, useraddr); break; default: rc = -EOPNOTSUPP; } if (dev->ethtool_ops->complete) dev->ethtool_ops->complete(dev); if (old_features != dev->features) netdev_features_change(dev); out: if (dev->dev.parent) pm_runtime_put(dev->dev.parent); return rc; } int dev_ethtool(struct net *net, struct ifreq *ifr, void __user *useraddr) { struct ethtool_devlink_compat *state; u32 ethcmd; int rc; if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd))) return -EFAULT; state = kzalloc(sizeof(*state), GFP_KERNEL); if (!state) return -ENOMEM; switch (ethcmd) { case ETHTOOL_FLASHDEV: if (copy_from_user(&state->efl, useraddr, sizeof(state->efl))) { rc = -EFAULT; goto exit_free; } state->efl.data[ETHTOOL_FLASH_MAX_FILENAME - 1] = 0; break; } rtnl_lock(); rc = __dev_ethtool(net, ifr, useraddr, ethcmd, state); rtnl_unlock(); if (rc) goto exit_free; switch (ethcmd) { case ETHTOOL_FLASHDEV: if (state->devlink) rc = devlink_compat_flash_update(state->devlink, state->efl.data); break; case ETHTOOL_GDRVINFO: if (state->devlink) devlink_compat_running_version(state->devlink, state->info.fw_version, sizeof(state->info.fw_version)); if (copy_to_user(useraddr, &state->info, sizeof(state->info))) { rc = -EFAULT; goto exit_free; } break; } exit_free: if (state->devlink) devlink_put(state->devlink); kfree(state); return rc; } struct ethtool_rx_flow_key { struct flow_dissector_key_basic basic; union { struct flow_dissector_key_ipv4_addrs ipv4; struct flow_dissector_key_ipv6_addrs ipv6; }; struct flow_dissector_key_ports tp; struct flow_dissector_key_ip ip; struct flow_dissector_key_vlan vlan; struct flow_dissector_key_eth_addrs eth_addrs; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ struct ethtool_rx_flow_match { struct flow_dissector dissector; struct ethtool_rx_flow_key key; struct ethtool_rx_flow_key mask; }; struct ethtool_rx_flow_rule * ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) { const struct ethtool_rx_flow_spec *fs = input->fs; struct ethtool_rx_flow_match *match; struct ethtool_rx_flow_rule *flow; struct flow_action_entry *act; flow = kzalloc(sizeof(struct ethtool_rx_flow_rule) + sizeof(struct ethtool_rx_flow_match), GFP_KERNEL); if (!flow) return ERR_PTR(-ENOMEM); /* ethtool_rx supports only one single action per rule. */ flow->rule = flow_rule_alloc(1); if (!flow->rule) { kfree(flow); return ERR_PTR(-ENOMEM); } match = (struct ethtool_rx_flow_match *)flow->priv; flow->rule->match.dissector = &match->dissector; flow->rule->match.mask = &match->mask; flow->rule->match.key = &match->key; match->mask.basic.n_proto = htons(0xffff); switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT | FLOW_RSS)) { case ETHER_FLOW: { const struct ethhdr *ether_spec, *ether_m_spec; ether_spec = &fs->h_u.ether_spec; ether_m_spec = &fs->m_u.ether_spec; if (!is_zero_ether_addr(ether_m_spec->h_source)) { ether_addr_copy(match->key.eth_addrs.src, ether_spec->h_source); ether_addr_copy(match->mask.eth_addrs.src, ether_m_spec->h_source); } if (!is_zero_ether_addr(ether_m_spec->h_dest)) { ether_addr_copy(match->key.eth_addrs.dst, ether_spec->h_dest); ether_addr_copy(match->mask.eth_addrs.dst, ether_m_spec->h_dest); } if (ether_m_spec->h_proto) { match->key.basic.n_proto = ether_spec->h_proto; match->mask.basic.n_proto = ether_m_spec->h_proto; } } break; case TCP_V4_FLOW: case UDP_V4_FLOW: { const struct ethtool_tcpip4_spec *v4_spec, *v4_m_spec; match->key.basic.n_proto = htons(ETH_P_IP); v4_spec = &fs->h_u.tcp_ip4_spec; v4_m_spec = &fs->m_u.tcp_ip4_spec; if (v4_m_spec->ip4src) { match->key.ipv4.src = v4_spec->ip4src; match->mask.ipv4.src = v4_m_spec->ip4src; } if (v4_m_spec->ip4dst) { match->key.ipv4.dst = v4_spec->ip4dst; match->mask.ipv4.dst = v4_m_spec->ip4dst; } if (v4_m_spec->ip4src || v4_m_spec->ip4dst) { match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_IPV4_ADDRS); match->dissector.offset[FLOW_DISSECTOR_KEY_IPV4_ADDRS] = offsetof(struct ethtool_rx_flow_key, ipv4); } if (v4_m_spec->psrc) { match->key.tp.src = v4_spec->psrc; match->mask.tp.src = v4_m_spec->psrc; } if (v4_m_spec->pdst) { match->key.tp.dst = v4_spec->pdst; match->mask.tp.dst = v4_m_spec->pdst; } if (v4_m_spec->psrc || v4_m_spec->pdst) { match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_PORTS); match->dissector.offset[FLOW_DISSECTOR_KEY_PORTS] = offsetof(struct ethtool_rx_flow_key, tp); } if (v4_m_spec->tos) { match->key.ip.tos = v4_spec->tos; match->mask.ip.tos = v4_m_spec->tos; match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_IP); match->dissector.offset[FLOW_DISSECTOR_KEY_IP] = offsetof(struct ethtool_rx_flow_key, ip); } } break; case TCP_V6_FLOW: case UDP_V6_FLOW: { const struct ethtool_tcpip6_spec *v6_spec, *v6_m_spec; match->key.basic.n_proto = htons(ETH_P_IPV6); v6_spec = &fs->h_u.tcp_ip6_spec; v6_m_spec = &fs->m_u.tcp_ip6_spec; if (!ipv6_addr_any((struct in6_addr *)v6_m_spec->ip6src)) { memcpy(&match->key.ipv6.src, v6_spec->ip6src, sizeof(match->key.ipv6.src)); memcpy(&match->mask.ipv6.src, v6_m_spec->ip6src, sizeof(match->mask.ipv6.src)); } if (!ipv6_addr_any((struct in6_addr *)v6_m_spec->ip6dst)) { memcpy(&match->key.ipv6.dst, v6_spec->ip6dst, sizeof(match->key.ipv6.dst)); memcpy(&match->mask.ipv6.dst, v6_m_spec->ip6dst, sizeof(match->mask.ipv6.dst)); } if (!ipv6_addr_any((struct in6_addr *)v6_m_spec->ip6src) || !ipv6_addr_any((struct in6_addr *)v6_m_spec->ip6dst)) { match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_IPV6_ADDRS); match->dissector.offset[FLOW_DISSECTOR_KEY_IPV6_ADDRS] = offsetof(struct ethtool_rx_flow_key, ipv6); } if (v6_m_spec->psrc) { match->key.tp.src = v6_spec->psrc; match->mask.tp.src = v6_m_spec->psrc; } if (v6_m_spec->pdst) { match->key.tp.dst = v6_spec->pdst; match->mask.tp.dst = v6_m_spec->pdst; } if (v6_m_spec->psrc || v6_m_spec->pdst) { match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_PORTS); match->dissector.offset[FLOW_DISSECTOR_KEY_PORTS] = offsetof(struct ethtool_rx_flow_key, tp); } if (v6_m_spec->tclass) { match->key.ip.tos = v6_spec->tclass; match->mask.ip.tos = v6_m_spec->tclass; match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_IP); match->dissector.offset[FLOW_DISSECTOR_KEY_IP] = offsetof(struct ethtool_rx_flow_key, ip); } } break; default: ethtool_rx_flow_rule_destroy(flow); return ERR_PTR(-EINVAL); } switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT | FLOW_RSS)) { case TCP_V4_FLOW: case TCP_V6_FLOW: match->key.basic.ip_proto = IPPROTO_TCP; match->mask.basic.ip_proto = 0xff; break; case UDP_V4_FLOW: case UDP_V6_FLOW: match->key.basic.ip_proto = IPPROTO_UDP; match->mask.basic.ip_proto = 0xff; break; } match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_BASIC); match->dissector.offset[FLOW_DISSECTOR_KEY_BASIC] = offsetof(struct ethtool_rx_flow_key, basic); if (fs->flow_type & FLOW_EXT) { const struct ethtool_flow_ext *ext_h_spec = &fs->h_ext; const struct ethtool_flow_ext *ext_m_spec = &fs->m_ext; if (ext_m_spec->vlan_etype) { match->key.vlan.vlan_tpid = ext_h_spec->vlan_etype; match->mask.vlan.vlan_tpid = ext_m_spec->vlan_etype; } if (ext_m_spec->vlan_tci) { match->key.vlan.vlan_id = ntohs(ext_h_spec->vlan_tci) & 0x0fff; match->mask.vlan.vlan_id = ntohs(ext_m_spec->vlan_tci) & 0x0fff; match->key.vlan.vlan_dei = !!(ext_h_spec->vlan_tci & htons(0x1000)); match->mask.vlan.vlan_dei = !!(ext_m_spec->vlan_tci & htons(0x1000)); match->key.vlan.vlan_priority = (ntohs(ext_h_spec->vlan_tci) & 0xe000) >> 13; match->mask.vlan.vlan_priority = (ntohs(ext_m_spec->vlan_tci) & 0xe000) >> 13; } if (ext_m_spec->vlan_etype || ext_m_spec->vlan_tci) { match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_VLAN); match->dissector.offset[FLOW_DISSECTOR_KEY_VLAN] = offsetof(struct ethtool_rx_flow_key, vlan); } } if (fs->flow_type & FLOW_MAC_EXT) { const struct ethtool_flow_ext *ext_h_spec = &fs->h_ext; const struct ethtool_flow_ext *ext_m_spec = &fs->m_ext; memcpy(match->key.eth_addrs.dst, ext_h_spec->h_dest, ETH_ALEN); memcpy(match->mask.eth_addrs.dst, ext_m_spec->h_dest, ETH_ALEN); match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_ETH_ADDRS); match->dissector.offset[FLOW_DISSECTOR_KEY_ETH_ADDRS] = offsetof(struct ethtool_rx_flow_key, eth_addrs); } act = &flow->rule->action.entries[0]; switch (fs->ring_cookie) { case RX_CLS_FLOW_DISC: act->id = FLOW_ACTION_DROP; break; case RX_CLS_FLOW_WAKE: act->id = FLOW_ACTION_WAKE; break; default: act->id = FLOW_ACTION_QUEUE; if (fs->flow_type & FLOW_RSS) act->queue.ctx = input->rss_ctx; act->queue.vf = ethtool_get_flow_spec_ring_vf(fs->ring_cookie); act->queue.index = ethtool_get_flow_spec_ring(fs->ring_cookie); break; } return flow; } EXPORT_SYMBOL(ethtool_rx_flow_rule_create); void ethtool_rx_flow_rule_destroy(struct ethtool_rx_flow_rule *flow) { kfree(flow->rule); kfree(flow); } EXPORT_SYMBOL(ethtool_rx_flow_rule_destroy);
33 1075 191 197 3160 93 93 73 122 393 1664 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 /** * css_get - obtain a reference on the specified css * @css: target css * * The caller must already have a reference. */ CGROUP_REF_FN_ATTRS void css_get(struct cgroup_subsys_state *css) { if (!(css->flags & CSS_NO_REF)) percpu_ref_get(&css->refcnt); } CGROUP_REF_EXPORT(css_get) /** * css_get_many - obtain references on the specified css * @css: target css * @n: number of references to get * * The caller must already have a reference. */ CGROUP_REF_FN_ATTRS void css_get_many(struct cgroup_subsys_state *css, unsigned int n) { if (!(css->flags & CSS_NO_REF)) percpu_ref_get_many(&css->refcnt, n); } CGROUP_REF_EXPORT(css_get_many) /** * css_tryget - try to obtain a reference on the specified css * @css: target css * * Obtain a reference on @css unless it already has reached zero and is * being released. This function doesn't care whether @css is on or * offline. The caller naturally needs to ensure that @css is accessible * but doesn't have to be holding a reference on it - IOW, RCU protected * access is good enough for this function. Returns %true if a reference * count was successfully obtained; %false otherwise. */ CGROUP_REF_FN_ATTRS bool css_tryget(struct cgroup_subsys_state *css) { if (!(css->flags & CSS_NO_REF)) return percpu_ref_tryget(&css->refcnt); return true; } CGROUP_REF_EXPORT(css_tryget) /** * css_tryget_online - try to obtain a reference on the specified css if online * @css: target css * * Obtain a reference on @css if it's online. The caller naturally needs * to ensure that @css is accessible but doesn't have to be holding a * reference on it - IOW, RCU protected access is good enough for this * function. Returns %true if a reference count was successfully obtained; * %false otherwise. */ CGROUP_REF_FN_ATTRS bool css_tryget_online(struct cgroup_subsys_state *css) { if (!(css->flags & CSS_NO_REF)) return percpu_ref_tryget_live(&css->refcnt); return true; } CGROUP_REF_EXPORT(css_tryget_online) /** * css_put - put a css reference * @css: target css * * Put a reference obtained via css_get() and css_tryget_online(). */ CGROUP_REF_FN_ATTRS void css_put(struct cgroup_subsys_state *css) { if (!(css->flags & CSS_NO_REF)) percpu_ref_put(&css->refcnt); } CGROUP_REF_EXPORT(css_put) /** * css_put_many - put css references * @css: target css * @n: number of references to put * * Put references obtained via css_get() and css_tryget_online(). */ CGROUP_REF_FN_ATTRS void css_put_many(struct cgroup_subsys_state *css, unsigned int n) { if (!(css->flags & CSS_NO_REF)) percpu_ref_put_many(&css->refcnt, n); } CGROUP_REF_EXPORT(css_put_many)
6 220 246 21 224 511 513 299 515 6 350 12 11 144 145 350 350 347 23 238 151 4 79 126 316 400 399 171 3 223 34 95 104 187 186 176 157 156 206 198 6 4 200 16 9 158 158 162 205 206 208 207 10 155 45 198 7 161 160 143 161 161 98 93 134 33 96 66 157 1 178 68 33 44 68 211 1 211 211 209 209 209 209 208 10 98 82 175 1 190 22 1 34 56 55 84 1 75 2 158 130 1 35 163 162 161 346 342 249 134 9 383 360 144 26 190 170 77 102 170 385 2 25 1 359 9 384 1 9 336 336 338 1 21 14 7 21 2 144 17 127 114 14 7 247 112 1 134 3 241 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 // SPDX-License-Identifier: GPL-2.0 /* * This file contains the procedures for the handling of select and poll * * Created for Linux based loosely upon Mathius Lattner's minix * patches by Peter MacDonald. Heavily edited by Linus. * * 4 February 1994 * COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS * flag set in its personality we do *not* modify the given timeout * parameter to reflect time remaining. * * 24 January 2000 * Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation * of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian). */ #include <linux/compat.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/sched/rt.h> #include <linux/syscalls.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/personality.h> /* for STICKY_TIMEOUTS */ #include <linux/file.h> #include <linux/fdtable.h> #include <linux/fs.h> #include <linux/rcupdate.h> #include <linux/hrtimer.h> #include <linux/freezer.h> #include <net/busy_poll.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> /* * Estimate expected accuracy in ns from a timeval. * * After quite a bit of churning around, we've settled on * a simple thing of taking 0.1% of the timeout as the * slack, with a cap of 100 msec. * "nice" tasks get a 0.5% slack instead. * * Consider this comment an open invitation to come up with even * better solutions.. */ #define MAX_SLACK (100 * NSEC_PER_MSEC) static long __estimate_accuracy(struct timespec64 *tv) { long slack; int divfactor = 1000; if (tv->tv_sec < 0) return 0; if (task_nice(current) > 0) divfactor = divfactor / 5; if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/divfactor)) return MAX_SLACK; slack = tv->tv_nsec / divfactor; slack += tv->tv_sec * (NSEC_PER_SEC/divfactor); if (slack > MAX_SLACK) return MAX_SLACK; return slack; } u64 select_estimate_accuracy(struct timespec64 *tv) { u64 ret; struct timespec64 now; /* * Realtime tasks get a slack of 0 for obvious reasons. */ if (rt_task(current)) return 0; ktime_get_ts64(&now); now = timespec64_sub(*tv, now); ret = __estimate_accuracy(&now); if (ret < current->timer_slack_ns) return current->timer_slack_ns; return ret; } struct poll_table_page { struct poll_table_page * next; struct poll_table_entry * entry; struct poll_table_entry entries[]; }; #define POLL_TABLE_FULL(table) \ ((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table)) /* * Ok, Peter made a complicated, but straightforward multiple_wait() function. * I have rewritten this, taking some shortcuts: This code may not be easy to * follow, but it should be free of race-conditions, and it's practical. If you * understand what I'm doing here, then you understand how the linux * sleep/wakeup mechanism works. * * Two very simple procedures, poll_wait() and poll_freewait() make all the * work. poll_wait() is an inline-function defined in <linux/poll.h>, * as all select/poll functions have to call it to add an entry to the * poll table. */ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p); void poll_initwait(struct poll_wqueues *pwq) { init_poll_funcptr(&pwq->pt, __pollwait); pwq->polling_task = current; pwq->triggered = 0; pwq->error = 0; pwq->table = NULL; pwq->inline_index = 0; } EXPORT_SYMBOL(poll_initwait); static void free_poll_entry(struct poll_table_entry *entry) { remove_wait_queue(entry->wait_address, &entry->wait); fput(entry->filp); } void poll_freewait(struct poll_wqueues *pwq) { struct poll_table_page * p = pwq->table; int i; for (i = 0; i < pwq->inline_index; i++) free_poll_entry(pwq->inline_entries + i); while (p) { struct poll_table_entry * entry; struct poll_table_page *old; entry = p->entry; do { entry--; free_poll_entry(entry); } while (entry > p->entries); old = p; p = p->next; free_page((unsigned long) old); } } EXPORT_SYMBOL(poll_freewait); static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p) { struct poll_table_page *table = p->table; if (p->inline_index < N_INLINE_POLL_ENTRIES) return p->inline_entries + p->inline_index++; if (!table || POLL_TABLE_FULL(table)) { struct poll_table_page *new_table; new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL); if (!new_table) { p->error = -ENOMEM; return NULL; } new_table->entry = new_table->entries; new_table->next = table; p->table = new_table; table = new_table; } return table->entry++; } static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct poll_wqueues *pwq = wait->private; DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task); /* * Although this function is called under waitqueue lock, LOCK * doesn't imply write barrier and the users expect write * barrier semantics on wakeup functions. The following * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() * and is paired with smp_store_mb() in poll_schedule_timeout. */ smp_wmb(); pwq->triggered = 1; /* * Perform the default wake up operation using a dummy * waitqueue. * * TODO: This is hacky but there currently is no interface to * pass in @sync. @sync is scheduled to be removed and once * that happens, wake_up_process() can be used directly. */ return default_wake_function(&dummy_wait, mode, sync, key); } static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct poll_table_entry *entry; entry = container_of(wait, struct poll_table_entry, wait); if (key && !(key_to_poll(key) & entry->key)) return 0; return __pollwake(wait, mode, sync, key); } /* Add a new entry */ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p) { struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt); struct poll_table_entry *entry = poll_get_entry(pwq); if (!entry) return; entry->filp = get_file(filp); entry->wait_address = wait_address; entry->key = p->_key; init_waitqueue_func_entry(&entry->wait, pollwake); entry->wait.private = pwq; add_wait_queue(wait_address, &entry->wait); } static int poll_schedule_timeout(struct poll_wqueues *pwq, int state, ktime_t *expires, unsigned long slack) { int rc = -EINTR; set_current_state(state); if (!pwq->triggered) rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS); __set_current_state(TASK_RUNNING); /* * Prepare for the next iteration. * * The following smp_store_mb() serves two purposes. First, it's * the counterpart rmb of the wmb in pollwake() such that data * written before wake up is always visible after wake up. * Second, the full barrier guarantees that triggered clearing * doesn't pass event check of the next iteration. Note that * this problem doesn't exist for the first iteration as * add_wait_queue() has full barrier semantics. */ smp_store_mb(pwq->triggered, 0); return rc; } /** * poll_select_set_timeout - helper function to setup the timeout value * @to: pointer to timespec64 variable for the final timeout * @sec: seconds (from user space) * @nsec: nanoseconds (from user space) * * Note, we do not use a timespec for the user space value here, That * way we can use the function for timeval and compat interfaces as well. * * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0. */ int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec) { struct timespec64 ts = {.tv_sec = sec, .tv_nsec = nsec}; if (!timespec64_valid(&ts)) return -EINVAL; /* Optimize for the zero timeout value here */ if (!sec && !nsec) { to->tv_sec = to->tv_nsec = 0; } else { ktime_get_ts64(to); *to = timespec64_add_safe(*to, ts); } return 0; } enum poll_time_type { PT_TIMEVAL = 0, PT_OLD_TIMEVAL = 1, PT_TIMESPEC = 2, PT_OLD_TIMESPEC = 3, }; static int poll_select_finish(struct timespec64 *end_time, void __user *p, enum poll_time_type pt_type, int ret) { struct timespec64 rts; restore_saved_sigmask_unless(ret == -ERESTARTNOHAND); if (!p) return ret; if (current->personality & STICKY_TIMEOUTS) goto sticky; /* No update for zero timeout */ if (!end_time->tv_sec && !end_time->tv_nsec) return ret; ktime_get_ts64(&rts); rts = timespec64_sub(*end_time, rts); if (rts.tv_sec < 0) rts.tv_sec = rts.tv_nsec = 0; switch (pt_type) { case PT_TIMEVAL: { struct __kernel_old_timeval rtv; if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec)) memset(&rtv, 0, sizeof(rtv)); rtv.tv_sec = rts.tv_sec; rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC; if (!copy_to_user(p, &rtv, sizeof(rtv))) return ret; } break; case PT_OLD_TIMEVAL: { struct old_timeval32 rtv; rtv.tv_sec = rts.tv_sec; rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC; if (!copy_to_user(p, &rtv, sizeof(rtv))) return ret; } break; case PT_TIMESPEC: if (!put_timespec64(&rts, p)) return ret; break; case PT_OLD_TIMESPEC: if (!put_old_timespec32(&rts, p)) return ret; break; default: BUG(); } /* * If an application puts its timeval in read-only memory, we * don't want the Linux-specific update to the timeval to * cause a fault after the select has completed * successfully. However, because we're not updating the * timeval, we can't restart the system call. */ sticky: if (ret == -ERESTARTNOHAND) ret = -EINTR; return ret; } /* * Scalable version of the fd_set. */ typedef struct { unsigned long *in, *out, *ex; unsigned long *res_in, *res_out, *res_ex; } fd_set_bits; /* * How many longwords for "nr" bits? */ #define FDS_BITPERLONG (8*sizeof(long)) #define FDS_LONGS(nr) (((nr)+FDS_BITPERLONG-1)/FDS_BITPERLONG) #define FDS_BYTES(nr) (FDS_LONGS(nr)*sizeof(long)) /* * Use "unsigned long" accesses to let user-mode fd_set's be long-aligned. */ static inline int get_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset) { nr = FDS_BYTES(nr); if (ufdset) return copy_from_user(fdset, ufdset, nr) ? -EFAULT : 0; memset(fdset, 0, nr); return 0; } static inline unsigned long __must_check set_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset) { if (ufdset) return __copy_to_user(ufdset, fdset, FDS_BYTES(nr)); return 0; } static inline void zero_fd_set(unsigned long nr, unsigned long *fdset) { memset(fdset, 0, FDS_BYTES(nr)); } #define FDS_IN(fds, n) (fds->in + n) #define FDS_OUT(fds, n) (fds->out + n) #define FDS_EX(fds, n) (fds->ex + n) #define BITS(fds, n) (*FDS_IN(fds, n)|*FDS_OUT(fds, n)|*FDS_EX(fds, n)) static int max_select_fd(unsigned long n, fd_set_bits *fds) { unsigned long *open_fds; unsigned long set; int max; struct fdtable *fdt; /* handle last in-complete long-word first */ set = ~(~0UL << (n & (BITS_PER_LONG-1))); n /= BITS_PER_LONG; fdt = files_fdtable(current->files); open_fds = fdt->open_fds + n; max = 0; if (set) { set &= BITS(fds, n); if (set) { if (!(set & ~*open_fds)) goto get_max; return -EBADF; } } while (n) { open_fds--; n--; set = BITS(fds, n); if (!set) continue; if (set & ~*open_fds) return -EBADF; if (max) continue; get_max: do { max++; set >>= 1; } while (set); max += n * BITS_PER_LONG; } return max; } #define POLLIN_SET (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | EPOLLHUP | EPOLLERR |\ EPOLLNVAL) #define POLLOUT_SET (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT | EPOLLERR |\ EPOLLNVAL) #define POLLEX_SET (EPOLLPRI | EPOLLNVAL) static inline void wait_key_set(poll_table *wait, unsigned long in, unsigned long out, unsigned long bit, __poll_t ll_flag) { wait->_key = POLLEX_SET | ll_flag; if (in & bit) wait->_key |= POLLIN_SET; if (out & bit) wait->_key |= POLLOUT_SET; } static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) { ktime_t expire, *to = NULL; struct poll_wqueues table; poll_table *wait; int retval, i, timed_out = 0; u64 slack = 0; __poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; unsigned long busy_start = 0; rcu_read_lock(); retval = max_select_fd(n, fds); rcu_read_unlock(); if (retval < 0) return retval; n = retval; poll_initwait(&table); wait = &table.pt; if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { wait->_qproc = NULL; timed_out = 1; } if (end_time && !timed_out) slack = select_estimate_accuracy(end_time); retval = 0; for (;;) { unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; bool can_busy_loop = false; inp = fds->in; outp = fds->out; exp = fds->ex; rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; for (i = 0; i < n; ++rinp, ++routp, ++rexp) { unsigned long in, out, ex, all_bits, bit = 1, j; unsigned long res_in = 0, res_out = 0, res_ex = 0; __poll_t mask; in = *inp++; out = *outp++; ex = *exp++; all_bits = in | out | ex; if (all_bits == 0) { i += BITS_PER_LONG; continue; } for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) { struct fd f; if (i >= n) break; if (!(bit & all_bits)) continue; mask = EPOLLNVAL; f = fdget(i); if (f.file) { wait_key_set(wait, in, out, bit, busy_flag); mask = vfs_poll(f.file, wait); fdput(f); } if ((mask & POLLIN_SET) && (in & bit)) { res_in |= bit; retval++; wait->_qproc = NULL; } if ((mask & POLLOUT_SET) && (out & bit)) { res_out |= bit; retval++; wait->_qproc = NULL; } if ((mask & POLLEX_SET) && (ex & bit)) { res_ex |= bit; retval++; wait->_qproc = NULL; } /* got something, stop busy polling */ if (retval) { can_busy_loop = false; busy_flag = 0; /* * only remember a returned * POLL_BUSY_LOOP if we asked for it */ } else if (busy_flag & mask) can_busy_loop = true; } if (res_in) *rinp = res_in; if (res_out) *routp = res_out; if (res_ex) *rexp = res_ex; cond_resched(); } wait->_qproc = NULL; if (retval || timed_out || signal_pending(current)) break; if (table.error) { retval = table.error; break; } /* only if found POLL_BUSY_LOOP sockets && not out of time */ if (can_busy_loop && !need_resched()) { if (!busy_start) { busy_start = busy_loop_current_time(); continue; } if (!busy_loop_timeout(busy_start)) continue; } busy_flag = 0; /* * If this is the first loop and we have a timeout * given, then we convert to ktime_t and set the to * pointer to the expiry value. */ if (end_time && !to) { expire = timespec64_to_ktime(*end_time); to = &expire; } if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE, to, slack)) timed_out = 1; } poll_freewait(&table); return retval; } /* * We can actually return ERESTARTSYS instead of EINTR, but I'd * like to be certain this leads to no problems. So I return * EINTR just for safety. * * Update: ERESTARTSYS breaks at least the xview clock binary, so * I'm trying ERESTARTNOHAND which restart only when you want to. */ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timespec64 *end_time) { fd_set_bits fds; void *bits; int ret, max_fds; size_t size, alloc_size; struct fdtable *fdt; /* Allocate small arguments on the stack to save memory and be faster */ long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; ret = -EINVAL; if (n < 0) goto out_nofds; /* max_fds can increase, so grab it once to avoid race */ rcu_read_lock(); fdt = files_fdtable(current->files); max_fds = fdt->max_fds; rcu_read_unlock(); if (n > max_fds) n = max_fds; /* * We need 6 bitmaps (in/out/ex for both incoming and outgoing), * since we used fdset we need to allocate memory in units of * long-words. */ size = FDS_BYTES(n); bits = stack_fds; if (size > sizeof(stack_fds) / 6) { /* Not enough space in on-stack array; must use kmalloc */ ret = -ENOMEM; if (size > (SIZE_MAX / 6)) goto out_nofds; alloc_size = 6 * size; bits = kvmalloc(alloc_size, GFP_KERNEL); if (!bits) goto out_nofds; } fds.in = bits; fds.out = bits + size; fds.ex = bits + 2*size; fds.res_in = bits + 3*size; fds.res_out = bits + 4*size; fds.res_ex = bits + 5*size; if ((ret = get_fd_set(n, inp, fds.in)) || (ret = get_fd_set(n, outp, fds.out)) || (ret = get_fd_set(n, exp, fds.ex))) goto out; zero_fd_set(n, fds.res_in); zero_fd_set(n, fds.res_out); zero_fd_set(n, fds.res_ex); ret = do_select(n, &fds, end_time); if (ret < 0) goto out; if (!ret) { ret = -ERESTARTNOHAND; if (signal_pending(current)) goto out; ret = 0; } if (set_fd_set(n, inp, fds.res_in) || set_fd_set(n, outp, fds.res_out) || set_fd_set(n, exp, fds.res_ex)) ret = -EFAULT; out: if (bits != stack_fds) kvfree(bits); out_nofds: return ret; } static int kern_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct __kernel_old_timeval __user *tvp) { struct timespec64 end_time, *to = NULL; struct __kernel_old_timeval tv; int ret; if (tvp) { if (copy_from_user(&tv, tvp, sizeof(tv))) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) return -EINVAL; } ret = core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tvp, PT_TIMEVAL, ret); } SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp, fd_set __user *, exp, struct __kernel_old_timeval __user *, tvp) { return kern_select(n, inp, outp, exp, tvp); } static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, void __user *tsp, const sigset_t __user *sigmask, size_t sigsetsize, enum poll_time_type type) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { switch (type) { case PT_TIMESPEC: if (get_timespec64(&ts, tsp)) return -EFAULT; break; case PT_OLD_TIMESPEC: if (get_old_timespec32(&ts, tsp)) return -EFAULT; break; default: BUG(); } to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tsp, type, ret); } /* * Most architectures can't handle 7-argument syscalls. So we provide a * 6-argument version where the sixth argument is a pointer to a structure * which has a pointer to the sigset_t itself followed by a size_t containing * the sigset size. */ struct sigset_argpack { sigset_t __user *p; size_t size; }; static inline int get_sigset_argpack(struct sigset_argpack *to, struct sigset_argpack __user *from) { // the path is hot enough for overhead of copy_from_user() to matter if (from) { if (!user_read_access_begin(from, sizeof(*from))) return -EFAULT; unsafe_get_user(to->p, &from->p, Efault); unsafe_get_user(to->size, &from->size, Efault); user_read_access_end(); } return 0; Efault: user_access_end(); return -EFAULT; } SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp, fd_set __user *, exp, struct __kernel_timespec __user *, tsp, void __user *, sig) { struct sigset_argpack x = {NULL, 0}; if (get_sigset_argpack(&x, sig)) return -EFAULT; return do_pselect(n, inp, outp, exp, tsp, x.p, x.size, PT_TIMESPEC); } #if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT) SYSCALL_DEFINE6(pselect6_time32, int, n, fd_set __user *, inp, fd_set __user *, outp, fd_set __user *, exp, struct old_timespec32 __user *, tsp, void __user *, sig) { struct sigset_argpack x = {NULL, 0}; if (get_sigset_argpack(&x, sig)) return -EFAULT; return do_pselect(n, inp, outp, exp, tsp, x.p, x.size, PT_OLD_TIMESPEC); } #endif #ifdef __ARCH_WANT_SYS_OLD_SELECT struct sel_arg_struct { unsigned long n; fd_set __user *inp, *outp, *exp; struct __kernel_old_timeval __user *tvp; }; SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg) { struct sel_arg_struct a; if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; return kern_select(a.n, a.inp, a.outp, a.exp, a.tvp); } #endif struct poll_list { struct poll_list *next; unsigned int len; struct pollfd entries[]; }; #define POLLFD_PER_PAGE ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd)) /* * Fish for pollable events on the pollfd->fd file descriptor. We're only * interested in events matching the pollfd->events mask, and the result * matching that mask is both recorded in pollfd->revents and returned. The * pwait poll_table will be used by the fd-provided poll handler for waiting, * if pwait->_qproc is non-NULL. */ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait, bool *can_busy_poll, __poll_t busy_flag) { int fd = pollfd->fd; __poll_t mask = 0, filter; struct fd f; if (fd < 0) goto out; mask = EPOLLNVAL; f = fdget(fd); if (!f.file) goto out; /* userland u16 ->events contains POLL... bitmap */ filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP; pwait->_key = filter | busy_flag; mask = vfs_poll(f.file, pwait); if (mask & busy_flag) *can_busy_poll = true; mask &= filter; /* Mask out unneeded events. */ fdput(f); out: /* ... and so does ->revents */ pollfd->revents = mangle_poll(mask); return mask; } static int do_poll(struct poll_list *list, struct poll_wqueues *wait, struct timespec64 *end_time) { poll_table* pt = &wait->pt; ktime_t expire, *to = NULL; int timed_out = 0, count = 0; u64 slack = 0; __poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; unsigned long busy_start = 0; /* Optimise the no-wait case */ if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { pt->_qproc = NULL; timed_out = 1; } if (end_time && !timed_out) slack = select_estimate_accuracy(end_time); for (;;) { struct poll_list *walk; bool can_busy_loop = false; for (walk = list; walk != NULL; walk = walk->next) { struct pollfd * pfd, * pfd_end; pfd = walk->entries; pfd_end = pfd + walk->len; for (; pfd != pfd_end; pfd++) { /* * Fish for events. If we found one, record it * and kill poll_table->_qproc, so we don't * needlessly register any other waiters after * this. They'll get immediately deregistered * when we break out and return. */ if (do_pollfd(pfd, pt, &can_busy_loop, busy_flag)) { count++; pt->_qproc = NULL; /* found something, stop busy polling */ busy_flag = 0; can_busy_loop = false; } } } /* * All waiters have already been registered, so don't provide * a poll_table->_qproc to them on the next loop iteration. */ pt->_qproc = NULL; if (!count) { count = wait->error; if (signal_pending(current)) count = -ERESTARTNOHAND; } if (count || timed_out) break; /* only if found POLL_BUSY_LOOP sockets && not out of time */ if (can_busy_loop && !need_resched()) { if (!busy_start) { busy_start = busy_loop_current_time(); continue; } if (!busy_loop_timeout(busy_start)) continue; } busy_flag = 0; /* * If this is the first loop and we have a timeout * given, then we convert to ktime_t and set the to * pointer to the expiry value. */ if (end_time && !to) { expire = timespec64_to_ktime(*end_time); to = &expire; } if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack)) timed_out = 1; } return count; } #define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list)) / \ sizeof(struct pollfd)) static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, struct timespec64 *end_time) { struct poll_wqueues table; int err = -EFAULT, fdcount; /* Allocate small arguments on the stack to save memory and be faster - use long to make sure the buffer is aligned properly on 64 bit archs to avoid unaligned access */ long stack_pps[POLL_STACK_ALLOC/sizeof(long)]; struct poll_list *const head = (struct poll_list *)stack_pps; struct poll_list *walk = head; unsigned int todo = nfds; unsigned int len; if (nfds > rlimit(RLIMIT_NOFILE)) return -EINVAL; len = min_t(unsigned int, nfds, N_STACK_PPS); for (;;) { walk->next = NULL; walk->len = len; if (!len) break; if (copy_from_user(walk->entries, ufds + nfds-todo, sizeof(struct pollfd) * walk->len)) goto out_fds; if (walk->len >= todo) break; todo -= walk->len; len = min(todo, POLLFD_PER_PAGE); walk = walk->next = kmalloc(struct_size(walk, entries, len), GFP_KERNEL); if (!walk) { err = -ENOMEM; goto out_fds; } } poll_initwait(&table); fdcount = do_poll(head, &table, end_time); poll_freewait(&table); if (!user_write_access_begin(ufds, nfds * sizeof(*ufds))) goto out_fds; for (walk = head; walk; walk = walk->next) { struct pollfd *fds = walk->entries; unsigned int j; for (j = walk->len; j; fds++, ufds++, j--) unsafe_put_user(fds->revents, &ufds->revents, Efault); } user_write_access_end(); err = fdcount; out_fds: walk = head->next; while (walk) { struct poll_list *pos = walk; walk = walk->next; kfree(pos); } return err; Efault: user_write_access_end(); err = -EFAULT; goto out_fds; } static long do_restart_poll(struct restart_block *restart_block) { struct pollfd __user *ufds = restart_block->poll.ufds; int nfds = restart_block->poll.nfds; struct timespec64 *to = NULL, end_time; int ret; if (restart_block->poll.has_timeout) { end_time.tv_sec = restart_block->poll.tv_sec; end_time.tv_nsec = restart_block->poll.tv_nsec; to = &end_time; } ret = do_sys_poll(ufds, nfds, to); if (ret == -ERESTARTNOHAND) ret = set_restart_fn(restart_block, do_restart_poll); return ret; } SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, int, timeout_msecs) { struct timespec64 end_time, *to = NULL; int ret; if (timeout_msecs >= 0) { to = &end_time; poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC, NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC)); } ret = do_sys_poll(ufds, nfds, to); if (ret == -ERESTARTNOHAND) { struct restart_block *restart_block; restart_block = &current->restart_block; restart_block->poll.ufds = ufds; restart_block->poll.nfds = nfds; if (timeout_msecs >= 0) { restart_block->poll.tv_sec = end_time.tv_sec; restart_block->poll.tv_nsec = end_time.tv_nsec; restart_block->poll.has_timeout = 1; } else restart_block->poll.has_timeout = 0; ret = set_restart_fn(restart_block, do_restart_poll); } return ret; } SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, struct __kernel_timespec __user *, tsp, const sigset_t __user *, sigmask, size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_timespec64(&ts, tsp)) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); return poll_select_finish(&end_time, tsp, PT_TIMESPEC, ret); } #if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT) SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds, struct old_timespec32 __user *, tsp, const sigset_t __user *, sigmask, size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_old_timespec32(&ts, tsp)) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); return poll_select_finish(&end_time, tsp, PT_OLD_TIMESPEC, ret); } #endif #ifdef CONFIG_COMPAT #define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t)) /* * Ooo, nasty. We need here to frob 32-bit unsigned longs to * 64-bit unsigned longs. */ static int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, unsigned long *fdset) { if (ufdset) { return compat_get_bitmap(fdset, ufdset, nr); } else { zero_fd_set(nr, fdset); return 0; } } static int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, unsigned long *fdset) { if (!ufdset) return 0; return compat_put_bitmap(ufdset, fdset, nr); } /* * This is a virtual copy of sys_select from fs/select.c and probably * should be compared to it from time to time */ /* * We can actually return ERESTARTSYS instead of EINTR, but I'd * like to be certain this leads to no problems. So I return * EINTR just for safety. * * Update: ERESTARTSYS breaks at least the xview clock binary, so * I'm trying ERESTARTNOHAND which restart only when you want to. */ static int compat_core_sys_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, struct timespec64 *end_time) { fd_set_bits fds; void *bits; int size, max_fds, ret = -EINVAL; struct fdtable *fdt; long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; if (n < 0) goto out_nofds; /* max_fds can increase, so grab it once to avoid race */ rcu_read_lock(); fdt = files_fdtable(current->files); max_fds = fdt->max_fds; rcu_read_unlock(); if (n > max_fds) n = max_fds; /* * We need 6 bitmaps (in/out/ex for both incoming and outgoing), * since we used fdset we need to allocate memory in units of * long-words. */ size = FDS_BYTES(n); bits = stack_fds; if (size > sizeof(stack_fds) / 6) { bits = kmalloc_array(6, size, GFP_KERNEL); ret = -ENOMEM; if (!bits) goto out_nofds; } fds.in = (unsigned long *) bits; fds.out = (unsigned long *) (bits + size); fds.ex = (unsigned long *) (bits + 2*size); fds.res_in = (unsigned long *) (bits + 3*size); fds.res_out = (unsigned long *) (bits + 4*size); fds.res_ex = (unsigned long *) (bits + 5*size); if ((ret = compat_get_fd_set(n, inp, fds.in)) || (ret = compat_get_fd_set(n, outp, fds.out)) || (ret = compat_get_fd_set(n, exp, fds.ex))) goto out; zero_fd_set(n, fds.res_in); zero_fd_set(n, fds.res_out); zero_fd_set(n, fds.res_ex); ret = do_select(n, &fds, end_time); if (ret < 0) goto out; if (!ret) { ret = -ERESTARTNOHAND; if (signal_pending(current)) goto out; ret = 0; } if (compat_set_fd_set(n, inp, fds.res_in) || compat_set_fd_set(n, outp, fds.res_out) || compat_set_fd_set(n, exp, fds.res_ex)) ret = -EFAULT; out: if (bits != stack_fds) kfree(bits); out_nofds: return ret; } static int do_compat_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, struct old_timeval32 __user *tvp) { struct timespec64 end_time, *to = NULL; struct old_timeval32 tv; int ret; if (tvp) { if (copy_from_user(&tv, tvp, sizeof(tv))) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) return -EINVAL; } ret = compat_core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tvp, PT_OLD_TIMEVAL, ret); } COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct old_timeval32 __user *, tvp) { return do_compat_select(n, inp, outp, exp, tvp); } struct compat_sel_arg_struct { compat_ulong_t n; compat_uptr_t inp; compat_uptr_t outp; compat_uptr_t exp; compat_uptr_t tvp; }; COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg) { struct compat_sel_arg_struct a; if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; return do_compat_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp), compat_ptr(a.exp), compat_ptr(a.tvp)); } static long do_compat_pselect(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, void __user *tsp, compat_sigset_t __user *sigmask, compat_size_t sigsetsize, enum poll_time_type type) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { switch (type) { case PT_OLD_TIMESPEC: if (get_old_timespec32(&ts, tsp)) return -EFAULT; break; case PT_TIMESPEC: if (get_timespec64(&ts, tsp)) return -EFAULT; break; default: BUG(); } to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_compat_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = compat_core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tsp, type, ret); } struct compat_sigset_argpack { compat_uptr_t p; compat_size_t size; }; static inline int get_compat_sigset_argpack(struct compat_sigset_argpack *to, struct compat_sigset_argpack __user *from) { if (from) { if (!user_read_access_begin(from, sizeof(*from))) return -EFAULT; unsafe_get_user(to->p, &from->p, Efault); unsafe_get_user(to->size, &from->size, Efault); user_read_access_end(); } return 0; Efault: user_access_end(); return -EFAULT; } COMPAT_SYSCALL_DEFINE6(pselect6_time64, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct __kernel_timespec __user *, tsp, void __user *, sig) { struct compat_sigset_argpack x = {0, 0}; if (get_compat_sigset_argpack(&x, sig)) return -EFAULT; return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(x.p), x.size, PT_TIMESPEC); } #if defined(CONFIG_COMPAT_32BIT_TIME) COMPAT_SYSCALL_DEFINE6(pselect6_time32, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct old_timespec32 __user *, tsp, void __user *, sig) { struct compat_sigset_argpack x = {0, 0}; if (get_compat_sigset_argpack(&x, sig)) return -EFAULT; return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(x.p), x.size, PT_OLD_TIMESPEC); } #endif #if defined(CONFIG_COMPAT_32BIT_TIME) COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds, struct old_timespec32 __user *, tsp, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_old_timespec32(&ts, tsp)) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_compat_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); return poll_select_finish(&end_time, tsp, PT_OLD_TIMESPEC, ret); } #endif /* New compat syscall for 64 bit time_t*/ COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds, unsigned int, nfds, struct __kernel_timespec __user *, tsp, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_timespec64(&ts, tsp)) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_compat_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); return poll_select_finish(&end_time, tsp, PT_TIMESPEC, ret); } #endif
91 91 31 91 88 67 68 91 91 59 60 1 61 61 60 18 18 18 62 63 63 1 1 1 13 69 69 69 68 61 31 31 31 31 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * Copyright (c) 2018 Red Hat, Inc. * All rights reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_btree.h" #include "xfs_alloc_btree.h" #include "xfs_rmap_btree.h" #include "xfs_alloc.h" #include "xfs_ialloc.h" #include "xfs_rmap.h" #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_health.h" #include "xfs_error.h" #include "xfs_bmap.h" #include "xfs_defer.h" #include "xfs_log_format.h" #include "xfs_trans.h" #include "xfs_trace.h" #include "xfs_inode.h" #include "xfs_icache.h" /* * Passive reference counting access wrappers to the perag structures. If the * per-ag structure is to be freed, the freeing code is responsible for cleaning * up objects with passive references before freeing the structure. This is * things like cached buffers. */ struct xfs_perag * xfs_perag_get( struct xfs_mount *mp, xfs_agnumber_t agno) { struct xfs_perag *pag; rcu_read_lock(); pag = radix_tree_lookup(&mp->m_perag_tree, agno); if (pag) { trace_xfs_perag_get(pag, _RET_IP_); ASSERT(atomic_read(&pag->pag_ref) >= 0); atomic_inc(&pag->pag_ref); } rcu_read_unlock(); return pag; } /* * search from @first to find the next perag with the given tag set. */ struct xfs_perag * xfs_perag_get_tag( struct xfs_mount *mp, xfs_agnumber_t first, unsigned int tag) { struct xfs_perag *pag; int found; rcu_read_lock(); found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, (void **)&pag, first, 1, tag); if (found <= 0) { rcu_read_unlock(); return NULL; } trace_xfs_perag_get_tag(pag, _RET_IP_); atomic_inc(&pag->pag_ref); rcu_read_unlock(); return pag; } /* Get a passive reference to the given perag. */ struct xfs_perag * xfs_perag_hold( struct xfs_perag *pag) { ASSERT(atomic_read(&pag->pag_ref) > 0 || atomic_read(&pag->pag_active_ref) > 0); trace_xfs_perag_hold(pag, _RET_IP_); atomic_inc(&pag->pag_ref); return pag; } void xfs_perag_put( struct xfs_perag *pag) { trace_xfs_perag_put(pag, _RET_IP_); ASSERT(atomic_read(&pag->pag_ref) > 0); atomic_dec(&pag->pag_ref); } /* * Active references for perag structures. This is for short term access to the * per ag structures for walking trees or accessing state. If an AG is being * shrunk or is offline, then this will fail to find that AG and return NULL * instead. */ struct xfs_perag * xfs_perag_grab( struct xfs_mount *mp, xfs_agnumber_t agno) { struct xfs_perag *pag; rcu_read_lock(); pag = radix_tree_lookup(&mp->m_perag_tree, agno); if (pag) { trace_xfs_perag_grab(pag, _RET_IP_); if (!atomic_inc_not_zero(&pag->pag_active_ref)) pag = NULL; } rcu_read_unlock(); return pag; } /* * search from @first to find the next perag with the given tag set. */ struct xfs_perag * xfs_perag_grab_tag( struct xfs_mount *mp, xfs_agnumber_t first, int tag) { struct xfs_perag *pag; int found; rcu_read_lock(); found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, (void **)&pag, first, 1, tag); if (found <= 0) { rcu_read_unlock(); return NULL; } trace_xfs_perag_grab_tag(pag, _RET_IP_); if (!atomic_inc_not_zero(&pag->pag_active_ref)) pag = NULL; rcu_read_unlock(); return pag; } void xfs_perag_rele( struct xfs_perag *pag) { trace_xfs_perag_rele(pag, _RET_IP_); if (atomic_dec_and_test(&pag->pag_active_ref)) wake_up(&pag->pag_active_wq); } /* * xfs_initialize_perag_data * * Read in each per-ag structure so we can count up the number of * allocated inodes, free inodes and used filesystem blocks as this * information is no longer persistent in the superblock. Once we have * this information, write it into the in-core superblock structure. */ int xfs_initialize_perag_data( struct xfs_mount *mp, xfs_agnumber_t agcount) { xfs_agnumber_t index; struct xfs_perag *pag; struct xfs_sb *sbp = &mp->m_sb; uint64_t ifree = 0; uint64_t ialloc = 0; uint64_t bfree = 0; uint64_t bfreelst = 0; uint64_t btree = 0; uint64_t fdblocks; int error = 0; for (index = 0; index < agcount; index++) { /* * Read the AGF and AGI buffers to populate the per-ag * structures for us. */ pag = xfs_perag_get(mp, index); error = xfs_alloc_read_agf(pag, NULL, 0, NULL); if (!error) error = xfs_ialloc_read_agi(pag, NULL, NULL); if (error) { xfs_perag_put(pag); return error; } ifree += pag->pagi_freecount; ialloc += pag->pagi_count; bfree += pag->pagf_freeblks; bfreelst += pag->pagf_flcount; btree += pag->pagf_btreeblks; xfs_perag_put(pag); } fdblocks = bfree + bfreelst + btree; /* * If the new summary counts are obviously incorrect, fail the * mount operation because that implies the AGFs are also corrupt. * Clear FS_COUNTERS so that we don't unmount with a dirty log, which * will prevent xfs_repair from fixing anything. */ if (fdblocks > sbp->sb_dblocks || ifree > ialloc) { xfs_alert(mp, "AGF corruption. Please run xfs_repair."); error = -EFSCORRUPTED; goto out; } /* Overwrite incore superblock counters with just-read data */ spin_lock(&mp->m_sb_lock); sbp->sb_ifree = ifree; sbp->sb_icount = ialloc; sbp->sb_fdblocks = fdblocks; spin_unlock(&mp->m_sb_lock); xfs_reinit_percpu_counters(mp); out: xfs_fs_mark_healthy(mp, XFS_SICK_FS_COUNTERS); return error; } STATIC void __xfs_free_perag( struct rcu_head *head) { struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head); ASSERT(!delayed_work_pending(&pag->pag_blockgc_work)); kfree(pag); } /* * Free up the per-ag resources associated with the mount structure. */ void xfs_free_perag( struct xfs_mount *mp) { struct xfs_perag *pag; xfs_agnumber_t agno; for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { spin_lock(&mp->m_perag_lock); pag = radix_tree_delete(&mp->m_perag_tree, agno); spin_unlock(&mp->m_perag_lock); ASSERT(pag); XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0); xfs_defer_drain_free(&pag->pag_intents_drain); cancel_delayed_work_sync(&pag->pag_blockgc_work); xfs_buf_hash_destroy(pag); /* drop the mount's active reference */ xfs_perag_rele(pag); XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_active_ref) != 0); call_rcu(&pag->rcu_head, __xfs_free_perag); } } /* Find the size of the AG, in blocks. */ static xfs_agblock_t __xfs_ag_block_count( struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agnumber_t agcount, xfs_rfsblock_t dblocks) { ASSERT(agno < agcount); if (agno < agcount - 1) return mp->m_sb.sb_agblocks; return dblocks - (agno * mp->m_sb.sb_agblocks); } xfs_agblock_t xfs_ag_block_count( struct xfs_mount *mp, xfs_agnumber_t agno) { return __xfs_ag_block_count(mp, agno, mp->m_sb.sb_agcount, mp->m_sb.sb_dblocks); } /* Calculate the first and last possible inode number in an AG. */ static void __xfs_agino_range( struct xfs_mount *mp, xfs_agblock_t eoag, xfs_agino_t *first, xfs_agino_t *last) { xfs_agblock_t bno; /* * Calculate the first inode, which will be in the first * cluster-aligned block after the AGFL. */ bno = round_up(XFS_AGFL_BLOCK(mp) + 1, M_IGEO(mp)->cluster_align); *first = XFS_AGB_TO_AGINO(mp, bno); /* * Calculate the last inode, which will be at the end of the * last (aligned) cluster that can be allocated in the AG. */ bno = round_down(eoag, M_IGEO(mp)->cluster_align); *last = XFS_AGB_TO_AGINO(mp, bno) - 1; } void xfs_agino_range( struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t *first, xfs_agino_t *last) { return __xfs_agino_range(mp, xfs_ag_block_count(mp, agno), first, last); } /* * Free perag within the specified AG range, it is only used to free unused * perags under the error handling path. */ void xfs_free_unused_perag_range( struct xfs_mount *mp, xfs_agnumber_t agstart, xfs_agnumber_t agend) { struct xfs_perag *pag; xfs_agnumber_t index; for (index = agstart; index < agend; index++) { spin_lock(&mp->m_perag_lock); pag = radix_tree_delete(&mp->m_perag_tree, index); spin_unlock(&mp->m_perag_lock); if (!pag) break; xfs_buf_hash_destroy(pag); xfs_defer_drain_free(&pag->pag_intents_drain); kfree(pag); } } int xfs_initialize_perag( struct xfs_mount *mp, xfs_agnumber_t agcount, xfs_rfsblock_t dblocks, xfs_agnumber_t *maxagi) { struct xfs_perag *pag; xfs_agnumber_t index; xfs_agnumber_t first_initialised = NULLAGNUMBER; int error; /* * Walk the current per-ag tree so we don't try to initialise AGs * that already exist (growfs case). Allocate and insert all the * AGs we don't find ready for initialisation. */ for (index = 0; index < agcount; index++) { pag = xfs_perag_get(mp, index); if (pag) { xfs_perag_put(pag); continue; } pag = kzalloc(sizeof(*pag), GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!pag) { error = -ENOMEM; goto out_unwind_new_pags; } pag->pag_agno = index; pag->pag_mount = mp; error = radix_tree_preload(GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (error) goto out_free_pag; spin_lock(&mp->m_perag_lock); if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { WARN_ON_ONCE(1); spin_unlock(&mp->m_perag_lock); radix_tree_preload_end(); error = -EEXIST; goto out_free_pag; } spin_unlock(&mp->m_perag_lock); radix_tree_preload_end(); #ifdef __KERNEL__ /* Place kernel structure only init below this point. */ spin_lock_init(&pag->pag_ici_lock); spin_lock_init(&pag->pagb_lock); spin_lock_init(&pag->pag_state_lock); INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker); INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); xfs_defer_drain_init(&pag->pag_intents_drain); init_waitqueue_head(&pag->pagb_wait); init_waitqueue_head(&pag->pag_active_wq); pag->pagb_count = 0; pag->pagb_tree = RB_ROOT; #endif /* __KERNEL__ */ error = xfs_buf_hash_init(pag); if (error) goto out_remove_pag; /* Active ref owned by mount indicates AG is online. */ atomic_set(&pag->pag_active_ref, 1); /* first new pag is fully initialized */ if (first_initialised == NULLAGNUMBER) first_initialised = index; /* * Pre-calculated geometry */ pag->block_count = __xfs_ag_block_count(mp, index, agcount, dblocks); pag->min_block = XFS_AGFL_BLOCK(mp); __xfs_agino_range(mp, pag->block_count, &pag->agino_min, &pag->agino_max); } index = xfs_set_inode_alloc(mp, agcount); if (maxagi) *maxagi = index; mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp); return 0; out_remove_pag: xfs_defer_drain_free(&pag->pag_intents_drain); spin_lock(&mp->m_perag_lock); radix_tree_delete(&mp->m_perag_tree, index); spin_unlock(&mp->m_perag_lock); out_free_pag: kfree(pag); out_unwind_new_pags: /* unwind any prior newly initialized pags */ xfs_free_unused_perag_range(mp, first_initialised, agcount); return error; } static int xfs_get_aghdr_buf( struct xfs_mount *mp, xfs_daddr_t blkno, size_t numblks, struct xfs_buf **bpp, const struct xfs_buf_ops *ops) { struct xfs_buf *bp; int error; error = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, 0, &bp); if (error) return error; bp->b_maps[0].bm_bn = blkno; bp->b_ops = ops; *bpp = bp; return 0; } /* * Generic btree root block init function */ static void xfs_btroot_init( struct xfs_mount *mp, struct xfs_buf *bp, struct aghdr_init_data *id) { xfs_btree_init_block(mp, bp, id->type, 0, 0, id->agno); } /* Finish initializing a free space btree. */ static void xfs_freesp_init_recs( struct xfs_mount *mp, struct xfs_buf *bp, struct aghdr_init_data *id) { struct xfs_alloc_rec *arec; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks); if (xfs_ag_contains_log(mp, id->agno)) { struct xfs_alloc_rec *nrec; xfs_agblock_t start = XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart); ASSERT(start >= mp->m_ag_prealloc_blocks); if (start != mp->m_ag_prealloc_blocks) { /* * Modify first record to pad stripe align of log and * bump the record count. */ arec->ar_blockcount = cpu_to_be32(start - mp->m_ag_prealloc_blocks); be16_add_cpu(&block->bb_numrecs, 1); nrec = arec + 1; /* * Insert second record at start of internal log * which then gets trimmed. */ nrec->ar_startblock = cpu_to_be32( be32_to_cpu(arec->ar_startblock) + be32_to_cpu(arec->ar_blockcount)); arec = nrec; } /* * Change record start to after the internal log */ be32_add_cpu(&arec->ar_startblock, mp->m_sb.sb_logblocks); } /* * Calculate the block count of this record; if it is nonzero, * increment the record count. */ arec->ar_blockcount = cpu_to_be32(id->agsize - be32_to_cpu(arec->ar_startblock)); if (arec->ar_blockcount) be16_add_cpu(&block->bb_numrecs, 1); } /* * Alloc btree root block init functions */ static void xfs_bnoroot_init( struct xfs_mount *mp, struct xfs_buf *bp, struct aghdr_init_data *id) { xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 0, id->agno); xfs_freesp_init_recs(mp, bp, id); } static void xfs_cntroot_init( struct xfs_mount *mp, struct xfs_buf *bp, struct aghdr_init_data *id) { xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 0, id->agno); xfs_freesp_init_recs(mp, bp, id); } /* * Reverse map root block init */ static void xfs_rmaproot_init( struct xfs_mount *mp, struct xfs_buf *bp, struct aghdr_init_data *id) { struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_rmap_rec *rrec; xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 4, id->agno); /* * mark the AG header regions as static metadata The BNO * btree block is the first block after the headers, so * it's location defines the size of region the static * metadata consumes. * * Note: unlike mkfs, we never have to account for log * space when growing the data regions */ rrec = XFS_RMAP_REC_ADDR(block, 1); rrec->rm_startblock = 0; rrec->rm_blockcount = cpu_to_be32(XFS_BNO_BLOCK(mp)); rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_FS); rrec->rm_offset = 0; /* account freespace btree root blocks */ rrec = XFS_RMAP_REC_ADDR(block, 2); rrec->rm_startblock = cpu_to_be32(XFS_BNO_BLOCK(mp)); rrec->rm_blockcount = cpu_to_be32(2); rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG); rrec->rm_offset = 0; /* account inode btree root blocks */ rrec = XFS_RMAP_REC_ADDR(block, 3); rrec->rm_startblock = cpu_to_be32(XFS_IBT_BLOCK(mp)); rrec->rm_blockcount = cpu_to_be32(XFS_RMAP_BLOCK(mp) - XFS_IBT_BLOCK(mp)); rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_INOBT); rrec->rm_offset = 0; /* account for rmap btree root */ rrec = XFS_RMAP_REC_ADDR(block, 4); rrec->rm_startblock = cpu_to_be32(XFS_RMAP_BLOCK(mp)); rrec->rm_blockcount = cpu_to_be32(1); rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG); rrec->rm_offset = 0; /* account for refc btree root */ if (xfs_has_reflink(mp)) { rrec = XFS_RMAP_REC_ADDR(block, 5); rrec->rm_startblock = cpu_to_be32(xfs_refc_block(mp)); rrec->rm_blockcount = cpu_to_be32(1); rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_REFC); rrec->rm_offset = 0; be16_add_cpu(&block->bb_numrecs, 1); } /* account for the log space */ if (xfs_ag_contains_log(mp, id->agno)) { rrec = XFS_RMAP_REC_ADDR(block, be16_to_cpu(block->bb_numrecs) + 1); rrec->rm_startblock = cpu_to_be32( XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart)); rrec->rm_blockcount = cpu_to_be32(mp->m_sb.sb_logblocks); rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_LOG); rrec->rm_offset = 0; be16_add_cpu(&block->bb_numrecs, 1); } } /* * Initialise new secondary superblocks with the pre-grow geometry, but mark * them as "in progress" so we know they haven't yet been activated. This will * get cleared when the update with the new geometry information is done after * changes to the primary are committed. This isn't strictly necessary, but we * get it for free with the delayed buffer write lists and it means we can tell * if a grow operation didn't complete properly after the fact. */ static void xfs_sbblock_init( struct xfs_mount *mp, struct xfs_buf *bp, struct aghdr_init_data *id) { struct xfs_dsb *dsb = bp->b_addr; xfs_sb_to_disk(dsb, &mp->m_sb); dsb->sb_inprogress = 1; } static void xfs_agfblock_init( struct xfs_mount *mp, struct xfs_buf *bp, struct aghdr_init_data *id) { struct xfs_agf *agf = bp->b_addr; xfs_extlen_t tmpsize; agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION); agf->agf_seqno = cpu_to_be32(id->agno); agf->agf_length = cpu_to_be32(id->agsize); agf->agf_roots[XFS_BTNUM_BNOi] = cpu_to_be32(XFS_BNO_BLOCK(mp)); agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp)); agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1); agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1); if (xfs_has_rmapbt(mp)) { agf->agf_roots[XFS_BTNUM_RMAPi] = cpu_to_be32(XFS_RMAP_BLOCK(mp)); agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1); agf->agf_rmap_blocks = cpu_to_be32(1); } agf->agf_flfirst = cpu_to_be32(1); agf->agf_fllast = 0; agf->agf_flcount = 0; tmpsize = id->agsize - mp->m_ag_prealloc_blocks; agf->agf_freeblks = cpu_to_be32(tmpsize); agf->agf_longest = cpu_to_be32(tmpsize); if (xfs_has_crc(mp)) uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid); if (xfs_has_reflink(mp)) { agf->agf_refcount_root = cpu_to_be32( xfs_refc_block(mp)); agf->agf_refcount_level = cpu_to_be32(1); agf->agf_refcount_blocks = cpu_to_be32(1); } if (xfs_ag_contains_log(mp, id->agno)) { int64_t logblocks = mp->m_sb.sb_logblocks; be32_add_cpu(&agf->agf_freeblks, -logblocks); agf->agf_longest = cpu_to_be32(id->agsize - XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart) - logblocks); } } static void xfs_agflblock_init( struct xfs_mount *mp, struct xfs_buf *bp, struct aghdr_init_data *id) { struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp); __be32 *agfl_bno; int bucket; if (xfs_has_crc(mp)) { agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC); agfl->agfl_seqno = cpu_to_be32(id->agno); uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid); } agfl_bno = xfs_buf_to_agfl_bno(bp); for (bucket = 0; bucket < xfs_agfl_size(mp); bucket++) agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); } static void xfs_agiblock_init( struct xfs_mount *mp, struct xfs_buf *bp, struct aghdr_init_data *id) { struct xfs_agi *agi = bp->b_addr; int bucket; agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION); agi->agi_seqno = cpu_to_be32(id->agno); agi->agi_length = cpu_to_be32(id->agsize); agi->agi_count = 0; agi->agi_root = cpu_to_be32(XFS_IBT_BLOCK(mp)); agi->agi_level = cpu_to_be32(1); agi->agi_freecount = 0; agi->agi_newino = cpu_to_be32(NULLAGINO); agi->agi_dirino = cpu_to_be32(NULLAGINO); if (xfs_has_crc(mp)) uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid); if (xfs_has_finobt(mp)) { agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp)); agi->agi_free_level = cpu_to_be32(1); } for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); if (xfs_has_inobtcounts(mp)) { agi->agi_iblocks = cpu_to_be32(1); if (xfs_has_finobt(mp)) agi->agi_fblocks = cpu_to_be32(1); } } typedef void (*aghdr_init_work_f)(struct xfs_mount *mp, struct xfs_buf *bp, struct aghdr_init_data *id); static int xfs_ag_init_hdr( struct xfs_mount *mp, struct aghdr_init_data *id, aghdr_init_work_f work, const struct xfs_buf_ops *ops) { struct xfs_buf *bp; int error; error = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, &bp, ops); if (error) return error; (*work)(mp, bp, id); xfs_buf_delwri_queue(bp, &id->buffer_list); xfs_buf_relse(bp); return 0; } struct xfs_aghdr_grow_data { xfs_daddr_t daddr; size_t numblks; const struct xfs_buf_ops *ops; aghdr_init_work_f work; xfs_btnum_t type; bool need_init; }; /* * Prepare new AG headers to be written to disk. We use uncached buffers here, * as it is assumed these new AG headers are currently beyond the currently * valid filesystem address space. Using cached buffers would trip over EOFS * corruption detection alogrithms in the buffer cache lookup routines. * * This is a non-transactional function, but the prepared buffers are added to a * delayed write buffer list supplied by the caller so they can submit them to * disk and wait on them as required. */ int xfs_ag_init_headers( struct xfs_mount *mp, struct aghdr_init_data *id) { struct xfs_aghdr_grow_data aghdr_data[] = { { /* SB */ .daddr = XFS_AG_DADDR(mp, id->agno, XFS_SB_DADDR), .numblks = XFS_FSS_TO_BB(mp, 1), .ops = &xfs_sb_buf_ops, .work = &xfs_sbblock_init, .need_init = true }, { /* AGF */ .daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGF_DADDR(mp)), .numblks = XFS_FSS_TO_BB(mp, 1), .ops = &xfs_agf_buf_ops, .work = &xfs_agfblock_init, .need_init = true }, { /* AGFL */ .daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGFL_DADDR(mp)), .numblks = XFS_FSS_TO_BB(mp, 1), .ops = &xfs_agfl_buf_ops, .work = &xfs_agflblock_init, .need_init = true }, { /* AGI */ .daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGI_DADDR(mp)), .numblks = XFS_FSS_TO_BB(mp, 1), .ops = &xfs_agi_buf_ops, .work = &xfs_agiblock_init, .need_init = true }, { /* BNO root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_BNO_BLOCK(mp)), .numblks = BTOBB(mp->m_sb.sb_blocksize), .ops = &xfs_bnobt_buf_ops, .work = &xfs_bnoroot_init, .need_init = true }, { /* CNT root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp)), .numblks = BTOBB(mp->m_sb.sb_blocksize), .ops = &xfs_cntbt_buf_ops, .work = &xfs_cntroot_init, .need_init = true }, { /* INO root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_IBT_BLOCK(mp)), .numblks = BTOBB(mp->m_sb.sb_blocksize), .ops = &xfs_inobt_buf_ops, .work = &xfs_btroot_init, .type = XFS_BTNUM_INO, .need_init = true }, { /* FINO root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_FIBT_BLOCK(mp)), .numblks = BTOBB(mp->m_sb.sb_blocksize), .ops = &xfs_finobt_buf_ops, .work = &xfs_btroot_init, .type = XFS_BTNUM_FINO, .need_init = xfs_has_finobt(mp) }, { /* RMAP root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_RMAP_BLOCK(mp)), .numblks = BTOBB(mp->m_sb.sb_blocksize), .ops = &xfs_rmapbt_buf_ops, .work = &xfs_rmaproot_init, .need_init = xfs_has_rmapbt(mp) }, { /* REFC root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, xfs_refc_block(mp)), .numblks = BTOBB(mp->m_sb.sb_blocksize), .ops = &xfs_refcountbt_buf_ops, .work = &xfs_btroot_init, .type = XFS_BTNUM_REFC, .need_init = xfs_has_reflink(mp) }, { /* NULL terminating block */ .daddr = XFS_BUF_DADDR_NULL, } }; struct xfs_aghdr_grow_data *dp; int error = 0; /* Account for AG free space in new AG */ id->nfree += id->agsize - mp->m_ag_prealloc_blocks; for (dp = &aghdr_data[0]; dp->daddr != XFS_BUF_DADDR_NULL; dp++) { if (!dp->need_init) continue; id->daddr = dp->daddr; id->numblks = dp->numblks; id->type = dp->type; error = xfs_ag_init_hdr(mp, id, dp->work, dp->ops); if (error) break; } return error; } int xfs_ag_shrink_space( struct xfs_perag *pag, struct xfs_trans **tpp, xfs_extlen_t delta) { struct xfs_mount *mp = pag->pag_mount; struct xfs_alloc_arg args = { .tp = *tpp, .mp = mp, .pag = pag, .minlen = delta, .maxlen = delta, .oinfo = XFS_RMAP_OINFO_SKIP_UPDATE, .resv = XFS_AG_RESV_NONE, .prod = 1 }; struct xfs_buf *agibp, *agfbp; struct xfs_agi *agi; struct xfs_agf *agf; xfs_agblock_t aglen; int error, err2; ASSERT(pag->pag_agno == mp->m_sb.sb_agcount - 1); error = xfs_ialloc_read_agi(pag, *tpp, &agibp); if (error) return error; agi = agibp->b_addr; error = xfs_alloc_read_agf(pag, *tpp, 0, &agfbp); if (error) return error; agf = agfbp->b_addr; aglen = be32_to_cpu(agi->agi_length); /* some extra paranoid checks before we shrink the ag */ if (XFS_IS_CORRUPT(mp, agf->agf_length != agi->agi_length)) return -EFSCORRUPTED; if (delta >= aglen) return -EINVAL; /* * Make sure that the last inode cluster cannot overlap with the new * end of the AG, even if it's sparse. */ error = xfs_ialloc_check_shrink(pag, *tpp, agibp, aglen - delta); if (error) return error; /* * Disable perag reservations so it doesn't cause the allocation request * to fail. We'll reestablish reservation before we return. */ error = xfs_ag_resv_free(pag); if (error) return error; /* internal log shouldn't also show up in the free space btrees */ error = xfs_alloc_vextent_exact_bno(&args, XFS_AGB_TO_FSB(mp, pag->pag_agno, aglen - delta)); if (!error && args.agbno == NULLAGBLOCK) error = -ENOSPC; if (error) { /* * if extent allocation fails, need to roll the transaction to * ensure that the AGFL fixup has been committed anyway. */ xfs_trans_bhold(*tpp, agfbp); err2 = xfs_trans_roll(tpp); if (err2) return err2; xfs_trans_bjoin(*tpp, agfbp); goto resv_init_out; } /* * if successfully deleted from freespace btrees, need to confirm * per-AG reservation works as expected. */ be32_add_cpu(&agi->agi_length, -delta); be32_add_cpu(&agf->agf_length, -delta); err2 = xfs_ag_resv_init(pag, *tpp); if (err2) { be32_add_cpu(&agi->agi_length, delta); be32_add_cpu(&agf->agf_length, delta); if (err2 != -ENOSPC) goto resv_err; err2 = xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, XFS_AG_RESV_NONE, true); if (err2) goto resv_err; /* * Roll the transaction before trying to re-init the per-ag * reservation. The new transaction is clean so it will cancel * without any side effects. */ error = xfs_defer_finish(tpp); if (error) return error; error = -ENOSPC; goto resv_init_out; } /* Update perag geometry */ pag->block_count -= delta; __xfs_agino_range(pag->pag_mount, pag->block_count, &pag->agino_min, &pag->agino_max); xfs_ialloc_log_agi(*tpp, agibp, XFS_AGI_LENGTH); xfs_alloc_log_agf(*tpp, agfbp, XFS_AGF_LENGTH); return 0; resv_init_out: err2 = xfs_ag_resv_init(pag, *tpp); if (!err2) return error; resv_err: xfs_warn(mp, "Error %d reserving per-AG metadata reserve pool.", err2); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); return err2; } /* * Extent the AG indicated by the @id by the length passed in */ int xfs_ag_extend_space( struct xfs_perag *pag, struct xfs_trans *tp, xfs_extlen_t len) { struct xfs_buf *bp; struct xfs_agi *agi; struct xfs_agf *agf; int error; ASSERT(pag->pag_agno == pag->pag_mount->m_sb.sb_agcount - 1); error = xfs_ialloc_read_agi(pag, tp, &bp); if (error) return error; agi = bp->b_addr; be32_add_cpu(&agi->agi_length, len); xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH); /* * Change agf length. */ error = xfs_alloc_read_agf(pag, tp, 0, &bp); if (error) return error; agf = bp->b_addr; be32_add_cpu(&agf->agf_length, len); ASSERT(agf->agf_length == agi->agi_length); xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH); /* * Free the new space. * * XFS_RMAP_OINFO_SKIP_UPDATE is used here to tell the rmap btree that * this doesn't actually exist in the rmap btree. */ error = xfs_rmap_free(tp, bp, pag, be32_to_cpu(agf->agf_length) - len, len, &XFS_RMAP_OINFO_SKIP_UPDATE); if (error) return error; error = xfs_free_extent(tp, pag, be32_to_cpu(agf->agf_length) - len, len, &XFS_RMAP_OINFO_SKIP_UPDATE, XFS_AG_RESV_NONE); if (error) return error; /* Update perag geometry */ pag->block_count = be32_to_cpu(agf->agf_length); __xfs_agino_range(pag->pag_mount, pag->block_count, &pag->agino_min, &pag->agino_max); return 0; } /* Retrieve AG geometry. */ int xfs_ag_get_geometry( struct xfs_perag *pag, struct xfs_ag_geometry *ageo) { struct xfs_buf *agi_bp; struct xfs_buf *agf_bp; struct xfs_agi *agi; struct xfs_agf *agf; unsigned int freeblks; int error; /* Lock the AG headers. */ error = xfs_ialloc_read_agi(pag, NULL, &agi_bp); if (error) return error; error = xfs_alloc_read_agf(pag, NULL, 0, &agf_bp); if (error) goto out_agi; /* Fill out form. */ memset(ageo, 0, sizeof(*ageo)); ageo->ag_number = pag->pag_agno; agi = agi_bp->b_addr; ageo->ag_icount = be32_to_cpu(agi->agi_count); ageo->ag_ifree = be32_to_cpu(agi->agi_freecount); agf = agf_bp->b_addr; ageo->ag_length = be32_to_cpu(agf->agf_length); freeblks = pag->pagf_freeblks + pag->pagf_flcount + pag->pagf_btreeblks - xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE); ageo->ag_freeblks = freeblks; xfs_ag_geom_health(pag, ageo); /* Release resources. */ xfs_buf_relse(agf_bp); out_agi: xfs_buf_relse(agi_bp); return error; }
3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 /* * linux/fs/nls/mac-romanian.c * * Charset macromanian translation tables. * Generated automatically from the Unicode and charset * tables from the Unicode Organization (www.unicode.org). * The Unicode to charset table has only exact mappings. */ /* * COPYRIGHT AND PERMISSION NOTICE * * Copyright 1991-2012 Unicode, Inc. All rights reserved. Distributed under * the Terms of Use in http://www.unicode.org/copyright.html. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of the Unicode data files and any associated documentation (the "Data * Files") or Unicode software and any associated documentation (the * "Software") to deal in the Data Files or Software without restriction, * including without limitation the rights to use, copy, modify, merge, * publish, distribute, and/or sell copies of the Data Files or Software, and * to permit persons to whom the Data Files or Software are furnished to do * so, provided that (a) the above copyright notice(s) and this permission * notice appear with all copies of the Data Files or Software, (b) both the * above copyright notice(s) and this permission notice appear in associated * documentation, and (c) there is clear notice in each modified Data File or * in the Software as well as in the documentation associated with the Data * File(s) or Software that the data or software has been modified. * * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF * THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR * PERFORMANCE OF THE DATA FILES OR SOFTWARE. * * Except as contained in this notice, the name of a copyright holder shall * not be used in advertising or otherwise to promote the sale, use or other * dealings in these Data Files or Software without prior written * authorization of the copyright holder. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00 */ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10 */ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20 */ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30 */ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40 */ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50 */ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60 */ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70 */ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80 */ 0x00c4, 0x00c5, 0x00c7, 0x00c9, 0x00d1, 0x00d6, 0x00dc, 0x00e1, 0x00e0, 0x00e2, 0x00e4, 0x00e3, 0x00e5, 0x00e7, 0x00e9, 0x00e8, /* 0x90 */ 0x00ea, 0x00eb, 0x00ed, 0x00ec, 0x00ee, 0x00ef, 0x00f1, 0x00f3, 0x00f2, 0x00f4, 0x00f6, 0x00f5, 0x00fa, 0x00f9, 0x00fb, 0x00fc, /* 0xa0 */ 0x2020, 0x00b0, 0x00a2, 0x00a3, 0x00a7, 0x2022, 0x00b6, 0x00df, 0x00ae, 0x00a9, 0x2122, 0x00b4, 0x00a8, 0x2260, 0x0102, 0x0218, /* 0xb0 */ 0x221e, 0x00b1, 0x2264, 0x2265, 0x00a5, 0x00b5, 0x2202, 0x2211, 0x220f, 0x03c0, 0x222b, 0x00aa, 0x00ba, 0x03a9, 0x0103, 0x0219, /* 0xc0 */ 0x00bf, 0x00a1, 0x00ac, 0x221a, 0x0192, 0x2248, 0x2206, 0x00ab, 0x00bb, 0x2026, 0x00a0, 0x00c0, 0x00c3, 0x00d5, 0x0152, 0x0153, /* 0xd0 */ 0x2013, 0x2014, 0x201c, 0x201d, 0x2018, 0x2019, 0x00f7, 0x25ca, 0x00ff, 0x0178, 0x2044, 0x20ac, 0x2039, 0x203a, 0x021a, 0x021b, /* 0xe0 */ 0x2021, 0x00b7, 0x201a, 0x201e, 0x2030, 0x00c2, 0x00ca, 0x00c1, 0x00cb, 0x00c8, 0x00cd, 0x00ce, 0x00cf, 0x00cc, 0x00d3, 0x00d4, /* 0xf0 */ 0xf8ff, 0x00d2, 0x00da, 0x00db, 0x00d9, 0x0131, 0x02c6, 0x02dc, 0x00af, 0x02d8, 0x02d9, 0x02da, 0x00b8, 0x02dd, 0x02db, 0x02c7, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xca, 0xc1, 0xa2, 0xa3, 0x00, 0xb4, 0x00, 0xa4, /* 0xa0-0xa7 */ 0xac, 0xa9, 0xbb, 0xc7, 0xc2, 0x00, 0xa8, 0xf8, /* 0xa8-0xaf */ 0xa1, 0xb1, 0x00, 0x00, 0xab, 0xb5, 0xa6, 0xe1, /* 0xb0-0xb7 */ 0xfc, 0x00, 0xbc, 0xc8, 0x00, 0x00, 0x00, 0xc0, /* 0xb8-0xbf */ 0xcb, 0xe7, 0xe5, 0xcc, 0x80, 0x81, 0x00, 0x82, /* 0xc0-0xc7 */ 0xe9, 0x83, 0xe6, 0xe8, 0xed, 0xea, 0xeb, 0xec, /* 0xc8-0xcf */ 0x00, 0x84, 0xf1, 0xee, 0xef, 0xcd, 0x85, 0x00, /* 0xd0-0xd7 */ 0x00, 0xf4, 0xf2, 0xf3, 0x86, 0x00, 0x00, 0xa7, /* 0xd8-0xdf */ 0x88, 0x87, 0x89, 0x8b, 0x8a, 0x8c, 0x00, 0x8d, /* 0xe0-0xe7 */ 0x8f, 0x8e, 0x90, 0x91, 0x93, 0x92, 0x94, 0x95, /* 0xe8-0xef */ 0x00, 0x96, 0x98, 0x97, 0x99, 0x9b, 0x9a, 0xd6, /* 0xf0-0xf7 */ 0x00, 0x9d, 0x9c, 0x9e, 0x9f, 0x00, 0x00, 0xd8, /* 0xf8-0xff */ }; static const unsigned char page01[256] = { 0x00, 0x00, 0xae, 0xbe, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0xce, 0xcf, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0xd9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page02[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0xaf, 0xbf, 0xde, 0xdf, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, 0xff, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0xf9, 0xfa, 0xfb, 0xfe, 0xf7, 0xfd, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page03[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0xbd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0xb9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page20[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0xd0, 0xd1, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0xd4, 0xd5, 0xe2, 0x00, 0xd2, 0xd3, 0xe3, 0x00, /* 0x18-0x1f */ 0xa0, 0xe0, 0xa5, 0x00, 0x00, 0x00, 0xc9, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0xe4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0xdc, 0xdd, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0xdb, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page21[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0xaa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page22[256] = { 0x00, 0x00, 0xb6, 0x00, 0x00, 0x00, 0xc6, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb8, /* 0x08-0x0f */ 0x00, 0xb7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, 0xb0, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0xba, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0xc5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0xad, 0x00, 0x00, 0x00, 0xb2, 0xb3, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page25[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0xd7, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char pagef8[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, /* 0xf8-0xff */ }; static const unsigned char *const page_uni2charset[256] = { page00, page01, page02, page03, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, page20, page21, page22, NULL, NULL, page25, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, pagef8, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; static const unsigned char charset2lower[256] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "macromanian", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_macromanian(void) { return register_nls(&table); } static void __exit exit_nls_macromanian(void) { unregister_nls(&table); } module_init(init_nls_macromanian) module_exit(exit_nls_macromanian) MODULE_LICENSE("Dual BSD/GPL");
1 1 2 1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 // SPDX-License-Identifier: GPL-2.0-only /* * debugfs code for HSR & PRP * Copyright (C) 2019 Texas Instruments Incorporated * * Author(s): * Murali Karicheri <m-karicheri2@ti.com> */ #include <linux/module.h> #include <linux/errno.h> #include <linux/debugfs.h> #include "hsr_main.h" #include "hsr_framereg.h" static struct dentry *hsr_debugfs_root_dir; /* hsr_node_table_show - Formats and prints node_table entries */ static int hsr_node_table_show(struct seq_file *sfp, void *data) { struct hsr_priv *priv = (struct hsr_priv *)sfp->private; struct hsr_node *node; seq_printf(sfp, "Node Table entries for (%s) device\n", (priv->prot_version == PRP_V1 ? "PRP" : "HSR")); seq_puts(sfp, "MAC-Address-A, MAC-Address-B, time_in[A], "); seq_puts(sfp, "time_in[B], Address-B port, "); if (priv->prot_version == PRP_V1) seq_puts(sfp, "SAN-A, SAN-B, DAN-P\n"); else seq_puts(sfp, "DAN-H\n"); rcu_read_lock(); list_for_each_entry_rcu(node, &priv->node_db, mac_list) { /* skip self node */ if (hsr_addr_is_self(priv, node->macaddress_A)) continue; seq_printf(sfp, "%pM ", &node->macaddress_A[0]); seq_printf(sfp, "%pM ", &node->macaddress_B[0]); seq_printf(sfp, "%10lx, ", node->time_in[HSR_PT_SLAVE_A]); seq_printf(sfp, "%10lx, ", node->time_in[HSR_PT_SLAVE_B]); seq_printf(sfp, "%14x, ", node->addr_B_port); if (priv->prot_version == PRP_V1) seq_printf(sfp, "%5x, %5x, %5x\n", node->san_a, node->san_b, (node->san_a == 0 && node->san_b == 0)); else seq_printf(sfp, "%5x\n", 1); } rcu_read_unlock(); return 0; } DEFINE_SHOW_ATTRIBUTE(hsr_node_table); void hsr_debugfs_rename(struct net_device *dev) { struct hsr_priv *priv = netdev_priv(dev); struct dentry *d; d = debugfs_rename(hsr_debugfs_root_dir, priv->node_tbl_root, hsr_debugfs_root_dir, dev->name); if (IS_ERR(d)) netdev_warn(dev, "failed to rename\n"); else priv->node_tbl_root = d; } /* hsr_debugfs_init - create hsr node_table file for dumping * the node table * * Description: * When debugfs is configured this routine sets up the node_table file per * hsr device for dumping the node_table entries */ void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev) { struct dentry *de = NULL; de = debugfs_create_dir(hsr_dev->name, hsr_debugfs_root_dir); if (IS_ERR(de)) { pr_err("Cannot create hsr debugfs directory\n"); return; } priv->node_tbl_root = de; de = debugfs_create_file("node_table", S_IFREG | 0444, priv->node_tbl_root, priv, &hsr_node_table_fops); if (IS_ERR(de)) { pr_err("Cannot create hsr node_table file\n"); debugfs_remove(priv->node_tbl_root); priv->node_tbl_root = NULL; return; } } /* hsr_debugfs_term - Tear down debugfs intrastructure * * Description: * When Debugfs is configured this routine removes debugfs file system * elements that are specific to hsr */ void hsr_debugfs_term(struct hsr_priv *priv) { debugfs_remove_recursive(priv->node_tbl_root); priv->node_tbl_root = NULL; } void hsr_debugfs_create_root(void) { hsr_debugfs_root_dir = debugfs_create_dir("hsr", NULL); if (IS_ERR(hsr_debugfs_root_dir)) { pr_err("Cannot create hsr debugfs root directory\n"); hsr_debugfs_root_dir = NULL; } } void hsr_debugfs_remove_root(void) { /* debugfs_remove() internally checks NULL and ERROR */ debugfs_remove(hsr_debugfs_root_dir); }
23 23 23 23 23 23 23 23 23 23 23 4 4 1803 1379 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 // SPDX-License-Identifier: GPL-2.0-only /* * mm/percpu-vm.c - vmalloc area based chunk allocation * * Copyright (C) 2010 SUSE Linux Products GmbH * Copyright (C) 2010 Tejun Heo <tj@kernel.org> * * Chunks are mapped into vmalloc areas and populated page by page. * This is the default chunk allocator. */ #include "internal.h" static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, unsigned int cpu, int page_idx) { /* must not be used on pre-mapped chunk */ WARN_ON(chunk->immutable); return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx)); } /** * pcpu_get_pages - get temp pages array * * Returns pointer to array of pointers to struct page which can be indexed * with pcpu_page_idx(). Note that there is only one array and accesses * should be serialized by pcpu_alloc_mutex. * * RETURNS: * Pointer to temp pages array on success. */ static struct page **pcpu_get_pages(void) { static struct page **pages; size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); lockdep_assert_held(&pcpu_alloc_mutex); if (!pages) pages = pcpu_mem_zalloc(pages_size, GFP_KERNEL); return pages; } /** * pcpu_free_pages - free pages which were allocated for @chunk * @chunk: chunk pages were allocated for * @pages: array of pages to be freed, indexed by pcpu_page_idx() * @page_start: page index of the first page to be freed * @page_end: page index of the last page to be freed + 1 * * Free pages [@page_start and @page_end) in @pages for all units. * The pages were allocated for @chunk. */ static void pcpu_free_pages(struct pcpu_chunk *chunk, struct page **pages, int page_start, int page_end) { unsigned int cpu; int i; for_each_possible_cpu(cpu) { for (i = page_start; i < page_end; i++) { struct page *page = pages[pcpu_page_idx(cpu, i)]; if (page) __free_page(page); } } } /** * pcpu_alloc_pages - allocates pages for @chunk * @chunk: target chunk * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() * @page_start: page index of the first page to be allocated * @page_end: page index of the last page to be allocated + 1 * @gfp: allocation flags passed to the underlying allocator * * Allocate pages [@page_start,@page_end) into @pages for all units. * The allocation is for @chunk. Percpu core doesn't care about the * content of @pages and will pass it verbatim to pcpu_map_pages(). */ static int pcpu_alloc_pages(struct pcpu_chunk *chunk, struct page **pages, int page_start, int page_end, gfp_t gfp) { unsigned int cpu, tcpu; int i; gfp |= __GFP_HIGHMEM; for_each_possible_cpu(cpu) { for (i = page_start; i < page_end; i++) { struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0); if (!*pagep) goto err; } } return 0; err: while (--i >= page_start) __free_page(pages[pcpu_page_idx(cpu, i)]); for_each_possible_cpu(tcpu) { if (tcpu == cpu) break; for (i = page_start; i < page_end; i++) __free_page(pages[pcpu_page_idx(tcpu, i)]); } return -ENOMEM; } /** * pcpu_pre_unmap_flush - flush cache prior to unmapping * @chunk: chunk the regions to be flushed belongs to * @page_start: page index of the first page to be flushed * @page_end: page index of the last page to be flushed + 1 * * Pages in [@page_start,@page_end) of @chunk are about to be * unmapped. Flush cache. As each flushing trial can be very * expensive, issue flush on the whole region at once rather than * doing it for each cpu. This could be an overkill but is more * scalable. */ static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { flush_cache_vunmap( pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); } static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) { vunmap_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT)); } /** * pcpu_unmap_pages - unmap pages out of a pcpu_chunk * @chunk: chunk of interest * @pages: pages array which can be used to pass information to free * @page_start: page index of the first page to unmap * @page_end: page index of the last page to unmap + 1 * * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. * Corresponding elements in @pages were cleared by the caller and can * be used to carry information to pcpu_free_pages() which will be * called after all unmaps are finished. The caller should call * proper pre/post flush functions. */ static void pcpu_unmap_pages(struct pcpu_chunk *chunk, struct page **pages, int page_start, int page_end) { unsigned int cpu; int i; for_each_possible_cpu(cpu) { for (i = page_start; i < page_end; i++) { struct page *page; page = pcpu_chunk_page(chunk, cpu, i); WARN_ON(!page); pages[pcpu_page_idx(cpu, i)] = page; } __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), page_end - page_start); } } /** * pcpu_post_unmap_tlb_flush - flush TLB after unmapping * @chunk: pcpu_chunk the regions to be flushed belong to * @page_start: page index of the first page to be flushed * @page_end: page index of the last page to be flushed + 1 * * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush * TLB for the regions. This can be skipped if the area is to be * returned to vmalloc as vmalloc will handle TLB flushing lazily. * * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once * for the whole region. */ static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { flush_tlb_kernel_range( pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); } static int __pcpu_map_pages(unsigned long addr, struct page **pages, int nr_pages) { return vmap_pages_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT), PAGE_KERNEL, pages, PAGE_SHIFT); } /** * pcpu_map_pages - map pages into a pcpu_chunk * @chunk: chunk of interest * @pages: pages array containing pages to be mapped * @page_start: page index of the first page to map * @page_end: page index of the last page to map + 1 * * For each cpu, map pages [@page_start,@page_end) into @chunk. The * caller is responsible for calling pcpu_post_map_flush() after all * mappings are complete. * * This function is responsible for setting up whatever is necessary for * reverse lookup (addr -> chunk). */ static int pcpu_map_pages(struct pcpu_chunk *chunk, struct page **pages, int page_start, int page_end) { unsigned int cpu, tcpu; int i, err; for_each_possible_cpu(cpu) { err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start), &pages[pcpu_page_idx(cpu, page_start)], page_end - page_start); if (err < 0) goto err; for (i = page_start; i < page_end; i++) pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], chunk); } return 0; err: for_each_possible_cpu(tcpu) { if (tcpu == cpu) break; __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start), page_end - page_start); } pcpu_post_unmap_tlb_flush(chunk, page_start, page_end); return err; } /** * pcpu_post_map_flush - flush cache after mapping * @chunk: pcpu_chunk the regions to be flushed belong to * @page_start: page index of the first page to be flushed * @page_end: page index of the last page to be flushed + 1 * * Pages [@page_start,@page_end) of @chunk have been mapped. Flush * cache. * * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once * for the whole region. */ static void pcpu_post_map_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { flush_cache_vmap( pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); } /** * pcpu_populate_chunk - populate and map an area of a pcpu_chunk * @chunk: chunk of interest * @page_start: the start page * @page_end: the end page * @gfp: allocation flags passed to the underlying memory allocator * * For each cpu, populate and map pages [@page_start,@page_end) into * @chunk. * * CONTEXT: * pcpu_alloc_mutex, does GFP_KERNEL allocation. */ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int page_start, int page_end, gfp_t gfp) { struct page **pages; pages = pcpu_get_pages(); if (!pages) return -ENOMEM; if (pcpu_alloc_pages(chunk, pages, page_start, page_end, gfp)) return -ENOMEM; if (pcpu_map_pages(chunk, pages, page_start, page_end)) { pcpu_free_pages(chunk, pages, page_start, page_end); return -ENOMEM; } pcpu_post_map_flush(chunk, page_start, page_end); return 0; } /** * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk * @chunk: chunk to depopulate * @page_start: the start page * @page_end: the end page * * For each cpu, depopulate and unmap pages [@page_start,@page_end) * from @chunk. * * Caller is required to call pcpu_post_unmap_tlb_flush() if not returning the * region back to vmalloc() which will lazily flush the tlb. * * CONTEXT: * pcpu_alloc_mutex. */ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int page_start, int page_end) { struct page **pages; /* * If control reaches here, there must have been at least one * successful population attempt so the temp pages array must * be available now. */ pages = pcpu_get_pages(); BUG_ON(!pages); /* unmap and free */ pcpu_pre_unmap_flush(chunk, page_start, page_end); pcpu_unmap_pages(chunk, pages, page_start, page_end); pcpu_free_pages(chunk, pages, page_start, page_end); } static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp) { struct pcpu_chunk *chunk; struct vm_struct **vms; chunk = pcpu_alloc_chunk(gfp); if (!chunk) return NULL; vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, pcpu_nr_groups, pcpu_atom_size); if (!vms) { pcpu_free_chunk(chunk); return NULL; } chunk->data = vms; chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0]; pcpu_stats_chunk_alloc(); trace_percpu_create_chunk(chunk->base_addr); return chunk; } static void pcpu_destroy_chunk(struct pcpu_chunk *chunk) { if (!chunk) return; pcpu_stats_chunk_dealloc(); trace_percpu_destroy_chunk(chunk->base_addr); if (chunk->data) pcpu_free_vm_areas(chunk->data, pcpu_nr_groups); pcpu_free_chunk(chunk); } static struct page *pcpu_addr_to_page(void *addr) { return vmalloc_to_page(addr); } static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai) { /* no extra restriction */ return 0; } /** * pcpu_should_reclaim_chunk - determine if a chunk should go into reclaim * @chunk: chunk of interest * * This is the entry point for percpu reclaim. If a chunk qualifies, it is then * isolated and managed in separate lists at the back of pcpu_slot: sidelined * and to_depopulate respectively. The to_depopulate list holds chunks slated * for depopulation. They no longer contribute to pcpu_nr_empty_pop_pages once * they are on this list. Once depopulated, they are moved onto the sidelined * list which enables them to be pulled back in for allocation if no other chunk * can suffice the allocation. */ static bool pcpu_should_reclaim_chunk(struct pcpu_chunk *chunk) { /* do not reclaim either the first chunk or reserved chunk */ if (chunk == pcpu_first_chunk || chunk == pcpu_reserved_chunk) return false; /* * If it is isolated, it may be on the sidelined list so move it back to * the to_depopulate list. If we hit at least 1/4 pages empty pages AND * there is no system-wide shortage of empty pages aside from this * chunk, move it to the to_depopulate list. */ return ((chunk->isolated && chunk->nr_empty_pop_pages) || (pcpu_nr_empty_pop_pages > (PCPU_EMPTY_POP_PAGES_HIGH + chunk->nr_empty_pop_pages) && chunk->nr_empty_pop_pages >= chunk->nr_pages / 4)); }
877 391 44 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the TCP protocol. * * Version: @(#)tcp.h 1.0.2 04/28/93 * * Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _LINUX_TCP_H #define _LINUX_TCP_H #include <linux/skbuff.h> #include <linux/win_minmax.h> #include <net/sock.h> #include <net/inet_connection_sock.h> #include <net/inet_timewait_sock.h> #include <uapi/linux/tcp.h> static inline struct tcphdr *tcp_hdr(const struct sk_buff *skb) { return (struct tcphdr *)skb_transport_header(skb); } static inline unsigned int __tcp_hdrlen(const struct tcphdr *th) { return th->doff * 4; } static inline unsigned int tcp_hdrlen(const struct sk_buff *skb) { return __tcp_hdrlen(tcp_hdr(skb)); } static inline struct tcphdr *inner_tcp_hdr(const struct sk_buff *skb) { return (struct tcphdr *)skb_inner_transport_header(skb); } static inline unsigned int inner_tcp_hdrlen(const struct sk_buff *skb) { return inner_tcp_hdr(skb)->doff * 4; } /** * skb_tcp_all_headers - Returns size of all headers for a TCP packet * @skb: buffer * * Used in TX path, for a packet known to be a TCP one. * * if (skb_is_gso(skb)) { * int hlen = skb_tcp_all_headers(skb); * ... */ static inline int skb_tcp_all_headers(const struct sk_buff *skb) { return skb_transport_offset(skb) + tcp_hdrlen(skb); } /** * skb_inner_tcp_all_headers - Returns size of all headers for an encap TCP packet * @skb: buffer * * Used in TX path, for a packet known to be a TCP one. * * if (skb_is_gso(skb) && skb->encapsulation) { * int hlen = skb_inner_tcp_all_headers(skb); * ... */ static inline int skb_inner_tcp_all_headers(const struct sk_buff *skb) { return skb_inner_transport_offset(skb) + inner_tcp_hdrlen(skb); } static inline unsigned int tcp_optlen(const struct sk_buff *skb) { return (tcp_hdr(skb)->doff - 5) * 4; } /* TCP Fast Open */ #define TCP_FASTOPEN_COOKIE_MIN 4 /* Min Fast Open Cookie size in bytes */ #define TCP_FASTOPEN_COOKIE_MAX 16 /* Max Fast Open Cookie size in bytes */ #define TCP_FASTOPEN_COOKIE_SIZE 8 /* the size employed by this impl. */ /* TCP Fast Open Cookie as stored in memory */ struct tcp_fastopen_cookie { __le64 val[DIV_ROUND_UP(TCP_FASTOPEN_COOKIE_MAX, sizeof(u64))]; s8 len; bool exp; /* In RFC6994 experimental option format */ }; /* This defines a selective acknowledgement block. */ struct tcp_sack_block_wire { __be32 start_seq; __be32 end_seq; }; struct tcp_sack_block { u32 start_seq; u32 end_seq; }; /*These are used to set the sack_ok field in struct tcp_options_received */ #define TCP_SACK_SEEN (1 << 0) /*1 = peer is SACK capable, */ #define TCP_DSACK_SEEN (1 << 2) /*1 = DSACK was received from peer*/ struct tcp_options_received { /* PAWS/RTTM data */ int ts_recent_stamp;/* Time we stored ts_recent (for aging) */ u32 ts_recent; /* Time stamp to echo next */ u32 rcv_tsval; /* Time stamp value */ u32 rcv_tsecr; /* Time stamp echo reply */ u16 saw_tstamp : 1, /* Saw TIMESTAMP on last packet */ tstamp_ok : 1, /* TIMESTAMP seen on SYN packet */ dsack : 1, /* D-SACK is scheduled */ wscale_ok : 1, /* Wscale seen on SYN packet */ sack_ok : 3, /* SACK seen on SYN packet */ smc_ok : 1, /* SMC seen on SYN packet */ snd_wscale : 4, /* Window scaling received from sender */ rcv_wscale : 4; /* Window scaling to send to receiver */ u8 saw_unknown:1, /* Received unknown option */ unused:7; u8 num_sacks; /* Number of SACK blocks */ u16 user_mss; /* mss requested by user in ioctl */ u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ }; static inline void tcp_clear_options(struct tcp_options_received *rx_opt) { rx_opt->tstamp_ok = rx_opt->sack_ok = 0; rx_opt->wscale_ok = rx_opt->snd_wscale = 0; #if IS_ENABLED(CONFIG_SMC) rx_opt->smc_ok = 0; #endif } /* This is the max number of SACKS that we'll generate and process. It's safe * to increase this, although since: * size = TCPOLEN_SACK_BASE_ALIGNED (4) + n * TCPOLEN_SACK_PERBLOCK (8) * only four options will fit in a standard TCP header */ #define TCP_NUM_SACKS 4 struct tcp_request_sock_ops; struct tcp_request_sock { struct inet_request_sock req; const struct tcp_request_sock_ops *af_specific; u64 snt_synack; /* first SYNACK sent time */ bool tfo_listener; bool is_mptcp; bool req_usec_ts; #if IS_ENABLED(CONFIG_MPTCP) bool drop_req; #endif u32 txhash; u32 rcv_isn; u32 snt_isn; u32 ts_off; u32 last_oow_ack_time; /* last SYNACK */ u32 rcv_nxt; /* the ack # by SYNACK. For * FastOpen it's the seq# * after data-in-SYN. */ u8 syn_tos; #ifdef CONFIG_TCP_AO u8 ao_keyid; u8 ao_rcv_next; bool used_tcp_ao; #endif }; static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req) { return (struct tcp_request_sock *)req; } static inline bool tcp_rsk_used_ao(const struct request_sock *req) { #ifndef CONFIG_TCP_AO return false; #else return tcp_rsk(req)->used_tcp_ao; #endif } #define TCP_RMEM_TO_WIN_SCALE 8 struct tcp_sock { /* Cacheline organization can be found documented in * Documentation/networking/net_cachelines/tcp_sock.rst. * Please update the document when adding new fields. */ /* inet_connection_sock has to be the first member of tcp_sock */ struct inet_connection_sock inet_conn; /* TX read-mostly hotpath cache lines */ __cacheline_group_begin(tcp_sock_read_tx); /* timestamp of last sent data packet (for restart window) */ u32 max_window; /* Maximal window ever seen from peer */ u32 rcv_ssthresh; /* Current window clamp */ u32 reordering; /* Packet reordering metric. */ u32 notsent_lowat; /* TCP_NOTSENT_LOWAT */ u16 gso_segs; /* Max number of segs per GSO packet */ /* from STCP, retrans queue hinting */ struct sk_buff *lost_skb_hint; struct sk_buff *retransmit_skb_hint; __cacheline_group_end(tcp_sock_read_tx); /* TXRX read-mostly hotpath cache lines */ __cacheline_group_begin(tcp_sock_read_txrx); u32 tsoffset; /* timestamp offset */ u32 snd_wnd; /* The window we expect to receive */ u32 mss_cache; /* Cached effective mss, not including SACKS */ u32 snd_cwnd; /* Sending congestion window */ u32 prr_out; /* Total number of pkts sent during Recovery. */ u32 lost_out; /* Lost packets */ u32 sacked_out; /* SACK'd packets */ u16 tcp_header_len; /* Bytes of tcp header to send */ u8 scaling_ratio; /* see tcp_win_from_space() */ u8 chrono_type : 2, /* current chronograph type */ repair : 1, tcp_usec_ts : 1, /* TSval values in usec */ is_sack_reneg:1, /* in recovery from loss with SACK reneg? */ is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */ __cacheline_group_end(tcp_sock_read_txrx); /* RX read-mostly hotpath cache lines */ __cacheline_group_begin(tcp_sock_read_rx); u32 copied_seq; /* Head of yet unread data */ u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ u32 snd_wl1; /* Sequence for window update */ u32 tlp_high_seq; /* snd_nxt at the time of TLP */ u32 rttvar_us; /* smoothed mdev_max */ u32 retrans_out; /* Retransmitted packets out */ u16 advmss; /* Advertised MSS */ u16 urg_data; /* Saved octet of OOB data and control flags */ u32 lost; /* Total data packets lost incl. rexmits */ struct minmax rtt_min; /* OOO segments go in this rbtree. Socket lock must be held. */ struct rb_root out_of_order_queue; u32 snd_ssthresh; /* Slow start size threshold */ __cacheline_group_end(tcp_sock_read_rx); /* TX read-write hotpath cache lines */ __cacheline_group_begin(tcp_sock_write_tx) ____cacheline_aligned; u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut * The total number of segments sent. */ u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut * total number of data segments sent. */ u64 bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut * total number of data bytes sent. */ u32 snd_sml; /* Last byte of the most recently transmitted small packet */ u32 chrono_start; /* Start time in jiffies of a TCP chrono */ u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ u32 pushed_seq; /* Last pushed seq, required to talk to windows */ u32 lsndtime; u32 mdev_us; /* medium deviation */ u64 tcp_wstamp_ns; /* departure time for next sent data packet */ u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ u64 tcp_mstamp; /* most recent packet received/sent */ u32 rtt_seq; /* sequence number to update rttvar */ struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */ struct sk_buff *highest_sack; /* skb just after the highest * skb with SACKed bit set * (validity guaranteed only if * sacked_out > 0) */ u8 ecn_flags; /* ECN status bits. */ __cacheline_group_end(tcp_sock_write_tx); /* TXRX read-write hotpath cache lines */ __cacheline_group_begin(tcp_sock_write_txrx); /* * Header prediction flags * 0x5?10 << 16 + snd_wnd in net byte order */ __be32 pred_flags; u32 rcv_nxt; /* What we want to receive next */ u32 snd_nxt; /* Next sequence we send */ u32 snd_una; /* First byte we want an ack for */ u32 window_clamp; /* Maximal window to advertise */ u32 srtt_us; /* smoothed round trip time << 3 in usecs */ u32 packets_out; /* Packets which are "in flight" */ u32 snd_up; /* Urgent pointer */ u32 delivered; /* Total data packets delivered incl. rexmits */ u32 delivered_ce; /* Like the above but only ECE marked packets */ u32 app_limited; /* limited until "delivered" reaches this val */ u32 rcv_wnd; /* Current receiver window */ /* * Options received (usually on last packet, some only on SYN packets). */ struct tcp_options_received rx_opt; u8 nonagle : 4,/* Disable Nagle algorithm? */ rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ __cacheline_group_end(tcp_sock_write_txrx); /* RX read-write hotpath cache lines */ __cacheline_group_begin(tcp_sock_write_rx); u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived * sum(delta(rcv_nxt)), or how many bytes * were acked. */ u32 segs_in; /* RFC4898 tcpEStatsPerfSegsIn * total number of segments in. */ u32 data_segs_in; /* RFC4898 tcpEStatsPerfDataSegsIn * total number of data segments in. */ u32 rcv_wup; /* rcv_nxt on last window update sent */ u32 max_packets_out; /* max packets_out in last window */ u32 cwnd_usage_seq; /* right edge of cwnd usage tracking flight */ u32 rate_delivered; /* saved rate sample: packets delivered */ u32 rate_interval_us; /* saved rate sample: time elapsed */ u32 rcv_rtt_last_tsecr; u64 first_tx_mstamp; /* start of window send phase */ u64 delivered_mstamp; /* time we reached "delivered" */ u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked * sum(delta(snd_una)), or how many bytes * were acked. */ struct { u32 rtt_us; u32 seq; u64 time; } rcv_rtt_est; /* Receiver queue space */ struct { u32 space; u32 seq; u64 time; } rcvq_space; __cacheline_group_end(tcp_sock_write_rx); /* End of Hot Path */ /* * RFC793 variables by their proper names. This means you can * read the code and the spec side by side (and laugh ...) * See RFC793 and RFC1122. The RFC writes these in capitals. */ u32 dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups * total number of DSACK blocks received */ u32 last_oow_ack_time; /* timestamp of last out-of-window ACK */ u32 compressed_ack_rcv_nxt; struct list_head tsq_node; /* anchor in tsq_tasklet.head list */ /* Information of the most recently (s)acked skb */ struct tcp_rack { u64 mstamp; /* (Re)sent time of the skb */ u32 rtt_us; /* Associated RTT */ u32 end_seq; /* Ending TCP sequence of the skb */ u32 last_delivered; /* tp->delivered at last reo_wnd adj */ u8 reo_wnd_steps; /* Allowed reordering window */ #define TCP_RACK_RECOVERY_THRESH 16 u8 reo_wnd_persist:5, /* No. of recovery since last adj */ dsack_seen:1, /* Whether DSACK seen after last adj */ advanced:1; /* mstamp advanced since last lost marking */ } rack; u8 compressed_ack; u8 dup_ack_counter:2, tlp_retrans:1, /* TLP is a retransmission */ unused:5; u8 thin_lto : 1,/* Use linear timeouts for thin streams */ recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */ fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ fastopen_client_fail:2, /* reason why fastopen failed */ frto : 1;/* F-RTO (RFC5682) activated in CA_Loss */ u8 repair_queue; u8 save_syn:2, /* Save headers of SYN packet */ syn_data:1, /* SYN includes data */ syn_fastopen:1, /* SYN includes Fast Open option */ syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */ syn_fastopen_ch:1, /* Active TFO re-enabling probe */ syn_data_acked:1;/* data in SYN is acked by SYN-ACK */ u32 tcp_tx_delay; /* delay (in usec) added to TX packets */ /* RTT measurement */ u32 mdev_max_us; /* maximal mdev for the last rtt period */ u8 keepalive_probes; /* num of allowed keep alive probes */ u32 reord_seen; /* number of data packet reordering events */ /* * Slow start and congestion control (see also Nagle, and Karn & Partridge) */ u32 snd_cwnd_cnt; /* Linear increase counter */ u32 snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */ u32 snd_cwnd_used; u32 snd_cwnd_stamp; u32 prior_cwnd; /* cwnd right before starting loss recovery */ u32 prr_delivered; /* Number of newly delivered packets to * receiver in Recovery. */ struct hrtimer pacing_timer; struct hrtimer compressed_ack_timer; struct sk_buff *ooo_last_skb; /* cache rb_last(out_of_order_queue) */ /* SACKs data, these 2 need to be together (see tcp_options_write) */ struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */ struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/ struct tcp_sack_block recv_sack_cache[4]; int lost_cnt_hint; u32 prior_ssthresh; /* ssthresh saved at recovery start */ u32 high_seq; /* snd_nxt at onset of congestion */ u32 retrans_stamp; /* Timestamp of the last retransmit, * also used in SYN-SENT to remember stamp of * the first SYN. */ u32 undo_marker; /* snd_una upon a new recovery episode. */ int undo_retrans; /* number of undoable retransmissions. */ u64 bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans * Total data bytes retransmitted */ u32 total_retrans; /* Total retransmits for entire connection */ u32 rto_stamp; /* Start time (ms) of last CA_Loss recovery */ u16 total_rto; /* Total number of RTO timeouts, including * SYN/SYN-ACK and recurring timeouts. */ u16 total_rto_recoveries; /* Total number of RTO recoveries, * including any unfinished recovery. */ u32 total_rto_time; /* ms spent in (completed) RTO recoveries. */ u32 urg_seq; /* Seq of received urgent pointer */ unsigned int keepalive_time; /* time before keep alive takes place */ unsigned int keepalive_intvl; /* time interval between keep alive probes */ int linger2; /* Sock_ops bpf program related variables */ #ifdef CONFIG_BPF u8 bpf_sock_ops_cb_flags; /* Control calling BPF programs * values defined in uapi/linux/tcp.h */ u8 bpf_chg_cc_inprogress:1; /* In the middle of * bpf_setsockopt(TCP_CONGESTION), * it is to avoid the bpf_tcp_cc->init() * to recur itself by calling * bpf_setsockopt(TCP_CONGESTION, "itself"). */ #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_cb_flags & ARG) #else #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0 #endif u16 timeout_rehash; /* Timeout-triggered rehash attempts */ u32 rcv_ooopack; /* Received out-of-order packets, for tcpinfo */ /* TCP-specific MTU probe information. */ struct { u32 probe_seq_start; u32 probe_seq_end; } mtu_probe; u32 plb_rehash; /* PLB-triggered rehash attempts */ u32 mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG * while socket was owned by user. */ #if IS_ENABLED(CONFIG_MPTCP) bool is_mptcp; #endif #if IS_ENABLED(CONFIG_SMC) bool (*smc_hs_congested)(const struct sock *sk); bool syn_smc; /* SYN includes SMC */ #endif #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) /* TCP AF-Specific parts; only used by TCP-AO/MD5 Signature support so far */ const struct tcp_sock_af_ops *af_specific; #ifdef CONFIG_TCP_MD5SIG /* TCP MD5 Signature Option information */ struct tcp_md5sig_info __rcu *md5sig_info; #endif #ifdef CONFIG_TCP_AO struct tcp_ao_info __rcu *ao_info; #endif #endif /* TCP fastopen related information */ struct tcp_fastopen_request *fastopen_req; /* fastopen_rsk points to request_sock that resulted in this big * socket. Used to retransmit SYNACKs etc. */ struct request_sock __rcu *fastopen_rsk; struct saved_syn *saved_syn; }; enum tsq_enum { TSQ_THROTTLED, TSQ_QUEUED, TCP_TSQ_DEFERRED, /* tcp_tasklet_func() found socket was owned */ TCP_WRITE_TIMER_DEFERRED, /* tcp_write_timer() found socket was owned */ TCP_DELACK_TIMER_DEFERRED, /* tcp_delack_timer() found socket was owned */ TCP_MTU_REDUCED_DEFERRED, /* tcp_v{4|6}_err() could not call * tcp_v{4|6}_mtu_reduced() */ TCP_ACK_DEFERRED, /* TX pure ack is deferred */ }; enum tsq_flags { TSQF_THROTTLED = BIT(TSQ_THROTTLED), TSQF_QUEUED = BIT(TSQ_QUEUED), TCPF_TSQ_DEFERRED = BIT(TCP_TSQ_DEFERRED), TCPF_WRITE_TIMER_DEFERRED = BIT(TCP_WRITE_TIMER_DEFERRED), TCPF_DELACK_TIMER_DEFERRED = BIT(TCP_DELACK_TIMER_DEFERRED), TCPF_MTU_REDUCED_DEFERRED = BIT(TCP_MTU_REDUCED_DEFERRED), TCPF_ACK_DEFERRED = BIT(TCP_ACK_DEFERRED), }; #define tcp_sk(ptr) container_of_const(ptr, struct tcp_sock, inet_conn.icsk_inet.sk) /* Variant of tcp_sk() upgrading a const sock to a read/write tcp socket. * Used in context of (lockless) tcp listeners. */ #define tcp_sk_rw(ptr) container_of(ptr, struct tcp_sock, inet_conn.icsk_inet.sk) struct tcp_timewait_sock { struct inet_timewait_sock tw_sk; #define tw_rcv_nxt tw_sk.__tw_common.skc_tw_rcv_nxt #define tw_snd_nxt tw_sk.__tw_common.skc_tw_snd_nxt u32 tw_rcv_wnd; u32 tw_ts_offset; u32 tw_ts_recent; /* The time we sent the last out-of-window ACK: */ u32 tw_last_oow_ack_time; int tw_ts_recent_stamp; u32 tw_tx_delay; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *tw_md5_key; #endif #ifdef CONFIG_TCP_AO struct tcp_ao_info __rcu *ao_info; #endif }; static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk) { return (struct tcp_timewait_sock *)sk; } static inline bool tcp_passive_fastopen(const struct sock *sk) { return sk->sk_state == TCP_SYN_RECV && rcu_access_pointer(tcp_sk(sk)->fastopen_rsk) != NULL; } static inline void fastopen_queue_tune(struct sock *sk, int backlog) { struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; int somaxconn = READ_ONCE(sock_net(sk)->core.sysctl_somaxconn); WRITE_ONCE(queue->fastopenq.max_qlen, min_t(unsigned int, backlog, somaxconn)); } static inline void tcp_move_syn(struct tcp_sock *tp, struct request_sock *req) { tp->saved_syn = req->saved_syn; req->saved_syn = NULL; } static inline void tcp_saved_syn_free(struct tcp_sock *tp) { kfree(tp->saved_syn); tp->saved_syn = NULL; } static inline u32 tcp_saved_syn_len(const struct saved_syn *saved_syn) { return saved_syn->mac_hdrlen + saved_syn->network_hdrlen + saved_syn->tcp_hdrlen; } struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, const struct sk_buff *orig_skb, const struct sk_buff *ack_skb); static inline u16 tcp_mss_clamp(const struct tcp_sock *tp, u16 mss) { /* We use READ_ONCE() here because socket might not be locked. * This happens for listeners. */ u16 user_mss = READ_ONCE(tp->rx_opt.user_mss); return (user_mss && user_mss < mss) ? user_mss : mss; } int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, int shiftlen); void __tcp_sock_set_cork(struct sock *sk, bool on); void tcp_sock_set_cork(struct sock *sk, bool on); int tcp_sock_set_keepcnt(struct sock *sk, int val); int tcp_sock_set_keepidle_locked(struct sock *sk, int val); int tcp_sock_set_keepidle(struct sock *sk, int val); int tcp_sock_set_keepintvl(struct sock *sk, int val); void __tcp_sock_set_nodelay(struct sock *sk, bool on); void tcp_sock_set_nodelay(struct sock *sk); void tcp_sock_set_quickack(struct sock *sk, int val); int tcp_sock_set_syncnt(struct sock *sk, int val); int tcp_sock_set_user_timeout(struct sock *sk, int val); static inline bool dst_tcp_usec_ts(const struct dst_entry *dst) { return dst_feature(dst, RTAX_FEATURE_TCP_USEC_TS); } #endif /* _LINUX_TCP_H */
3121 776 3122 4261 4258 3775 3427 3228 74 74 3534 3532 1586 1583 3473 1 3477 531 532 531 10 2567 22 10 2539 19 3 16 2 1431 1431 1422 788 665 17 17 12982 12277 13138 5 28 12136 41 12361 344 122 240 57 57 178 5729 5729 5733 188 51 51 51 2 2 2 1542 1542 1 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 // SPDX-License-Identifier: GPL-2.0-only #include <linux/mm.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/compiler.h> #include <linux/export.h> #include <linux/err.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/sched/task_stack.h> #include <linux/security.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/mman.h> #include <linux/hugetlb.h> #include <linux/vmalloc.h> #include <linux/userfaultfd_k.h> #include <linux/elf.h> #include <linux/elf-randomize.h> #include <linux/personality.h> #include <linux/random.h> #include <linux/processor.h> #include <linux/sizes.h> #include <linux/compat.h> #include <linux/uaccess.h> #include "internal.h" #include "swap.h" /** * kfree_const - conditionally free memory * @x: pointer to the memory * * Function calls kfree only if @x is not in .rodata section. */ void kfree_const(const void *x) { if (!is_kernel_rodata((unsigned long)x)) kfree(x); } EXPORT_SYMBOL(kfree_const); /** * kstrdup - allocate space for and copy an existing string * @s: the string to duplicate * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Return: newly allocated copy of @s or %NULL in case of error */ noinline char *kstrdup(const char *s, gfp_t gfp) { size_t len; char *buf; if (!s) return NULL; len = strlen(s) + 1; buf = kmalloc_track_caller(len, gfp); if (buf) memcpy(buf, s, len); return buf; } EXPORT_SYMBOL(kstrdup); /** * kstrdup_const - conditionally duplicate an existing const string * @s: the string to duplicate * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Note: Strings allocated by kstrdup_const should be freed by kfree_const and * must not be passed to krealloc(). * * Return: source string if it is in .rodata section otherwise * fallback to kstrdup. */ const char *kstrdup_const(const char *s, gfp_t gfp) { if (is_kernel_rodata((unsigned long)s)) return s; return kstrdup(s, gfp); } EXPORT_SYMBOL(kstrdup_const); /** * kstrndup - allocate space for and copy an existing string * @s: the string to duplicate * @max: read at most @max chars from @s * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Note: Use kmemdup_nul() instead if the size is known exactly. * * Return: newly allocated copy of @s or %NULL in case of error */ char *kstrndup(const char *s, size_t max, gfp_t gfp) { size_t len; char *buf; if (!s) return NULL; len = strnlen(s, max); buf = kmalloc_track_caller(len+1, gfp); if (buf) { memcpy(buf, s, len); buf[len] = '\0'; } return buf; } EXPORT_SYMBOL(kstrndup); /** * kmemdup - duplicate region of memory * * @src: memory region to duplicate * @len: memory region length * @gfp: GFP mask to use * * Return: newly allocated copy of @src or %NULL in case of error, * result is physically contiguous. Use kfree() to free. */ void *kmemdup(const void *src, size_t len, gfp_t gfp) { void *p; p = kmalloc_track_caller(len, gfp); if (p) memcpy(p, src, len); return p; } EXPORT_SYMBOL(kmemdup); /** * kmemdup_array - duplicate a given array. * * @src: array to duplicate. * @element_size: size of each element of array. * @count: number of elements to duplicate from array. * @gfp: GFP mask to use. * * Return: duplicated array of @src or %NULL in case of error, * result is physically contiguous. Use kfree() to free. */ void *kmemdup_array(const void *src, size_t element_size, size_t count, gfp_t gfp) { return kmemdup(src, size_mul(element_size, count), gfp); } EXPORT_SYMBOL(kmemdup_array); /** * kvmemdup - duplicate region of memory * * @src: memory region to duplicate * @len: memory region length * @gfp: GFP mask to use * * Return: newly allocated copy of @src or %NULL in case of error, * result may be not physically contiguous. Use kvfree() to free. */ void *kvmemdup(const void *src, size_t len, gfp_t gfp) { void *p; p = kvmalloc(len, gfp); if (p) memcpy(p, src, len); return p; } EXPORT_SYMBOL(kvmemdup); /** * kmemdup_nul - Create a NUL-terminated string from unterminated data * @s: The data to stringify * @len: The size of the data * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Return: newly allocated copy of @s with NUL-termination or %NULL in * case of error */ char *kmemdup_nul(const char *s, size_t len, gfp_t gfp) { char *buf; if (!s) return NULL; buf = kmalloc_track_caller(len + 1, gfp); if (buf) { memcpy(buf, s, len); buf[len] = '\0'; } return buf; } EXPORT_SYMBOL(kmemdup_nul); /** * memdup_user - duplicate memory region from user space * * @src: source address in user space * @len: number of bytes to copy * * Return: an ERR_PTR() on failure. Result is physically * contiguous, to be freed by kfree(). */ void *memdup_user(const void __user *src, size_t len) { void *p; p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN); if (!p) return ERR_PTR(-ENOMEM); if (copy_from_user(p, src, len)) { kfree(p); return ERR_PTR(-EFAULT); } return p; } EXPORT_SYMBOL(memdup_user); /** * vmemdup_user - duplicate memory region from user space * * @src: source address in user space * @len: number of bytes to copy * * Return: an ERR_PTR() on failure. Result may be not * physically contiguous. Use kvfree() to free. */ void *vmemdup_user(const void __user *src, size_t len) { void *p; p = kvmalloc(len, GFP_USER); if (!p) return ERR_PTR(-ENOMEM); if (copy_from_user(p, src, len)) { kvfree(p); return ERR_PTR(-EFAULT); } return p; } EXPORT_SYMBOL(vmemdup_user); /** * strndup_user - duplicate an existing string from user space * @s: The string to duplicate * @n: Maximum number of bytes to copy, including the trailing NUL. * * Return: newly allocated copy of @s or an ERR_PTR() in case of error */ char *strndup_user(const char __user *s, long n) { char *p; long length; length = strnlen_user(s, n); if (!length) return ERR_PTR(-EFAULT); if (length > n) return ERR_PTR(-EINVAL); p = memdup_user(s, length); if (IS_ERR(p)) return p; p[length - 1] = '\0'; return p; } EXPORT_SYMBOL(strndup_user); /** * memdup_user_nul - duplicate memory region from user space and NUL-terminate * * @src: source address in user space * @len: number of bytes to copy * * Return: an ERR_PTR() on failure. */ void *memdup_user_nul(const void __user *src, size_t len) { char *p; /* * Always use GFP_KERNEL, since copy_from_user() can sleep and * cause pagefault, which makes it pointless to use GFP_NOFS * or GFP_ATOMIC. */ p = kmalloc_track_caller(len + 1, GFP_KERNEL); if (!p) return ERR_PTR(-ENOMEM); if (copy_from_user(p, src, len)) { kfree(p); return ERR_PTR(-EFAULT); } p[len] = '\0'; return p; } EXPORT_SYMBOL(memdup_user_nul); /* Check if the vma is being used as a stack by this task */ int vma_is_stack_for_current(struct vm_area_struct *vma) { struct task_struct * __maybe_unused t = current; return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t)); } /* * Change backing file, only valid to use during initial VMA setup. */ void vma_set_file(struct vm_area_struct *vma, struct file *file) { /* Changing an anonymous vma with this is illegal */ get_file(file); swap(vma->vm_file, file); fput(file); } EXPORT_SYMBOL(vma_set_file); #ifndef STACK_RND_MASK #define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */ #endif unsigned long randomize_stack_top(unsigned long stack_top) { unsigned long random_variable = 0; if (current->flags & PF_RANDOMIZE) { random_variable = get_random_long(); random_variable &= STACK_RND_MASK; random_variable <<= PAGE_SHIFT; } #ifdef CONFIG_STACK_GROWSUP return PAGE_ALIGN(stack_top) + random_variable; #else return PAGE_ALIGN(stack_top) - random_variable; #endif } /** * randomize_page - Generate a random, page aligned address * @start: The smallest acceptable address the caller will take. * @range: The size of the area, starting at @start, within which the * random address must fall. * * If @start + @range would overflow, @range is capped. * * NOTE: Historical use of randomize_range, which this replaces, presumed that * @start was already page aligned. We now align it regardless. * * Return: A page aligned address within [start, start + range). On error, * @start is returned. */ unsigned long randomize_page(unsigned long start, unsigned long range) { if (!PAGE_ALIGNED(start)) { range -= PAGE_ALIGN(start) - start; start = PAGE_ALIGN(start); } if (start > ULONG_MAX - range) range = ULONG_MAX - start; range >>= PAGE_SHIFT; if (range == 0) return start; return start + (get_random_long() % range << PAGE_SHIFT); } #ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT unsigned long __weak arch_randomize_brk(struct mm_struct *mm) { /* Is the current task 32bit ? */ if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task()) return randomize_page(mm->brk, SZ_32M); return randomize_page(mm->brk, SZ_1G); } unsigned long arch_mmap_rnd(void) { unsigned long rnd; #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS if (is_compat_task()) rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1); else #endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */ rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); return rnd << PAGE_SHIFT; } static int mmap_is_legacy(struct rlimit *rlim_stack) { if (current->personality & ADDR_COMPAT_LAYOUT) return 1; /* On parisc the stack always grows up - so a unlimited stack should * not be an indicator to use the legacy memory layout. */ if (rlim_stack->rlim_cur == RLIM_INFINITY && !IS_ENABLED(CONFIG_STACK_GROWSUP)) return 1; return sysctl_legacy_va_layout; } /* * Leave enough space between the mmap area and the stack to honour ulimit in * the face of randomisation. */ #define MIN_GAP (SZ_128M) #define MAX_GAP (STACK_TOP / 6 * 5) static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) { #ifdef CONFIG_STACK_GROWSUP /* * For an upwards growing stack the calculation is much simpler. * Memory for the maximum stack size is reserved at the top of the * task. mmap_base starts directly below the stack and grows * downwards. */ return PAGE_ALIGN_DOWN(mmap_upper_limit(rlim_stack) - rnd); #else unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = stack_guard_gap; /* Account for stack randomization if necessary */ if (current->flags & PF_RANDOMIZE) pad += (STACK_RND_MASK << PAGE_SHIFT); /* Values close to RLIM_INFINITY can overflow. */ if (gap + pad > gap) gap += pad; if (gap < MIN_GAP) gap = MIN_GAP; else if (gap > MAX_GAP) gap = MAX_GAP; return PAGE_ALIGN(STACK_TOP - gap - rnd); #endif } void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { unsigned long random_factor = 0UL; if (current->flags & PF_RANDOMIZE) random_factor = arch_mmap_rnd(); if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; mm->get_unmapped_area = arch_get_unmapped_area; } else { mm->mmap_base = mmap_base(random_factor, rlim_stack); mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } #elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = arch_get_unmapped_area; } #endif /** * __account_locked_vm - account locked pages to an mm's locked_vm * @mm: mm to account against * @pages: number of pages to account * @inc: %true if @pages should be considered positive, %false if not * @task: task used to check RLIMIT_MEMLOCK * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped * * Assumes @task and @mm are valid (i.e. at least one reference on each), and * that mmap_lock is held as writer. * * Return: * * 0 on success * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded. */ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, struct task_struct *task, bool bypass_rlim) { unsigned long locked_vm, limit; int ret = 0; mmap_assert_write_locked(mm); locked_vm = mm->locked_vm; if (inc) { if (!bypass_rlim) { limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT; if (locked_vm + pages > limit) ret = -ENOMEM; } if (!ret) mm->locked_vm = locked_vm + pages; } else { WARN_ON_ONCE(pages > locked_vm); mm->locked_vm = locked_vm - pages; } pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid, (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT, locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK), ret ? " - exceeded" : ""); return ret; } EXPORT_SYMBOL_GPL(__account_locked_vm); /** * account_locked_vm - account locked pages to an mm's locked_vm * @mm: mm to account against, may be NULL * @pages: number of pages to account * @inc: %true if @pages should be considered positive, %false if not * * Assumes a non-NULL @mm is valid (i.e. at least one reference on it). * * Return: * * 0 on success, or if mm is NULL * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded. */ int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc) { int ret; if (pages == 0 || !mm) return 0; mmap_write_lock(mm); ret = __account_locked_vm(mm, pages, inc, current, capable(CAP_IPC_LOCK)); mmap_write_unlock(mm); return ret; } EXPORT_SYMBOL_GPL(account_locked_vm); unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long pgoff) { unsigned long ret; struct mm_struct *mm = current->mm; unsigned long populate; LIST_HEAD(uf); ret = security_mmap_file(file, prot, flag); if (!ret) { if (mmap_write_lock_killable(mm)) return -EINTR; ret = do_mmap(file, addr, len, prot, flag, 0, pgoff, &populate, &uf); mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(ret, populate); } return ret; } unsigned long vm_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long offset) { if (unlikely(offset + PAGE_ALIGN(len) < offset)) return -EINVAL; if (unlikely(offset_in_page(offset))) return -EINVAL; return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); } EXPORT_SYMBOL(vm_mmap); /** * kvmalloc_node - attempt to allocate physically contiguous memory, but upon * failure, fall back to non-contiguous (vmalloc) allocation. * @size: size of the request. * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. * @node: numa node to allocate from * * Uses kmalloc to get the memory but if the allocation fails then falls back * to the vmalloc allocator. Use kvfree for freeing the memory. * * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier. * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is * preferable to the vmalloc fallback, due to visible performance drawbacks. * * Return: pointer to the allocated memory of %NULL in case of failure */ void *kvmalloc_node(size_t size, gfp_t flags, int node) { gfp_t kmalloc_flags = flags; void *ret; /* * We want to attempt a large physically contiguous block first because * it is less likely to fragment multiple larger blocks and therefore * contribute to a long term fragmentation less than vmalloc fallback. * However make sure that larger requests are not too disruptive - no * OOM killer and no allocation failure warnings as we have a fallback. */ if (size > PAGE_SIZE) { kmalloc_flags |= __GFP_NOWARN; if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL)) kmalloc_flags |= __GFP_NORETRY; /* nofail semantic is implemented by the vmalloc fallback */ kmalloc_flags &= ~__GFP_NOFAIL; } ret = kmalloc_node(size, kmalloc_flags, node); /* * It doesn't really make sense to fallback to vmalloc for sub page * requests */ if (ret || size <= PAGE_SIZE) return ret; /* non-sleeping allocations are not supported by vmalloc */ if (!gfpflags_allow_blocking(flags)) return NULL; /* Don't even allow crazy sizes */ if (unlikely(size > INT_MAX)) { WARN_ON_ONCE(!(flags & __GFP_NOWARN)); return NULL; } /* * kvmalloc() can always use VM_ALLOW_HUGE_VMAP, * since the callers already cannot assume anything * about the resulting pointer, and cannot play * protection games. */ return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, node, __builtin_return_address(0)); } EXPORT_SYMBOL(kvmalloc_node); /** * kvfree() - Free memory. * @addr: Pointer to allocated memory. * * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc(). * It is slightly more efficient to use kfree() or vfree() if you are certain * that you know which one to use. * * Context: Either preemptible task context or not-NMI interrupt. */ void kvfree(const void *addr) { if (is_vmalloc_addr(addr)) vfree(addr); else kfree(addr); } EXPORT_SYMBOL(kvfree); /** * kvfree_sensitive - Free a data object containing sensitive information. * @addr: address of the data object to be freed. * @len: length of the data object. * * Use the special memzero_explicit() function to clear the content of a * kvmalloc'ed object containing sensitive data to make sure that the * compiler won't optimize out the data clearing. */ void kvfree_sensitive(const void *addr, size_t len) { if (likely(!ZERO_OR_NULL_PTR(addr))) { memzero_explicit((void *)addr, len); kvfree(addr); } } EXPORT_SYMBOL(kvfree_sensitive); void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags) { void *newp; if (oldsize >= newsize) return (void *)p; newp = kvmalloc(newsize, flags); if (!newp) return NULL; memcpy(newp, p, oldsize); kvfree(p); return newp; } EXPORT_SYMBOL(kvrealloc); /** * __vmalloc_array - allocate memory for a virtually contiguous array. * @n: number of elements. * @size: element size. * @flags: the type of memory to allocate (see kmalloc). */ void *__vmalloc_array(size_t n, size_t size, gfp_t flags) { size_t bytes; if (unlikely(check_mul_overflow(n, size, &bytes))) return NULL; return __vmalloc(bytes, flags); } EXPORT_SYMBOL(__vmalloc_array); /** * vmalloc_array - allocate memory for a virtually contiguous array. * @n: number of elements. * @size: element size. */ void *vmalloc_array(size_t n, size_t size) { return __vmalloc_array(n, size, GFP_KERNEL); } EXPORT_SYMBOL(vmalloc_array); /** * __vcalloc - allocate and zero memory for a virtually contiguous array. * @n: number of elements. * @size: element size. * @flags: the type of memory to allocate (see kmalloc). */ void *__vcalloc(size_t n, size_t size, gfp_t flags) { return __vmalloc_array(n, size, flags | __GFP_ZERO); } EXPORT_SYMBOL(__vcalloc); /** * vcalloc - allocate and zero memory for a virtually contiguous array. * @n: number of elements. * @size: element size. */ void *vcalloc(size_t n, size_t size) { return __vmalloc_array(n, size, GFP_KERNEL | __GFP_ZERO); } EXPORT_SYMBOL(vcalloc); struct anon_vma *folio_anon_vma(struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) return NULL; return (void *)(mapping - PAGE_MAPPING_ANON); } /** * folio_mapping - Find the mapping where this folio is stored. * @folio: The folio. * * For folios which are in the page cache, return the mapping that this * page belongs to. Folios in the swap cache return the swap mapping * this page is stored in (which is different from the mapping for the * swap file or swap device where the data is stored). * * You can call this for folios which aren't in the swap cache or page * cache and it will return NULL. */ struct address_space *folio_mapping(struct folio *folio) { struct address_space *mapping; /* This happens if someone calls flush_dcache_page on slab page */ if (unlikely(folio_test_slab(folio))) return NULL; if (unlikely(folio_test_swapcache(folio))) return swap_address_space(folio->swap); mapping = folio->mapping; if ((unsigned long)mapping & PAGE_MAPPING_FLAGS) return NULL; return mapping; } EXPORT_SYMBOL(folio_mapping); /** * folio_copy - Copy the contents of one folio to another. * @dst: Folio to copy to. * @src: Folio to copy from. * * The bytes in the folio represented by @src are copied to @dst. * Assumes the caller has validated that @dst is at least as large as @src. * Can be called in atomic context for order-0 folios, but if the folio is * larger, it may sleep. */ void folio_copy(struct folio *dst, struct folio *src) { long i = 0; long nr = folio_nr_pages(src); for (;;) { copy_highpage(folio_page(dst, i), folio_page(src, i)); if (++i == nr) break; cond_resched(); } } EXPORT_SYMBOL(folio_copy); int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; int sysctl_overcommit_ratio __read_mostly = 50; unsigned long sysctl_overcommit_kbytes __read_mostly; int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret == 0 && write) sysctl_overcommit_kbytes = 0; return ret; } static void sync_overcommit_as(struct work_struct *dummy) { percpu_counter_sync(&vm_committed_as); } int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; int new_policy = -1; int ret; /* * The deviation of sync_overcommit_as could be big with loose policy * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply * with the strict "NEVER", and to avoid possible race condition (even * though user usually won't too frequently do the switching to policy * OVERCOMMIT_NEVER), the switch is done in the following order: * 1. changing the batch * 2. sync percpu count on each CPU * 3. switch the policy */ if (write) { t = *table; t.data = &new_policy; ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); if (ret || new_policy == -1) return ret; mm_compute_batch(new_policy); if (new_policy == OVERCOMMIT_NEVER) schedule_on_each_cpu(sync_overcommit_as); sysctl_overcommit_memory = new_policy; } else { ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); } return ret; } int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) sysctl_overcommit_ratio = 0; return ret; } /* * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used */ unsigned long vm_commit_limit(void) { unsigned long allowed; if (sysctl_overcommit_kbytes) allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10); else allowed = ((totalram_pages() - hugetlb_total_pages()) * sysctl_overcommit_ratio / 100); allowed += total_swap_pages; return allowed; } /* * Make sure vm_committed_as in one cacheline and not cacheline shared with * other variables. It can be updated by several CPUs frequently. */ struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; /* * The global memory commitment made in the system can be a metric * that can be used to drive ballooning decisions when Linux is hosted * as a guest. On Hyper-V, the host implements a policy engine for dynamically * balancing memory across competing virtual machines that are hosted. * Several metrics drive this policy engine including the guest reported * memory commitment. * * The time cost of this is very low for small platforms, and for big * platform like a 2S/36C/72T Skylake server, in worst case where * vm_committed_as's spinlock is under severe contention, the time cost * could be about 30~40 microseconds. */ unsigned long vm_memory_committed(void) { return percpu_counter_sum_positive(&vm_committed_as); } EXPORT_SYMBOL_GPL(vm_memory_committed); /* * Check that a process has enough memory to allocate a new virtual * mapping. 0 means there is enough memory for the allocation to * succeed and -ENOMEM implies there is not. * * We currently support three overcommit policies, which are set via the * vm.overcommit_memory sysctl. See Documentation/mm/overcommit-accounting.rst * * Strict overcommit modes added 2002 Feb 26 by Alan Cox. * Additional code 2002 Jul 20 by Robert Love. * * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. * * Note this is a helper function intended to be used by LSMs which * wish to use this logic. */ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { long allowed; vm_acct_memory(pages); /* * Sometimes we want to use more memory than we have */ if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) return 0; if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { if (pages > totalram_pages() + total_swap_pages) goto error; return 0; } allowed = vm_commit_limit(); /* * Reserve some for root */ if (!cap_sys_admin) allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); /* * Don't let a single process grow so big a user can't recover */ if (mm) { long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); allowed -= min_t(long, mm->total_vm / 32, reserve); } if (percpu_counter_read_positive(&vm_committed_as) < allowed) return 0; error: pr_warn_ratelimited("%s: pid: %d, comm: %s, not enough memory for the allocation\n", __func__, current->pid, current->comm); vm_unacct_memory(pages); return -ENOMEM; } /** * get_cmdline() - copy the cmdline value to a buffer. * @task: the task whose cmdline value to copy. * @buffer: the buffer to copy to. * @buflen: the length of the buffer. Larger cmdline values are truncated * to this length. * * Return: the size of the cmdline field copied. Note that the copy does * not guarantee an ending NULL byte. */ int get_cmdline(struct task_struct *task, char *buffer, int buflen) { int res = 0; unsigned int len; struct mm_struct *mm = get_task_mm(task); unsigned long arg_start, arg_end, env_start, env_end; if (!mm) goto out; if (!mm->arg_end) goto out_mm; /* Shh! No looking before we're done */ spin_lock(&mm->arg_lock); arg_start = mm->arg_start; arg_end = mm->arg_end; env_start = mm->env_start; env_end = mm->env_end; spin_unlock(&mm->arg_lock); len = arg_end - arg_start; if (len > buflen) len = buflen; res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE); /* * If the nul at the end of args has been overwritten, then * assume application is using setproctitle(3). */ if (res > 0 && buffer[res-1] != '\0' && len < buflen) { len = strnlen(buffer, res); if (len < res) { res = len; } else { len = env_end - env_start; if (len > buflen - res) len = buflen - res; res += access_process_vm(task, env_start, buffer+res, len, FOLL_FORCE); res = strnlen(buffer, res); } } out_mm: mmput(mm); out: return res; } int __weak memcmp_pages(struct page *page1, struct page *page2) { char *addr1, *addr2; int ret; addr1 = kmap_local_page(page1); addr2 = kmap_local_page(page2); ret = memcmp(addr1, addr2, PAGE_SIZE); kunmap_local(addr2); kunmap_local(addr1); return ret; } #ifdef CONFIG_PRINTK /** * mem_dump_obj - Print available provenance information * @object: object for which to find provenance information. * * This function uses pr_cont(), so that the caller is expected to have * printed out whatever preamble is appropriate. The provenance information * depends on the type of object and on how much debugging is enabled. * For example, for a slab-cache object, the slab name is printed, and, * if available, the return address and stack trace from the allocation * and last free path of that object. */ void mem_dump_obj(void *object) { const char *type; if (kmem_dump_obj(object)) return; if (vmalloc_dump_obj(object)) return; if (is_vmalloc_addr(object)) type = "vmalloc memory"; else if (virt_addr_valid(object)) type = "non-slab/vmalloc memory"; else if (object == NULL) type = "NULL pointer"; else if (object == ZERO_SIZE_PTR) type = "zero-size pointer"; else type = "non-paged memory"; pr_cont(" %s\n", type); } EXPORT_SYMBOL_GPL(mem_dump_obj); #endif /* * A driver might set a page logically offline -- PageOffline() -- and * turn the page inaccessible in the hypervisor; after that, access to page * content can be fatal. * * Some special PFN walkers -- i.e., /proc/kcore -- read content of random * pages after checking PageOffline(); however, these PFN walkers can race * with drivers that set PageOffline(). * * page_offline_freeze()/page_offline_thaw() allows for a subsystem to * synchronize with such drivers, achieving that a page cannot be set * PageOffline() while frozen. * * page_offline_begin()/page_offline_end() is used by drivers that care about * such races when setting a page PageOffline(). */ static DECLARE_RWSEM(page_offline_rwsem); void page_offline_freeze(void) { down_read(&page_offline_rwsem); } void page_offline_thaw(void) { up_read(&page_offline_rwsem); } void page_offline_begin(void) { down_write(&page_offline_rwsem); } EXPORT_SYMBOL(page_offline_begin); void page_offline_end(void) { up_write(&page_offline_rwsem); } EXPORT_SYMBOL(page_offline_end); #ifndef flush_dcache_folio void flush_dcache_folio(struct folio *folio) { long i, nr = folio_nr_pages(folio); for (i = 0; i < nr; i++) flush_dcache_page(folio_page(folio, i)); } EXPORT_SYMBOL(flush_dcache_folio); #endif
1 1 45 45 33 34 1 1 1 1 1 1 1 1 1 5 5 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 40 40 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2009 Oracle. All rights reserved. */ #include <linux/sched.h> #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/blkdev.h> #include <linux/rbtree.h> #include <linux/slab.h> #include <linux/error-injection.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "volumes.h" #include "locking.h" #include "btrfs_inode.h" #include "async-thread.h" #include "free-space-cache.h" #include "qgroup.h" #include "print-tree.h" #include "delalloc-space.h" #include "block-group.h" #include "backref.h" #include "misc.h" #include "subpage.h" #include "zoned.h" #include "inode-item.h" #include "space-info.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" #include "file-item.h" #include "relocation.h" #include "super.h" #include "tree-checker.h" /* * Relocation overview * * [What does relocation do] * * The objective of relocation is to relocate all extents of the target block * group to other block groups. * This is utilized by resize (shrink only), profile converting, compacting * space, or balance routine to spread chunks over devices. * * Before | After * ------------------------------------------------------------------ * BG A: 10 data extents | BG A: deleted * BG B: 2 data extents | BG B: 10 data extents (2 old + 8 relocated) * BG C: 1 extents | BG C: 3 data extents (1 old + 2 relocated) * * [How does relocation work] * * 1. Mark the target block group read-only * New extents won't be allocated from the target block group. * * 2.1 Record each extent in the target block group * To build a proper map of extents to be relocated. * * 2.2 Build data reloc tree and reloc trees * Data reloc tree will contain an inode, recording all newly relocated * data extents. * There will be only one data reloc tree for one data block group. * * Reloc tree will be a special snapshot of its source tree, containing * relocated tree blocks. * Each tree referring to a tree block in target block group will get its * reloc tree built. * * 2.3 Swap source tree with its corresponding reloc tree * Each involved tree only refers to new extents after swap. * * 3. Cleanup reloc trees and data reloc tree. * As old extents in the target block group are still referenced by reloc * trees, we need to clean them up before really freeing the target block * group. * * The main complexity is in steps 2.2 and 2.3. * * The entry point of relocation is relocate_block_group() function. */ #define RELOCATION_RESERVED_NODES 256 /* * map address of tree root to tree */ struct mapping_node { struct { struct rb_node rb_node; u64 bytenr; }; /* Use rb_simle_node for search/insert */ void *data; }; struct mapping_tree { struct rb_root rb_root; spinlock_t lock; }; /* * present a tree block to process */ struct tree_block { struct { struct rb_node rb_node; u64 bytenr; }; /* Use rb_simple_node for search/insert */ u64 owner; struct btrfs_key key; u8 level; bool key_ready; }; #define MAX_EXTENTS 128 struct file_extent_cluster { u64 start; u64 end; u64 boundary[MAX_EXTENTS]; unsigned int nr; u64 owning_root; }; /* Stages of data relocation. */ enum reloc_stage { MOVE_DATA_EXTENTS, UPDATE_DATA_PTRS }; struct reloc_control { /* block group to relocate */ struct btrfs_block_group *block_group; /* extent tree */ struct btrfs_root *extent_root; /* inode for moving data */ struct inode *data_inode; struct btrfs_block_rsv *block_rsv; struct btrfs_backref_cache backref_cache; struct file_extent_cluster cluster; /* tree blocks have been processed */ struct extent_io_tree processed_blocks; /* map start of tree root to corresponding reloc tree */ struct mapping_tree reloc_root_tree; /* list of reloc trees */ struct list_head reloc_roots; /* list of subvolume trees that get relocated */ struct list_head dirty_subvol_roots; /* size of metadata reservation for merging reloc trees */ u64 merging_rsv_size; /* size of relocated tree nodes */ u64 nodes_relocated; /* reserved size for block group relocation*/ u64 reserved_bytes; u64 search_start; u64 extents_found; enum reloc_stage stage; bool create_reloc_tree; bool merge_reloc_tree; bool found_file_extent; }; static void mark_block_processed(struct reloc_control *rc, struct btrfs_backref_node *node) { u32 blocksize; if (node->level == 0 || in_range(node->bytenr, rc->block_group->start, rc->block_group->length)) { blocksize = rc->extent_root->fs_info->nodesize; set_extent_bit(&rc->processed_blocks, node->bytenr, node->bytenr + blocksize - 1, EXTENT_DIRTY, NULL); } node->processed = 1; } /* * walk up backref nodes until reach node presents tree root */ static struct btrfs_backref_node *walk_up_backref( struct btrfs_backref_node *node, struct btrfs_backref_edge *edges[], int *index) { struct btrfs_backref_edge *edge; int idx = *index; while (!list_empty(&node->upper)) { edge = list_entry(node->upper.next, struct btrfs_backref_edge, list[LOWER]); edges[idx++] = edge; node = edge->node[UPPER]; } BUG_ON(node->detached); *index = idx; return node; } /* * walk down backref nodes to find start of next reference path */ static struct btrfs_backref_node *walk_down_backref( struct btrfs_backref_edge *edges[], int *index) { struct btrfs_backref_edge *edge; struct btrfs_backref_node *lower; int idx = *index; while (idx > 0) { edge = edges[idx - 1]; lower = edge->node[LOWER]; if (list_is_last(&edge->list[LOWER], &lower->upper)) { idx--; continue; } edge = list_entry(edge->list[LOWER].next, struct btrfs_backref_edge, list[LOWER]); edges[idx - 1] = edge; *index = idx; return edge->node[UPPER]; } *index = 0; return NULL; } static void update_backref_node(struct btrfs_backref_cache *cache, struct btrfs_backref_node *node, u64 bytenr) { struct rb_node *rb_node; rb_erase(&node->rb_node, &cache->rb_root); node->bytenr = bytenr; rb_node = rb_simple_insert(&cache->rb_root, node->bytenr, &node->rb_node); if (rb_node) btrfs_backref_panic(cache->fs_info, bytenr, -EEXIST); } /* * update backref cache after a transaction commit */ static int update_backref_cache(struct btrfs_trans_handle *trans, struct btrfs_backref_cache *cache) { struct btrfs_backref_node *node; int level = 0; if (cache->last_trans == 0) { cache->last_trans = trans->transid; return 0; } if (cache->last_trans == trans->transid) return 0; /* * detached nodes are used to avoid unnecessary backref * lookup. transaction commit changes the extent tree. * so the detached nodes are no longer useful. */ while (!list_empty(&cache->detached)) { node = list_entry(cache->detached.next, struct btrfs_backref_node, list); btrfs_backref_cleanup_node(cache, node); } while (!list_empty(&cache->changed)) { node = list_entry(cache->changed.next, struct btrfs_backref_node, list); list_del_init(&node->list); BUG_ON(node->pending); update_backref_node(cache, node, node->new_bytenr); } /* * some nodes can be left in the pending list if there were * errors during processing the pending nodes. */ for (level = 0; level < BTRFS_MAX_LEVEL; level++) { list_for_each_entry(node, &cache->pending[level], list) { BUG_ON(!node->pending); if (node->bytenr == node->new_bytenr) continue; update_backref_node(cache, node, node->new_bytenr); } } cache->last_trans = 0; return 1; } static bool reloc_root_is_dead(const struct btrfs_root *root) { /* * Pair with set_bit/clear_bit in clean_dirty_subvols and * btrfs_update_reloc_root. We need to see the updated bit before * trying to access reloc_root */ smp_rmb(); if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)) return true; return false; } /* * Check if this subvolume tree has valid reloc tree. * * Reloc tree after swap is considered dead, thus not considered as valid. * This is enough for most callers, as they don't distinguish dead reloc root * from no reloc root. But btrfs_should_ignore_reloc_root() below is a * special case. */ static bool have_reloc_root(const struct btrfs_root *root) { if (reloc_root_is_dead(root)) return false; if (!root->reloc_root) return false; return true; } bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root) { struct btrfs_root *reloc_root; if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) return false; /* This root has been merged with its reloc tree, we can ignore it */ if (reloc_root_is_dead(root)) return true; reloc_root = root->reloc_root; if (!reloc_root) return false; if (btrfs_header_generation(reloc_root->commit_root) == root->fs_info->running_transaction->transid) return false; /* * If there is reloc tree and it was created in previous transaction * backref lookup can find the reloc tree, so backref node for the fs * tree root is useless for relocation. */ return true; } /* * find reloc tree by address of tree root */ struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr) { struct reloc_control *rc = fs_info->reloc_ctl; struct rb_node *rb_node; struct mapping_node *node; struct btrfs_root *root = NULL; ASSERT(rc); spin_lock(&rc->reloc_root_tree.lock); rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr); if (rb_node) { node = rb_entry(rb_node, struct mapping_node, rb_node); root = node->data; } spin_unlock(&rc->reloc_root_tree.lock); return btrfs_grab_root(root); } /* * For useless nodes, do two major clean ups: * * - Cleanup the children edges and nodes * If child node is also orphan (no parent) during cleanup, then the child * node will also be cleaned up. * * - Freeing up leaves (level 0), keeps nodes detached * For nodes, the node is still cached as "detached" * * Return false if @node is not in the @useless_nodes list. * Return true if @node is in the @useless_nodes list. */ static bool handle_useless_nodes(struct reloc_control *rc, struct btrfs_backref_node *node) { struct btrfs_backref_cache *cache = &rc->backref_cache; struct list_head *useless_node = &cache->useless_node; bool ret = false; while (!list_empty(useless_node)) { struct btrfs_backref_node *cur; cur = list_first_entry(useless_node, struct btrfs_backref_node, list); list_del_init(&cur->list); /* Only tree root nodes can be added to @useless_nodes */ ASSERT(list_empty(&cur->upper)); if (cur == node) ret = true; /* The node is the lowest node */ if (cur->lowest) { list_del_init(&cur->lower); cur->lowest = 0; } /* Cleanup the lower edges */ while (!list_empty(&cur->lower)) { struct btrfs_backref_edge *edge; struct btrfs_backref_node *lower; edge = list_entry(cur->lower.next, struct btrfs_backref_edge, list[UPPER]); list_del(&edge->list[UPPER]); list_del(&edge->list[LOWER]); lower = edge->node[LOWER]; btrfs_backref_free_edge(cache, edge); /* Child node is also orphan, queue for cleanup */ if (list_empty(&lower->upper)) list_add(&lower->list, useless_node); } /* Mark this block processed for relocation */ mark_block_processed(rc, cur); /* * Backref nodes for tree leaves are deleted from the cache. * Backref nodes for upper level tree blocks are left in the * cache to avoid unnecessary backref lookup. */ if (cur->level > 0) { list_add(&cur->list, &cache->detached); cur->detached = 1; } else { rb_erase(&cur->rb_node, &cache->rb_root); btrfs_backref_free_node(cache, cur); } } return ret; } /* * Build backref tree for a given tree block. Root of the backref tree * corresponds the tree block, leaves of the backref tree correspond roots of * b-trees that reference the tree block. * * The basic idea of this function is check backrefs of a given block to find * upper level blocks that reference the block, and then check backrefs of * these upper level blocks recursively. The recursion stops when tree root is * reached or backrefs for the block is cached. * * NOTE: if we find that backrefs for a block are cached, we know backrefs for * all upper level blocks that directly/indirectly reference the block are also * cached. */ static noinline_for_stack struct btrfs_backref_node *build_backref_tree( struct btrfs_trans_handle *trans, struct reloc_control *rc, struct btrfs_key *node_key, int level, u64 bytenr) { struct btrfs_backref_iter *iter; struct btrfs_backref_cache *cache = &rc->backref_cache; /* For searching parent of TREE_BLOCK_REF */ struct btrfs_path *path; struct btrfs_backref_node *cur; struct btrfs_backref_node *node = NULL; struct btrfs_backref_edge *edge; int ret; int err = 0; iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info); if (!iter) return ERR_PTR(-ENOMEM); path = btrfs_alloc_path(); if (!path) { err = -ENOMEM; goto out; } node = btrfs_backref_alloc_node(cache, bytenr, level); if (!node) { err = -ENOMEM; goto out; } node->lowest = 1; cur = node; /* Breadth-first search to build backref cache */ do { ret = btrfs_backref_add_tree_node(trans, cache, path, iter, node_key, cur); if (ret < 0) { err = ret; goto out; } edge = list_first_entry_or_null(&cache->pending_edge, struct btrfs_backref_edge, list[UPPER]); /* * The pending list isn't empty, take the first block to * process */ if (edge) { list_del_init(&edge->list[UPPER]); cur = edge->node[UPPER]; } } while (edge); /* Finish the upper linkage of newly added edges/nodes */ ret = btrfs_backref_finish_upper_links(cache, node); if (ret < 0) { err = ret; goto out; } if (handle_useless_nodes(rc, node)) node = NULL; out: btrfs_backref_iter_free(iter); btrfs_free_path(path); if (err) { btrfs_backref_error_cleanup(cache, node); return ERR_PTR(err); } ASSERT(!node || !node->detached); ASSERT(list_empty(&cache->useless_node) && list_empty(&cache->pending_edge)); return node; } /* * helper to add backref node for the newly created snapshot. * the backref node is created by cloning backref node that * corresponds to root of source tree */ static int clone_backref_node(struct btrfs_trans_handle *trans, struct reloc_control *rc, const struct btrfs_root *src, struct btrfs_root *dest) { struct btrfs_root *reloc_root = src->reloc_root; struct btrfs_backref_cache *cache = &rc->backref_cache; struct btrfs_backref_node *node = NULL; struct btrfs_backref_node *new_node; struct btrfs_backref_edge *edge; struct btrfs_backref_edge *new_edge; struct rb_node *rb_node; if (cache->last_trans > 0) update_backref_cache(trans, cache); rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start); if (rb_node) { node = rb_entry(rb_node, struct btrfs_backref_node, rb_node); if (node->detached) node = NULL; else BUG_ON(node->new_bytenr != reloc_root->node->start); } if (!node) { rb_node = rb_simple_search(&cache->rb_root, reloc_root->commit_root->start); if (rb_node) { node = rb_entry(rb_node, struct btrfs_backref_node, rb_node); BUG_ON(node->detached); } } if (!node) return 0; new_node = btrfs_backref_alloc_node(cache, dest->node->start, node->level); if (!new_node) return -ENOMEM; new_node->lowest = node->lowest; new_node->checked = 1; new_node->root = btrfs_grab_root(dest); ASSERT(new_node->root); if (!node->lowest) { list_for_each_entry(edge, &node->lower, list[UPPER]) { new_edge = btrfs_backref_alloc_edge(cache); if (!new_edge) goto fail; btrfs_backref_link_edge(new_edge, edge->node[LOWER], new_node, LINK_UPPER); } } else { list_add_tail(&new_node->lower, &cache->leaves); } rb_node = rb_simple_insert(&cache->rb_root, new_node->bytenr, &new_node->rb_node); if (rb_node) btrfs_backref_panic(trans->fs_info, new_node->bytenr, -EEXIST); if (!new_node->lowest) { list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) { list_add_tail(&new_edge->list[LOWER], &new_edge->node[LOWER]->upper); } } return 0; fail: while (!list_empty(&new_node->lower)) { new_edge = list_entry(new_node->lower.next, struct btrfs_backref_edge, list[UPPER]); list_del(&new_edge->list[UPPER]); btrfs_backref_free_edge(cache, new_edge); } btrfs_backref_free_node(cache, new_node); return -ENOMEM; } /* * helper to add 'address of tree root -> reloc tree' mapping */ static int __add_reloc_root(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *rb_node; struct mapping_node *node; struct reloc_control *rc = fs_info->reloc_ctl; node = kmalloc(sizeof(*node), GFP_NOFS); if (!node) return -ENOMEM; node->bytenr = root->commit_root->start; node->data = root; spin_lock(&rc->reloc_root_tree.lock); rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, node->bytenr, &node->rb_node); spin_unlock(&rc->reloc_root_tree.lock); if (rb_node) { btrfs_err(fs_info, "Duplicate root found for start=%llu while inserting into relocation tree", node->bytenr); return -EEXIST; } list_add_tail(&root->root_list, &rc->reloc_roots); return 0; } /* * helper to delete the 'address of tree root -> reloc tree' * mapping */ static void __del_reloc_root(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *rb_node; struct mapping_node *node = NULL; struct reloc_control *rc = fs_info->reloc_ctl; bool put_ref = false; if (rc && root->node) { spin_lock(&rc->reloc_root_tree.lock); rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, root->commit_root->start); if (rb_node) { node = rb_entry(rb_node, struct mapping_node, rb_node); rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); RB_CLEAR_NODE(&node->rb_node); } spin_unlock(&rc->reloc_root_tree.lock); ASSERT(!node || (struct btrfs_root *)node->data == root); } /* * We only put the reloc root here if it's on the list. There's a lot * of places where the pattern is to splice the rc->reloc_roots, process * the reloc roots, and then add the reloc root back onto * rc->reloc_roots. If we call __del_reloc_root while it's off of the * list we don't want the reference being dropped, because the guy * messing with the list is in charge of the reference. */ spin_lock(&fs_info->trans_lock); if (!list_empty(&root->root_list)) { put_ref = true; list_del_init(&root->root_list); } spin_unlock(&fs_info->trans_lock); if (put_ref) btrfs_put_root(root); kfree(node); } /* * helper to update the 'address of tree root -> reloc tree' * mapping */ static int __update_reloc_root(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *rb_node; struct mapping_node *node = NULL; struct reloc_control *rc = fs_info->reloc_ctl; spin_lock(&rc->reloc_root_tree.lock); rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, root->commit_root->start); if (rb_node) { node = rb_entry(rb_node, struct mapping_node, rb_node); rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); } spin_unlock(&rc->reloc_root_tree.lock); if (!node) return 0; BUG_ON((struct btrfs_root *)node->data != root); spin_lock(&rc->reloc_root_tree.lock); node->bytenr = root->node->start; rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, node->bytenr, &node->rb_node); spin_unlock(&rc->reloc_root_tree.lock); if (rb_node) btrfs_backref_panic(fs_info, node->bytenr, -EEXIST); return 0; } static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *reloc_root; struct extent_buffer *eb; struct btrfs_root_item *root_item; struct btrfs_key root_key; int ret = 0; bool must_abort = false; root_item = kmalloc(sizeof(*root_item), GFP_NOFS); if (!root_item) return ERR_PTR(-ENOMEM); root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; root_key.type = BTRFS_ROOT_ITEM_KEY; root_key.offset = objectid; if (root->root_key.objectid == objectid) { u64 commit_root_gen; /* called by btrfs_init_reloc_root */ ret = btrfs_copy_root(trans, root, root->commit_root, &eb, BTRFS_TREE_RELOC_OBJECTID); if (ret) goto fail; /* * Set the last_snapshot field to the generation of the commit * root - like this ctree.c:btrfs_block_can_be_shared() behaves * correctly (returns true) when the relocation root is created * either inside the critical section of a transaction commit * (through transaction.c:qgroup_account_snapshot()) and when * it's created before the transaction commit is started. */ commit_root_gen = btrfs_header_generation(root->commit_root); btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen); } else { /* * called by btrfs_reloc_post_snapshot_hook. * the source tree is a reloc tree, all tree blocks * modified after it was created have RELOC flag * set in their headers. so it's OK to not update * the 'last_snapshot'. */ ret = btrfs_copy_root(trans, root, root->node, &eb, BTRFS_TREE_RELOC_OBJECTID); if (ret) goto fail; } /* * We have changed references at this point, we must abort the * transaction if anything fails. */ must_abort = true; memcpy(root_item, &root->root_item, sizeof(*root_item)); btrfs_set_root_bytenr(root_item, eb->start); btrfs_set_root_level(root_item, btrfs_header_level(eb)); btrfs_set_root_generation(root_item, trans->transid); if (root->root_key.objectid == objectid) { btrfs_set_root_refs(root_item, 0); memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key)); btrfs_set_root_drop_level(root_item, 0); } btrfs_tree_unlock(eb); free_extent_buffer(eb); ret = btrfs_insert_root(trans, fs_info->tree_root, &root_key, root_item); if (ret) goto fail; kfree(root_item); reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key); if (IS_ERR(reloc_root)) { ret = PTR_ERR(reloc_root); goto abort; } set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state); reloc_root->last_trans = trans->transid; return reloc_root; fail: kfree(root_item); abort: if (must_abort) btrfs_abort_transaction(trans, ret); return ERR_PTR(ret); } /* * create reloc tree for a given fs tree. reloc tree is just a * snapshot of the fs tree with special root objectid. * * The reloc_root comes out of here with two references, one for * root->reloc_root, and another for being on the rc->reloc_roots list. */ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *reloc_root; struct reloc_control *rc = fs_info->reloc_ctl; struct btrfs_block_rsv *rsv; int clear_rsv = 0; int ret; if (!rc) return 0; /* * The subvolume has reloc tree but the swap is finished, no need to * create/update the dead reloc tree */ if (reloc_root_is_dead(root)) return 0; /* * This is subtle but important. We do not do * record_root_in_transaction for reloc roots, instead we record their * corresponding fs root, and then here we update the last trans for the * reloc root. This means that we have to do this for the entire life * of the reloc root, regardless of which stage of the relocation we are * in. */ if (root->reloc_root) { reloc_root = root->reloc_root; reloc_root->last_trans = trans->transid; return 0; } /* * We are merging reloc roots, we do not need new reloc trees. Also * reloc trees never need their own reloc tree. */ if (!rc->create_reloc_tree || root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) return 0; if (!trans->reloc_reserved) { rsv = trans->block_rsv; trans->block_rsv = rc->block_rsv; clear_rsv = 1; } reloc_root = create_reloc_root(trans, root, root->root_key.objectid); if (clear_rsv) trans->block_rsv = rsv; if (IS_ERR(reloc_root)) return PTR_ERR(reloc_root); ret = __add_reloc_root(reloc_root); ASSERT(ret != -EEXIST); if (ret) { /* Pairs with create_reloc_root */ btrfs_put_root(reloc_root); return ret; } root->reloc_root = btrfs_grab_root(reloc_root); return 0; } /* * update root item of reloc tree */ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *reloc_root; struct btrfs_root_item *root_item; int ret; if (!have_reloc_root(root)) return 0; reloc_root = root->reloc_root; root_item = &reloc_root->root_item; /* * We are probably ok here, but __del_reloc_root() will drop its ref of * the root. We have the ref for root->reloc_root, but just in case * hold it while we update the reloc root. */ btrfs_grab_root(reloc_root); /* root->reloc_root will stay until current relocation finished */ if (fs_info->reloc_ctl->merge_reloc_tree && btrfs_root_refs(root_item) == 0) { set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); /* * Mark the tree as dead before we change reloc_root so * have_reloc_root will not touch it from now on. */ smp_wmb(); __del_reloc_root(reloc_root); } if (reloc_root->commit_root != reloc_root->node) { __update_reloc_root(reloc_root); btrfs_set_root_node(root_item, reloc_root->node); free_extent_buffer(reloc_root->commit_root); reloc_root->commit_root = btrfs_root_node(reloc_root); } ret = btrfs_update_root(trans, fs_info->tree_root, &reloc_root->root_key, root_item); btrfs_put_root(reloc_root); return ret; } /* * helper to find first cached inode with inode number >= objectid * in a subvolume */ static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid) { struct rb_node *node; struct rb_node *prev; struct btrfs_inode *entry; struct inode *inode; spin_lock(&root->inode_lock); again: node = root->inode_tree.rb_node; prev = NULL; while (node) { prev = node; entry = rb_entry(node, struct btrfs_inode, rb_node); if (objectid < btrfs_ino(entry)) node = node->rb_left; else if (objectid > btrfs_ino(entry)) node = node->rb_right; else break; } if (!node) { while (prev) { entry = rb_entry(prev, struct btrfs_inode, rb_node); if (objectid <= btrfs_ino(entry)) { node = prev; break; } prev = rb_next(prev); } } while (node) { entry = rb_entry(node, struct btrfs_inode, rb_node); inode = igrab(&entry->vfs_inode); if (inode) { spin_unlock(&root->inode_lock); return inode; } objectid = btrfs_ino(entry) + 1; if (cond_resched_lock(&root->inode_lock)) goto again; node = rb_next(node); } spin_unlock(&root->inode_lock); return NULL; } /* * get new location of data */ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, u64 bytenr, u64 num_bytes) { struct btrfs_root *root = BTRFS_I(reloc_inode)->root; struct btrfs_path *path; struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; bytenr -= BTRFS_I(reloc_inode)->index_cnt; ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0); if (ret < 0) goto out; if (ret > 0) { ret = -ENOENT; goto out; } leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); BUG_ON(btrfs_file_extent_offset(leaf, fi) || btrfs_file_extent_compression(leaf, fi) || btrfs_file_extent_encryption(leaf, fi) || btrfs_file_extent_other_encoding(leaf, fi)); if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) { ret = -EINVAL; goto out; } *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); ret = 0; out: btrfs_free_path(path); return ret; } /* * update file extent items in the tree leaf to point to * the new locations. */ static noinline_for_stack int replace_file_extents(struct btrfs_trans_handle *trans, struct reloc_control *rc, struct btrfs_root *root, struct extent_buffer *leaf) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; struct btrfs_file_extent_item *fi; struct inode *inode = NULL; u64 parent; u64 bytenr; u64 new_bytenr = 0; u64 num_bytes; u64 end; u32 nritems; u32 i; int ret = 0; int first = 1; int dirty = 0; if (rc->stage != UPDATE_DATA_PTRS) return 0; /* reloc trees always use full backref */ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) parent = leaf->start; else parent = 0; nritems = btrfs_header_nritems(leaf); for (i = 0; i < nritems; i++) { struct btrfs_ref ref = { 0 }; cond_resched(); btrfs_item_key_to_cpu(leaf, &key, i); if (key.type != BTRFS_EXTENT_DATA_KEY) continue; fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) continue; bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); if (bytenr == 0) continue; if (!in_range(bytenr, rc->block_group->start, rc->block_group->length)) continue; /* * if we are modifying block in fs tree, wait for read_folio * to complete and drop the extent cache */ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { if (first) { inode = find_next_inode(root, key.objectid); first = 0; } else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) { btrfs_add_delayed_iput(BTRFS_I(inode)); inode = find_next_inode(root, key.objectid); } if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) { struct extent_state *cached_state = NULL; end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); WARN_ON(!IS_ALIGNED(key.offset, fs_info->sectorsize)); WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize)); end--; ret = try_lock_extent(&BTRFS_I(inode)->io_tree, key.offset, end, &cached_state); if (!ret) continue; btrfs_drop_extent_map_range(BTRFS_I(inode), key.offset, end, true); unlock_extent(&BTRFS_I(inode)->io_tree, key.offset, end, &cached_state); } } ret = get_new_location(rc->data_inode, &new_bytenr, bytenr, num_bytes); if (ret) { /* * Don't have to abort since we've not changed anything * in the file extent yet. */ break; } btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); dirty = 1; key.offset -= btrfs_file_extent_offset(leaf, fi); btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr, num_bytes, parent, root->root_key.objectid); btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), key.objectid, key.offset, root->root_key.objectid, false); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); break; } btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, num_bytes, parent, root->root_key.objectid); btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), key.objectid, key.offset, root->root_key.objectid, false); ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); break; } } if (dirty) btrfs_mark_buffer_dirty(trans, leaf); if (inode) btrfs_add_delayed_iput(BTRFS_I(inode)); return ret; } static noinline_for_stack int memcmp_node_keys(const struct extent_buffer *eb, int slot, const struct btrfs_path *path, int level) { struct btrfs_disk_key key1; struct btrfs_disk_key key2; btrfs_node_key(eb, &key1, slot); btrfs_node_key(path->nodes[level], &key2, path->slots[level]); return memcmp(&key1, &key2, sizeof(key1)); } /* * try to replace tree blocks in fs tree with the new blocks * in reloc tree. tree blocks haven't been modified since the * reloc tree was create can be replaced. * * if a block was replaced, level of the block + 1 is returned. * if no block got replaced, 0 is returned. if there are other * errors, a negative error number is returned. */ static noinline_for_stack int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, struct btrfs_root *dest, struct btrfs_root *src, struct btrfs_path *path, struct btrfs_key *next_key, int lowest_level, int max_level) { struct btrfs_fs_info *fs_info = dest->fs_info; struct extent_buffer *eb; struct extent_buffer *parent; struct btrfs_ref ref = { 0 }; struct btrfs_key key; u64 old_bytenr; u64 new_bytenr; u64 old_ptr_gen; u64 new_ptr_gen; u64 last_snapshot; u32 blocksize; int cow = 0; int level; int ret; int slot; ASSERT(src->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); ASSERT(dest->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); last_snapshot = btrfs_root_last_snapshot(&src->root_item); again: slot = path->slots[lowest_level]; btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); eb = btrfs_lock_root_node(dest); level = btrfs_header_level(eb); if (level < lowest_level) { btrfs_tree_unlock(eb); free_extent_buffer(eb); return 0; } if (cow) { ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb, BTRFS_NESTING_COW); if (ret) { btrfs_tree_unlock(eb); free_extent_buffer(eb); return ret; } } if (next_key) { next_key->objectid = (u64)-1; next_key->type = (u8)-1; next_key->offset = (u64)-1; } parent = eb; while (1) { level = btrfs_header_level(parent); ASSERT(level >= lowest_level); ret = btrfs_bin_search(parent, 0, &key, &slot); if (ret < 0) break; if (ret && slot > 0) slot--; if (next_key && slot + 1 < btrfs_header_nritems(parent)) btrfs_node_key_to_cpu(parent, next_key, slot + 1); old_bytenr = btrfs_node_blockptr(parent, slot); blocksize = fs_info->nodesize; old_ptr_gen = btrfs_node_ptr_generation(parent, slot); if (level <= max_level) { eb = path->nodes[level]; new_bytenr = btrfs_node_blockptr(eb, path->slots[level]); new_ptr_gen = btrfs_node_ptr_generation(eb, path->slots[level]); } else { new_bytenr = 0; new_ptr_gen = 0; } if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) { ret = level; break; } if (new_bytenr == 0 || old_ptr_gen > last_snapshot || memcmp_node_keys(parent, slot, path, level)) { if (level <= lowest_level) { ret = 0; break; } eb = btrfs_read_node_slot(parent, slot); if (IS_ERR(eb)) { ret = PTR_ERR(eb); break; } btrfs_tree_lock(eb); if (cow) { ret = btrfs_cow_block(trans, dest, eb, parent, slot, &eb, BTRFS_NESTING_COW); if (ret) { btrfs_tree_unlock(eb); free_extent_buffer(eb); break; } } btrfs_tree_unlock(parent); free_extent_buffer(parent); parent = eb; continue; } if (!cow) { btrfs_tree_unlock(parent); free_extent_buffer(parent); cow = 1; goto again; } btrfs_node_key_to_cpu(path->nodes[level], &key, path->slots[level]); btrfs_release_path(path); path->lowest_level = level; set_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state); ret = btrfs_search_slot(trans, src, &key, path, 0, 1); clear_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state); path->lowest_level = 0; if (ret) { if (ret > 0) ret = -ENOENT; break; } /* * Info qgroup to trace both subtrees. * * We must trace both trees. * 1) Tree reloc subtree * If not traced, we will leak data numbers * 2) Fs subtree * If not traced, we will double count old data * * We don't scan the subtree right now, but only record * the swapped tree blocks. * The real subtree rescan is delayed until we have new * CoW on the subtree root node before transaction commit. */ ret = btrfs_qgroup_add_swapped_blocks(trans, dest, rc->block_group, parent, slot, path->nodes[level], path->slots[level], last_snapshot); if (ret < 0) break; /* * swap blocks in fs tree and reloc tree. */ btrfs_set_node_blockptr(parent, slot, new_bytenr); btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen); btrfs_mark_buffer_dirty(trans, parent); btrfs_set_node_blockptr(path->nodes[level], path->slots[level], old_bytenr); btrfs_set_node_ptr_generation(path->nodes[level], path->slots[level], old_ptr_gen); btrfs_mark_buffer_dirty(trans, path->nodes[level]); btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr, blocksize, path->nodes[level]->start, src->root_key.objectid); btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid, 0, true); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); break; } btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr, blocksize, 0, dest->root_key.objectid); btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0, true); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); break; } /* We don't know the real owning_root, use 0. */ btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr, blocksize, path->nodes[level]->start, 0); btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid, 0, true); ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); break; } /* We don't know the real owning_root, use 0. */ btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr, blocksize, 0, 0); btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0, true); ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); break; } btrfs_unlock_up_safe(path, 0); ret = level; break; } btrfs_tree_unlock(parent); free_extent_buffer(parent); return ret; } /* * helper to find next relocated block in reloc tree */ static noinline_for_stack int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, int *level) { struct extent_buffer *eb; int i; u64 last_snapshot; u32 nritems; last_snapshot = btrfs_root_last_snapshot(&root->root_item); for (i = 0; i < *level; i++) { free_extent_buffer(path->nodes[i]); path->nodes[i] = NULL; } for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) { eb = path->nodes[i]; nritems = btrfs_header_nritems(eb); while (path->slots[i] + 1 < nritems) { path->slots[i]++; if (btrfs_node_ptr_generation(eb, path->slots[i]) <= last_snapshot) continue; *level = i; return 0; } free_extent_buffer(path->nodes[i]); path->nodes[i] = NULL; } return 1; } /* * walk down reloc tree to find relocated block of lowest level */ static noinline_for_stack int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, int *level) { struct extent_buffer *eb = NULL; int i; u64 ptr_gen = 0; u64 last_snapshot; u32 nritems; last_snapshot = btrfs_root_last_snapshot(&root->root_item); for (i = *level; i > 0; i--) { eb = path->nodes[i]; nritems = btrfs_header_nritems(eb); while (path->slots[i] < nritems) { ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]); if (ptr_gen > last_snapshot) break; path->slots[i]++; } if (path->slots[i] >= nritems) { if (i == *level) break; *level = i + 1; return 0; } if (i == 1) { *level = i; return 0; } eb = btrfs_read_node_slot(eb, path->slots[i]); if (IS_ERR(eb)) return PTR_ERR(eb); BUG_ON(btrfs_header_level(eb) != i - 1); path->nodes[i - 1] = eb; path->slots[i - 1] = 0; } return 1; } /* * invalidate extent cache for file extents whose key in range of * [min_key, max_key) */ static int invalidate_extent_cache(struct btrfs_root *root, const struct btrfs_key *min_key, const struct btrfs_key *max_key) { struct btrfs_fs_info *fs_info = root->fs_info; struct inode *inode = NULL; u64 objectid; u64 start, end; u64 ino; objectid = min_key->objectid; while (1) { struct extent_state *cached_state = NULL; cond_resched(); iput(inode); if (objectid > max_key->objectid) break; inode = find_next_inode(root, objectid); if (!inode) break; ino = btrfs_ino(BTRFS_I(inode)); if (ino > max_key->objectid) { iput(inode); break; } objectid = ino + 1; if (!S_ISREG(inode->i_mode)) continue; if (unlikely(min_key->objectid == ino)) { if (min_key->type > BTRFS_EXTENT_DATA_KEY) continue; if (min_key->type < BTRFS_EXTENT_DATA_KEY) start = 0; else { start = min_key->offset; WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize)); } } else { start = 0; } if (unlikely(max_key->objectid == ino)) { if (max_key->type < BTRFS_EXTENT_DATA_KEY) continue; if (max_key->type > BTRFS_EXTENT_DATA_KEY) { end = (u64)-1; } else { if (max_key->offset == 0) continue; end = max_key->offset; WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize)); end--; } } else { end = (u64)-1; } /* the lock_extent waits for read_folio to complete */ lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, true); unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); } return 0; } static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key) { while (level < BTRFS_MAX_LEVEL) { if (!path->nodes[level]) break; if (path->slots[level] + 1 < btrfs_header_nritems(path->nodes[level])) { btrfs_node_key_to_cpu(path->nodes[level], key, path->slots[level] + 1); return 0; } level++; } return 1; } /* * Insert current subvolume into reloc_control::dirty_subvol_roots */ static int insert_dirty_subvol(struct btrfs_trans_handle *trans, struct reloc_control *rc, struct btrfs_root *root) { struct btrfs_root *reloc_root = root->reloc_root; struct btrfs_root_item *reloc_root_item; int ret; /* @root must be a subvolume tree root with a valid reloc tree */ ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); ASSERT(reloc_root); reloc_root_item = &reloc_root->root_item; memset(&reloc_root_item->drop_progress, 0, sizeof(reloc_root_item->drop_progress)); btrfs_set_root_drop_level(reloc_root_item, 0); btrfs_set_root_refs(reloc_root_item, 0); ret = btrfs_update_reloc_root(trans, root); if (ret) return ret; if (list_empty(&root->reloc_dirty_list)) { btrfs_grab_root(root); list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots); } return 0; } static int clean_dirty_subvols(struct reloc_control *rc) { struct btrfs_root *root; struct btrfs_root *next; int ret = 0; int ret2; list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots, reloc_dirty_list) { if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { /* Merged subvolume, cleanup its reloc root */ struct btrfs_root *reloc_root = root->reloc_root; list_del_init(&root->reloc_dirty_list); root->reloc_root = NULL; /* * Need barrier to ensure clear_bit() only happens after * root->reloc_root = NULL. Pairs with have_reloc_root. */ smp_wmb(); clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); if (reloc_root) { /* * btrfs_drop_snapshot drops our ref we hold for * ->reloc_root. If it fails however we must * drop the ref ourselves. */ ret2 = btrfs_drop_snapshot(reloc_root, 0, 1); if (ret2 < 0) { btrfs_put_root(reloc_root); if (!ret) ret = ret2; } } btrfs_put_root(root); } else { /* Orphan reloc tree, just clean it up */ ret2 = btrfs_drop_snapshot(root, 0, 1); if (ret2 < 0) { btrfs_put_root(root); if (!ret) ret = ret2; } } } return ret; } /* * merge the relocated tree blocks in reloc tree with corresponding * fs tree. */ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, struct btrfs_root *root) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; struct btrfs_key key; struct btrfs_key next_key; struct btrfs_trans_handle *trans = NULL; struct btrfs_root *reloc_root; struct btrfs_root_item *root_item; struct btrfs_path *path; struct extent_buffer *leaf; int reserve_level; int level; int max_level; int replaced = 0; int ret = 0; u32 min_reserved; path = btrfs_alloc_path(); if (!path) return -ENOMEM; path->reada = READA_FORWARD; reloc_root = root->reloc_root; root_item = &reloc_root->root_item; if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { level = btrfs_root_level(root_item); atomic_inc(&reloc_root->node->refs); path->nodes[level] = reloc_root->node; path->slots[level] = 0; } else { btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); level = btrfs_root_drop_level(root_item); BUG_ON(level == 0); path->lowest_level = level; ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0); path->lowest_level = 0; if (ret < 0) { btrfs_free_path(path); return ret; } btrfs_node_key_to_cpu(path->nodes[level], &next_key, path->slots[level]); WARN_ON(memcmp(&key, &next_key, sizeof(key))); btrfs_unlock_up_safe(path, 0); } /* * In merge_reloc_root(), we modify the upper level pointer to swap the * tree blocks between reloc tree and subvolume tree. Thus for tree * block COW, we COW at most from level 1 to root level for each tree. * * Thus the needed metadata size is at most root_level * nodesize, * and * 2 since we have two trees to COW. */ reserve_level = max_t(int, 1, btrfs_root_level(root_item)); min_reserved = fs_info->nodesize * reserve_level * 2; memset(&next_key, 0, sizeof(next_key)); while (1) { ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, min_reserved, BTRFS_RESERVE_FLUSH_LIMIT); if (ret) goto out; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; goto out; } /* * At this point we no longer have a reloc_control, so we can't * depend on btrfs_init_reloc_root to update our last_trans. * * But that's ok, we started the trans handle on our * corresponding fs_root, which means it's been added to the * dirty list. At commit time we'll still call * btrfs_update_reloc_root() and update our root item * appropriately. */ reloc_root->last_trans = trans->transid; trans->block_rsv = rc->block_rsv; replaced = 0; max_level = level; ret = walk_down_reloc_tree(reloc_root, path, &level); if (ret < 0) goto out; if (ret > 0) break; if (!find_next_key(path, level, &key) && btrfs_comp_cpu_keys(&next_key, &key) >= 0) { ret = 0; } else { ret = replace_path(trans, rc, root, reloc_root, path, &next_key, level, max_level); } if (ret < 0) goto out; if (ret > 0) { level = ret; btrfs_node_key_to_cpu(path->nodes[level], &key, path->slots[level]); replaced = 1; } ret = walk_up_reloc_tree(reloc_root, path, &level); if (ret > 0) break; BUG_ON(level == 0); /* * save the merging progress in the drop_progress. * this is OK since root refs == 1 in this case. */ btrfs_node_key(path->nodes[level], &root_item->drop_progress, path->slots[level]); btrfs_set_root_drop_level(root_item, level); btrfs_end_transaction_throttle(trans); trans = NULL; btrfs_btree_balance_dirty(fs_info); if (replaced && rc->stage == UPDATE_DATA_PTRS) invalidate_extent_cache(root, &key, &next_key); } /* * handle the case only one block in the fs tree need to be * relocated and the block is tree root. */ leaf = btrfs_lock_root_node(root); ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf, BTRFS_NESTING_COW); btrfs_tree_unlock(leaf); free_extent_buffer(leaf); out: btrfs_free_path(path); if (ret == 0) { ret = insert_dirty_subvol(trans, rc, root); if (ret) btrfs_abort_transaction(trans, ret); } if (trans) btrfs_end_transaction_throttle(trans); btrfs_btree_balance_dirty(fs_info); if (replaced && rc->stage == UPDATE_DATA_PTRS) invalidate_extent_cache(root, &key, &next_key); return ret; } static noinline_for_stack int prepare_to_merge(struct reloc_control *rc, int err) { struct btrfs_root *root = rc->extent_root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *reloc_root; struct btrfs_trans_handle *trans; LIST_HEAD(reloc_roots); u64 num_bytes = 0; int ret; mutex_lock(&fs_info->reloc_mutex); rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; rc->merging_rsv_size += rc->nodes_relocated * 2; mutex_unlock(&fs_info->reloc_mutex); again: if (!err) { num_bytes = rc->merging_rsv_size; ret = btrfs_block_rsv_add(fs_info, rc->block_rsv, num_bytes, BTRFS_RESERVE_FLUSH_ALL); if (ret) err = ret; } trans = btrfs_join_transaction(rc->extent_root); if (IS_ERR(trans)) { if (!err) btrfs_block_rsv_release(fs_info, rc->block_rsv, num_bytes, NULL); return PTR_ERR(trans); } if (!err) { if (num_bytes != rc->merging_rsv_size) { btrfs_end_transaction(trans); btrfs_block_rsv_release(fs_info, rc->block_rsv, num_bytes, NULL); goto again; } } rc->merge_reloc_tree = true; while (!list_empty(&rc->reloc_roots)) { reloc_root = list_entry(rc->reloc_roots.next, struct btrfs_root, root_list); list_del_init(&reloc_root->root_list); root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); if (IS_ERR(root)) { /* * Even if we have an error we need this reloc root * back on our list so we can clean up properly. */ list_add(&reloc_root->root_list, &reloc_roots); btrfs_abort_transaction(trans, (int)PTR_ERR(root)); if (!err) err = PTR_ERR(root); break; } if (unlikely(root->reloc_root != reloc_root)) { if (root->reloc_root) { btrfs_err(fs_info, "reloc tree mismatch, root %lld has reloc root key (%lld %u %llu) gen %llu, expect reloc root key (%lld %u %llu) gen %llu", root->root_key.objectid, root->reloc_root->root_key.objectid, root->reloc_root->root_key.type, root->reloc_root->root_key.offset, btrfs_root_generation( &root->reloc_root->root_item), reloc_root->root_key.objectid, reloc_root->root_key.type, reloc_root->root_key.offset, btrfs_root_generation( &reloc_root->root_item)); } else { btrfs_err(fs_info, "reloc tree mismatch, root %lld has no reloc root, expect reloc root key (%lld %u %llu) gen %llu", root->root_key.objectid, reloc_root->root_key.objectid, reloc_root->root_key.type, reloc_root->root_key.offset, btrfs_root_generation( &reloc_root->root_item)); } list_add(&reloc_root->root_list, &reloc_roots); btrfs_put_root(root); btrfs_abort_transaction(trans, -EUCLEAN); if (!err) err = -EUCLEAN; break; } /* * set reference count to 1, so btrfs_recover_relocation * knows it should resumes merging */ if (!err) btrfs_set_root_refs(&reloc_root->root_item, 1); ret = btrfs_update_reloc_root(trans, root); /* * Even if we have an error we need this reloc root back on our * list so we can clean up properly. */ list_add(&reloc_root->root_list, &reloc_roots); btrfs_put_root(root); if (ret) { btrfs_abort_transaction(trans, ret); if (!err) err = ret; break; } } list_splice(&reloc_roots, &rc->reloc_roots); if (!err) err = btrfs_commit_transaction(trans); else btrfs_end_transaction(trans); return err; } static noinline_for_stack void free_reloc_roots(struct list_head *list) { struct btrfs_root *reloc_root, *tmp; list_for_each_entry_safe(reloc_root, tmp, list, root_list) __del_reloc_root(reloc_root); } static noinline_for_stack void merge_reloc_roots(struct reloc_control *rc) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; struct btrfs_root *root; struct btrfs_root *reloc_root; LIST_HEAD(reloc_roots); int found = 0; int ret = 0; again: root = rc->extent_root; /* * this serializes us with btrfs_record_root_in_transaction, * we have to make sure nobody is in the middle of * adding their roots to the list while we are * doing this splice */ mutex_lock(&fs_info->reloc_mutex); list_splice_init(&rc->reloc_roots, &reloc_roots); mutex_unlock(&fs_info->reloc_mutex); while (!list_empty(&reloc_roots)) { found = 1; reloc_root = list_entry(reloc_roots.next, struct btrfs_root, root_list); root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); if (btrfs_root_refs(&reloc_root->root_item) > 0) { if (WARN_ON(IS_ERR(root))) { /* * For recovery we read the fs roots on mount, * and if we didn't find the root then we marked * the reloc root as a garbage root. For normal * relocation obviously the root should exist in * memory. However there's no reason we can't * handle the error properly here just in case. */ ret = PTR_ERR(root); goto out; } if (WARN_ON(root->reloc_root != reloc_root)) { /* * This can happen if on-disk metadata has some * corruption, e.g. bad reloc tree key offset. */ ret = -EINVAL; goto out; } ret = merge_reloc_root(rc, root); btrfs_put_root(root); if (ret) { if (list_empty(&reloc_root->root_list)) list_add_tail(&reloc_root->root_list, &reloc_roots); goto out; } } else { if (!IS_ERR(root)) { if (root->reloc_root == reloc_root) { root->reloc_root = NULL; btrfs_put_root(reloc_root); } clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); btrfs_put_root(root); } list_del_init(&reloc_root->root_list); /* Don't forget to queue this reloc root for cleanup */ list_add_tail(&reloc_root->reloc_dirty_list, &rc->dirty_subvol_roots); } } if (found) { found = 0; goto again; } out: if (ret) { btrfs_handle_fs_error(fs_info, ret, NULL); free_reloc_roots(&reloc_roots); /* new reloc root may be added */ mutex_lock(&fs_info->reloc_mutex); list_splice_init(&rc->reloc_roots, &reloc_roots); mutex_unlock(&fs_info->reloc_mutex); free_reloc_roots(&reloc_roots); } /* * We used to have * * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); * * here, but it's wrong. If we fail to start the transaction in * prepare_to_merge() we will have only 0 ref reloc roots, none of which * have actually been removed from the reloc_root_tree rb tree. This is * fine because we're bailing here, and we hold a reference on the root * for the list that holds it, so these roots will be cleaned up when we * do the reloc_dirty_list afterwards. Meanwhile the root->reloc_root * will be cleaned up on unmount. * * The remaining nodes will be cleaned up by free_reloc_control. */ } static void free_block_list(struct rb_root *blocks) { struct tree_block *block; struct rb_node *rb_node; while ((rb_node = rb_first(blocks))) { block = rb_entry(rb_node, struct tree_block, rb_node); rb_erase(rb_node, blocks); kfree(block); } } static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *reloc_root) { struct btrfs_fs_info *fs_info = reloc_root->fs_info; struct btrfs_root *root; int ret; if (reloc_root->last_trans == trans->transid) return 0; root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); /* * This should succeed, since we can't have a reloc root without having * already looked up the actual root and created the reloc root for this * root. * * However if there's some sort of corruption where we have a ref to a * reloc root without a corresponding root this could return ENOENT. */ if (IS_ERR(root)) { ASSERT(0); return PTR_ERR(root); } if (root->reloc_root != reloc_root) { ASSERT(0); btrfs_err(fs_info, "root %llu has two reloc roots associated with it", reloc_root->root_key.offset); btrfs_put_root(root); return -EUCLEAN; } ret = btrfs_record_root_in_trans(trans, root); btrfs_put_root(root); return ret; } static noinline_for_stack struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, struct reloc_control *rc, struct btrfs_backref_node *node, struct btrfs_backref_edge *edges[]) { struct btrfs_backref_node *next; struct btrfs_root *root; int index = 0; int ret; next = node; while (1) { cond_resched(); next = walk_up_backref(next, edges, &index); root = next->root; /* * If there is no root, then our references for this block are * incomplete, as we should be able to walk all the way up to a * block that is owned by a root. * * This path is only for SHAREABLE roots, so if we come upon a * non-SHAREABLE root then we have backrefs that resolve * improperly. * * Both of these cases indicate file system corruption, or a bug * in the backref walking code. */ if (!root) { ASSERT(0); btrfs_err(trans->fs_info, "bytenr %llu doesn't have a backref path ending in a root", node->bytenr); return ERR_PTR(-EUCLEAN); } if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { ASSERT(0); btrfs_err(trans->fs_info, "bytenr %llu has multiple refs with one ending in a non-shareable root", node->bytenr); return ERR_PTR(-EUCLEAN); } if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { ret = record_reloc_root_in_trans(trans, root); if (ret) return ERR_PTR(ret); break; } ret = btrfs_record_root_in_trans(trans, root); if (ret) return ERR_PTR(ret); root = root->reloc_root; /* * We could have raced with another thread which failed, so * root->reloc_root may not be set, return ENOENT in this case. */ if (!root) return ERR_PTR(-ENOENT); if (next->new_bytenr != root->node->start) { /* * We just created the reloc root, so we shouldn't have * ->new_bytenr set and this shouldn't be in the changed * list. If it is then we have multiple roots pointing * at the same bytenr which indicates corruption, or * we've made a mistake in the backref walking code. */ ASSERT(next->new_bytenr == 0); ASSERT(list_empty(&next->list)); if (next->new_bytenr || !list_empty(&next->list)) { btrfs_err(trans->fs_info, "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu", node->bytenr, next->bytenr); return ERR_PTR(-EUCLEAN); } next->new_bytenr = root->node->start; btrfs_put_root(next->root); next->root = btrfs_grab_root(root); ASSERT(next->root); list_add_tail(&next->list, &rc->backref_cache.changed); mark_block_processed(rc, next); break; } WARN_ON(1); root = NULL; next = walk_down_backref(edges, &index); if (!next || next->level <= node->level) break; } if (!root) { /* * This can happen if there's fs corruption or if there's a bug * in the backref lookup code. */ ASSERT(0); return ERR_PTR(-ENOENT); } next = node; /* setup backref node path for btrfs_reloc_cow_block */ while (1) { rc->backref_cache.path[next->level] = next; if (--index < 0) break; next = edges[index]->node[UPPER]; } return root; } /* * Select a tree root for relocation. * * Return NULL if the block is not shareable. We should use do_relocation() in * this case. * * Return a tree root pointer if the block is shareable. * Return -ENOENT if the block is root of reloc tree. */ static noinline_for_stack struct btrfs_root *select_one_root(struct btrfs_backref_node *node) { struct btrfs_backref_node *next; struct btrfs_root *root; struct btrfs_root *fs_root = NULL; struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; int index = 0; next = node; while (1) { cond_resched(); next = walk_up_backref(next, edges, &index); root = next->root; /* * This can occur if we have incomplete extent refs leading all * the way up a particular path, in this case return -EUCLEAN. */ if (!root) return ERR_PTR(-EUCLEAN); /* No other choice for non-shareable tree */ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) return root; if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) fs_root = root; if (next != node) return NULL; next = walk_down_backref(edges, &index); if (!next || next->level <= node->level) break; } if (!fs_root) return ERR_PTR(-ENOENT); return fs_root; } static noinline_for_stack u64 calcu_metadata_size(struct reloc_control *rc, struct btrfs_backref_node *node, int reserve) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; struct btrfs_backref_node *next = node; struct btrfs_backref_edge *edge; struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; u64 num_bytes = 0; int index = 0; BUG_ON(reserve && node->processed); while (next) { cond_resched(); while (1) { if (next->processed && (reserve || next != node)) break; num_bytes += fs_info->nodesize; if (list_empty(&next->upper)) break; edge = list_entry(next->upper.next, struct btrfs_backref_edge, list[LOWER]); edges[index++] = edge; next = edge->node[UPPER]; } next = walk_down_backref(edges, &index); } return num_bytes; } static int reserve_metadata_space(struct btrfs_trans_handle *trans, struct reloc_control *rc, struct btrfs_backref_node *node) { struct btrfs_root *root = rc->extent_root; struct btrfs_fs_info *fs_info = root->fs_info; u64 num_bytes; int ret; u64 tmp; num_bytes = calcu_metadata_size(rc, node, 1) * 2; trans->block_rsv = rc->block_rsv; rc->reserved_bytes += num_bytes; /* * We are under a transaction here so we can only do limited flushing. * If we get an enospc just kick back -EAGAIN so we know to drop the * transaction and try to refill when we can flush all the things. */ ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes, BTRFS_RESERVE_FLUSH_LIMIT); if (ret) { tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES; while (tmp <= rc->reserved_bytes) tmp <<= 1; /* * only one thread can access block_rsv at this point, * so we don't need hold lock to protect block_rsv. * we expand more reservation size here to allow enough * space for relocation and we will return earlier in * enospc case. */ rc->block_rsv->size = tmp + fs_info->nodesize * RELOCATION_RESERVED_NODES; return -EAGAIN; } return 0; } /* * relocate a block tree, and then update pointers in upper level * blocks that reference the block to point to the new location. * * if called by link_to_upper, the block has already been relocated. * in that case this function just updates pointers. */ static int do_relocation(struct btrfs_trans_handle *trans, struct reloc_control *rc, struct btrfs_backref_node *node, struct btrfs_key *key, struct btrfs_path *path, int lowest) { struct btrfs_backref_node *upper; struct btrfs_backref_edge *edge; struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; struct btrfs_root *root; struct extent_buffer *eb; u32 blocksize; u64 bytenr; int slot; int ret = 0; /* * If we are lowest then this is the first time we're processing this * block, and thus shouldn't have an eb associated with it yet. */ ASSERT(!lowest || !node->eb); path->lowest_level = node->level + 1; rc->backref_cache.path[node->level] = node; list_for_each_entry(edge, &node->upper, list[LOWER]) { struct btrfs_ref ref = { 0 }; cond_resched(); upper = edge->node[UPPER]; root = select_reloc_root(trans, rc, upper, edges); if (IS_ERR(root)) { ret = PTR_ERR(root); goto next; } if (upper->eb && !upper->locked) { if (!lowest) { ret = btrfs_bin_search(upper->eb, 0, key, &slot); if (ret < 0) goto next; BUG_ON(ret); bytenr = btrfs_node_blockptr(upper->eb, slot); if (node->eb->start == bytenr) goto next; } btrfs_backref_drop_node_buffer(upper); } if (!upper->eb) { ret = btrfs_search_slot(trans, root, key, path, 0, 1); if (ret) { if (ret > 0) ret = -ENOENT; btrfs_release_path(path); break; } if (!upper->eb) { upper->eb = path->nodes[upper->level]; path->nodes[upper->level] = NULL; } else { BUG_ON(upper->eb != path->nodes[upper->level]); } upper->locked = 1; path->locks[upper->level] = 0; slot = path->slots[upper->level]; btrfs_release_path(path); } else { ret = btrfs_bin_search(upper->eb, 0, key, &slot); if (ret < 0) goto next; BUG_ON(ret); } bytenr = btrfs_node_blockptr(upper->eb, slot); if (lowest) { if (bytenr != node->bytenr) { btrfs_err(root->fs_info, "lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu", bytenr, node->bytenr, slot, upper->eb->start); ret = -EIO; goto next; } } else { if (node->eb->start == bytenr) goto next; } blocksize = root->fs_info->nodesize; eb = btrfs_read_node_slot(upper->eb, slot); if (IS_ERR(eb)) { ret = PTR_ERR(eb); goto next; } btrfs_tree_lock(eb); if (!node->eb) { ret = btrfs_cow_block(trans, root, eb, upper->eb, slot, &eb, BTRFS_NESTING_COW); btrfs_tree_unlock(eb); free_extent_buffer(eb); if (ret < 0) goto next; /* * We've just COWed this block, it should have updated * the correct backref node entry. */ ASSERT(node->eb == eb); } else { btrfs_set_node_blockptr(upper->eb, slot, node->eb->start); btrfs_set_node_ptr_generation(upper->eb, slot, trans->transid); btrfs_mark_buffer_dirty(trans, upper->eb); btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, node->eb->start, blocksize, upper->eb->start, btrfs_header_owner(upper->eb)); btrfs_init_tree_ref(&ref, node->level, btrfs_header_owner(upper->eb), root->root_key.objectid, false); ret = btrfs_inc_extent_ref(trans, &ref); if (!ret) ret = btrfs_drop_subtree(trans, root, eb, upper->eb); if (ret) btrfs_abort_transaction(trans, ret); } next: if (!upper->pending) btrfs_backref_drop_node_buffer(upper); else btrfs_backref_unlock_node_buffer(upper); if (ret) break; } if (!ret && node->pending) { btrfs_backref_drop_node_buffer(node); list_move_tail(&node->list, &rc->backref_cache.changed); node->pending = 0; } path->lowest_level = 0; /* * We should have allocated all of our space in the block rsv and thus * shouldn't ENOSPC. */ ASSERT(ret != -ENOSPC); return ret; } static int link_to_upper(struct btrfs_trans_handle *trans, struct reloc_control *rc, struct btrfs_backref_node *node, struct btrfs_path *path) { struct btrfs_key key; btrfs_node_key_to_cpu(node->eb, &key, 0); return do_relocation(trans, rc, node, &key, path, 0); } static int finish_pending_nodes(struct btrfs_trans_handle *trans, struct reloc_control *rc, struct btrfs_path *path, int err) { LIST_HEAD(list); struct btrfs_backref_cache *cache = &rc->backref_cache; struct btrfs_backref_node *node; int level; int ret; for (level = 0; level < BTRFS_MAX_LEVEL; level++) { while (!list_empty(&cache->pending[level])) { node = list_entry(cache->pending[level].next, struct btrfs_backref_node, list); list_move_tail(&node->list, &list); BUG_ON(!node->pending); if (!err) { ret = link_to_upper(trans, rc, node, path); if (ret < 0) err = ret; } } list_splice_init(&list, &cache->pending[level]); } return err; } /* * mark a block and all blocks directly/indirectly reference the block * as processed. */ static void update_processed_blocks(struct reloc_control *rc, struct btrfs_backref_node *node) { struct btrfs_backref_node *next = node; struct btrfs_backref_edge *edge; struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; int index = 0; while (next) { cond_resched(); while (1) { if (next->processed) break; mark_block_processed(rc, next); if (list_empty(&next->upper)) break; edge = list_entry(next->upper.next, struct btrfs_backref_edge, list[LOWER]); edges[index++] = edge; next = edge->node[UPPER]; } next = walk_down_backref(edges, &index); } } static int tree_block_processed(u64 bytenr, struct reloc_control *rc) { u32 blocksize = rc->extent_root->fs_info->nodesize; if (test_range_bit(&rc->processed_blocks, bytenr, bytenr + blocksize - 1, EXTENT_DIRTY, NULL)) return 1; return 0; } static int get_tree_block_key(struct btrfs_fs_info *fs_info, struct tree_block *block) { struct btrfs_tree_parent_check check = { .level = block->level, .owner_root = block->owner, .transid = block->key.offset }; struct extent_buffer *eb; eb = read_tree_block(fs_info, block->bytenr, &check); if (IS_ERR(eb)) return PTR_ERR(eb); if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); return -EIO; } if (block->level == 0) btrfs_item_key_to_cpu(eb, &block->key, 0); else btrfs_node_key_to_cpu(eb, &block->key, 0); free_extent_buffer(eb); block->key_ready = true; return 0; } /* * helper function to relocate a tree block */ static int relocate_tree_block(struct btrfs_trans_handle *trans, struct reloc_control *rc, struct btrfs_backref_node *node, struct btrfs_key *key, struct btrfs_path *path) { struct btrfs_root *root; int ret = 0; if (!node) return 0; /* * If we fail here we want to drop our backref_node because we are going * to start over and regenerate the tree for it. */ ret = reserve_metadata_space(trans, rc, node); if (ret) goto out; BUG_ON(node->processed); root = select_one_root(node); if (IS_ERR(root)) { ret = PTR_ERR(root); /* See explanation in select_one_root for the -EUCLEAN case. */ ASSERT(ret == -ENOENT); if (ret == -ENOENT) { ret = 0; update_processed_blocks(rc, node); } goto out; } if (root) { if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { /* * This block was the root block of a root, and this is * the first time we're processing the block and thus it * should not have had the ->new_bytenr modified and * should have not been included on the changed list. * * However in the case of corruption we could have * multiple refs pointing to the same block improperly, * and thus we would trip over these checks. ASSERT() * for the developer case, because it could indicate a * bug in the backref code, however error out for a * normal user in the case of corruption. */ ASSERT(node->new_bytenr == 0); ASSERT(list_empty(&node->list)); if (node->new_bytenr || !list_empty(&node->list)) { btrfs_err(root->fs_info, "bytenr %llu has improper references to it", node->bytenr); ret = -EUCLEAN; goto out; } ret = btrfs_record_root_in_trans(trans, root); if (ret) goto out; /* * Another thread could have failed, need to check if we * have reloc_root actually set. */ if (!root->reloc_root) { ret = -ENOENT; goto out; } root = root->reloc_root; node->new_bytenr = root->node->start; btrfs_put_root(node->root); node->root = btrfs_grab_root(root); ASSERT(node->root); list_add_tail(&node->list, &rc->backref_cache.changed); } else { path->lowest_level = node->level; if (root == root->fs_info->chunk_root) btrfs_reserve_chunk_metadata(trans, false); ret = btrfs_search_slot(trans, root, key, path, 0, 1); btrfs_release_path(path); if (root == root->fs_info->chunk_root) btrfs_trans_release_chunk_metadata(trans); if (ret > 0) ret = 0; } if (!ret) update_processed_blocks(rc, node); } else { ret = do_relocation(trans, rc, node, key, path, 1); } out: if (ret || node->level == 0 || node->cowonly) btrfs_backref_cleanup_node(&rc->backref_cache, node); return ret; } /* * relocate a list of blocks */ static noinline_for_stack int relocate_tree_blocks(struct btrfs_trans_handle *trans, struct reloc_control *rc, struct rb_root *blocks) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; struct btrfs_backref_node *node; struct btrfs_path *path; struct tree_block *block; struct tree_block *next; int ret; int err = 0; path = btrfs_alloc_path(); if (!path) { err = -ENOMEM; goto out_free_blocks; } /* Kick in readahead for tree blocks with missing keys */ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { if (!block->key_ready) btrfs_readahead_tree_block(fs_info, block->bytenr, block->owner, 0, block->level); } /* Get first keys */ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { if (!block->key_ready) { err = get_tree_block_key(fs_info, block); if (err) goto out_free_path; } } /* Do tree relocation */ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { node = build_backref_tree(trans, rc, &block->key, block->level, block->bytenr); if (IS_ERR(node)) { err = PTR_ERR(node); goto out; } ret = relocate_tree_block(trans, rc, node, &block->key, path); if (ret < 0) { err = ret; break; } } out: err = finish_pending_nodes(trans, rc, path, err); out_free_path: btrfs_free_path(path); out_free_blocks: free_block_list(blocks); return err; } static noinline_for_stack int prealloc_file_extent_cluster( struct btrfs_inode *inode, const struct file_extent_cluster *cluster) { u64 alloc_hint = 0; u64 start; u64 end; u64 offset = inode->index_cnt; u64 num_bytes; int nr; int ret = 0; u64 i_size = i_size_read(&inode->vfs_inode); u64 prealloc_start = cluster->start - offset; u64 prealloc_end = cluster->end - offset; u64 cur_offset = prealloc_start; /* * For subpage case, previous i_size may not be aligned to PAGE_SIZE. * This means the range [i_size, PAGE_END + 1) is filled with zeros by * btrfs_do_readpage() call of previously relocated file cluster. * * If the current cluster starts in the above range, btrfs_do_readpage() * will skip the read, and relocate_one_folio() will later writeback * the padding zeros as new data, causing data corruption. * * Here we have to manually invalidate the range (i_size, PAGE_END + 1). */ if (!PAGE_ALIGNED(i_size)) { struct address_space *mapping = inode->vfs_inode.i_mapping; struct btrfs_fs_info *fs_info = inode->root->fs_info; const u32 sectorsize = fs_info->sectorsize; struct folio *folio; ASSERT(sectorsize < PAGE_SIZE); ASSERT(IS_ALIGNED(i_size, sectorsize)); /* * Subpage can't handle page with DIRTY but without UPTODATE * bit as it can lead to the following deadlock: * * btrfs_read_folio() * | Page already *locked* * |- btrfs_lock_and_flush_ordered_range() * |- btrfs_start_ordered_extent() * |- extent_write_cache_pages() * |- lock_page() * We try to lock the page we already hold. * * Here we just writeback the whole data reloc inode, so that * we will be ensured to have no dirty range in the page, and * are safe to clear the uptodate bits. * * This shouldn't cause too much overhead, as we need to write * the data back anyway. */ ret = filemap_write_and_wait(mapping); if (ret < 0) return ret; clear_extent_bits(&inode->io_tree, i_size, round_up(i_size, PAGE_SIZE) - 1, EXTENT_UPTODATE); folio = filemap_lock_folio(mapping, i_size >> PAGE_SHIFT); /* * If page is freed we don't need to do anything then, as we * will re-read the whole page anyway. */ if (!IS_ERR(folio)) { btrfs_subpage_clear_uptodate(fs_info, folio, i_size, round_up(i_size, PAGE_SIZE) - i_size); folio_unlock(folio); folio_put(folio); } } BUG_ON(cluster->start != cluster->boundary[0]); ret = btrfs_alloc_data_chunk_ondemand(inode, prealloc_end + 1 - prealloc_start); if (ret) return ret; btrfs_inode_lock(inode, 0); for (nr = 0; nr < cluster->nr; nr++) { struct extent_state *cached_state = NULL; start = cluster->boundary[nr] - offset; if (nr + 1 < cluster->nr) end = cluster->boundary[nr + 1] - 1 - offset; else end = cluster->end - offset; lock_extent(&inode->io_tree, start, end, &cached_state); num_bytes = end + 1 - start; ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start, num_bytes, num_bytes, end + 1, &alloc_hint); cur_offset = end + 1; unlock_extent(&inode->io_tree, start, end, &cached_state); if (ret) break; } btrfs_inode_unlock(inode, 0); if (cur_offset < prealloc_end) btrfs_free_reserved_data_space_noquota(inode->root->fs_info, prealloc_end + 1 - cur_offset); return ret; } static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inode, u64 start, u64 end, u64 block_start) { struct extent_map *em; struct extent_state *cached_state = NULL; int ret = 0; em = alloc_extent_map(); if (!em) return -ENOMEM; em->start = start; em->len = end + 1 - start; em->block_len = em->len; em->block_start = block_start; em->flags |= EXTENT_FLAG_PINNED; lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, false); unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); free_extent_map(em); return ret; } /* * Allow error injection to test balance/relocation cancellation */ noinline int btrfs_should_cancel_balance(const struct btrfs_fs_info *fs_info) { return atomic_read(&fs_info->balance_cancel_req) || atomic_read(&fs_info->reloc_cancel_req) || fatal_signal_pending(current); } ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE); static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster, int cluster_nr) { /* Last extent, use cluster end directly */ if (cluster_nr >= cluster->nr - 1) return cluster->end; /* Use next boundary start*/ return cluster->boundary[cluster_nr + 1] - 1; } static int relocate_one_folio(struct inode *inode, struct file_ra_state *ra, const struct file_extent_cluster *cluster, int *cluster_nr, unsigned long index) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); u64 offset = BTRFS_I(inode)->index_cnt; const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); struct folio *folio; u64 folio_start; u64 folio_end; u64 cur; int ret; ASSERT(index <= last_index); folio = filemap_lock_folio(inode->i_mapping, index); if (IS_ERR(folio)) { page_cache_sync_readahead(inode->i_mapping, ra, NULL, index, last_index + 1 - index); folio = __filemap_get_folio(inode->i_mapping, index, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); if (IS_ERR(folio)) return PTR_ERR(folio); } WARN_ON(folio_order(folio)); if (folio_test_readahead(folio)) page_cache_async_readahead(inode->i_mapping, ra, NULL, folio, index, last_index + 1 - index); if (!folio_test_uptodate(folio)) { btrfs_read_folio(NULL, folio); folio_lock(folio); if (!folio_test_uptodate(folio)) { ret = -EIO; goto release_folio; } } /* * We could have lost folio private when we dropped the lock to read the * folio above, make sure we set_page_extent_mapped here so we have any * of the subpage blocksize stuff we need in place. */ ret = set_folio_extent_mapped(folio); if (ret < 0) goto release_folio; folio_start = folio_pos(folio); folio_end = folio_start + PAGE_SIZE - 1; /* * Start from the cluster, as for subpage case, the cluster can start * inside the folio. */ cur = max(folio_start, cluster->boundary[*cluster_nr] - offset); while (cur <= folio_end) { struct extent_state *cached_state = NULL; u64 extent_start = cluster->boundary[*cluster_nr] - offset; u64 extent_end = get_cluster_boundary_end(cluster, *cluster_nr) - offset; u64 clamped_start = max(folio_start, extent_start); u64 clamped_end = min(folio_end, extent_end); u32 clamped_len = clamped_end + 1 - clamped_start; /* Reserve metadata for this range */ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), clamped_len, clamped_len, false); if (ret) goto release_folio; /* Mark the range delalloc and dirty for later writeback */ lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, &cached_state); ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start, clamped_end, 0, &cached_state); if (ret) { clear_extent_bit(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, EXTENT_LOCKED | EXTENT_BOUNDARY, &cached_state); btrfs_delalloc_release_metadata(BTRFS_I(inode), clamped_len, true); btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len); goto release_folio; } btrfs_folio_set_dirty(fs_info, folio, clamped_start, clamped_len); /* * Set the boundary if it's inside the folio. * Data relocation requires the destination extents to have the * same size as the source. * EXTENT_BOUNDARY bit prevents current extent from being merged * with previous extent. */ if (in_range(cluster->boundary[*cluster_nr] - offset, folio_start, PAGE_SIZE)) { u64 boundary_start = cluster->boundary[*cluster_nr] - offset; u64 boundary_end = boundary_start + fs_info->sectorsize - 1; set_extent_bit(&BTRFS_I(inode)->io_tree, boundary_start, boundary_end, EXTENT_BOUNDARY, NULL); } unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, &cached_state); btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len); cur += clamped_len; /* Crossed extent end, go to next extent */ if (cur >= extent_end) { (*cluster_nr)++; /* Just finished the last extent of the cluster, exit. */ if (*cluster_nr >= cluster->nr) break; } } folio_unlock(folio); folio_put(folio); balance_dirty_pages_ratelimited(inode->i_mapping); btrfs_throttle(fs_info); if (btrfs_should_cancel_balance(fs_info)) ret = -ECANCELED; return ret; release_folio: folio_unlock(folio); folio_put(folio); return ret; } static int relocate_file_extent_cluster(struct inode *inode, const struct file_extent_cluster *cluster) { u64 offset = BTRFS_I(inode)->index_cnt; unsigned long index; unsigned long last_index; struct file_ra_state *ra; int cluster_nr = 0; int ret = 0; if (!cluster->nr) return 0; ra = kzalloc(sizeof(*ra), GFP_NOFS); if (!ra) return -ENOMEM; ret = prealloc_file_extent_cluster(BTRFS_I(inode), cluster); if (ret) goto out; file_ra_state_init(ra, inode->i_mapping); ret = setup_relocation_extent_mapping(inode, cluster->start - offset, cluster->end - offset, cluster->start); if (ret) goto out; last_index = (cluster->end - offset) >> PAGE_SHIFT; for (index = (cluster->start - offset) >> PAGE_SHIFT; index <= last_index && !ret; index++) ret = relocate_one_folio(inode, ra, cluster, &cluster_nr, index); if (ret == 0) WARN_ON(cluster_nr != cluster->nr); out: kfree(ra); return ret; } static noinline_for_stack int relocate_data_extent(struct inode *inode, const struct btrfs_key *extent_key, struct file_extent_cluster *cluster) { int ret; struct btrfs_root *root = BTRFS_I(inode)->root; if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { ret = relocate_file_extent_cluster(inode, cluster); if (ret) return ret; cluster->nr = 0; } /* * Under simple quotas, we set root->relocation_src_root when we find * the extent. If adjacent extents have different owners, we can't merge * them while relocating. Handle this by storing the owning root that * started a cluster and if we see an extent from a different root break * cluster formation (just like the above case of non-adjacent extents). * * Without simple quotas, relocation_src_root is always 0, so we should * never see a mismatch, and it should have no effect on relocation * clusters. */ if (cluster->nr > 0 && cluster->owning_root != root->relocation_src_root) { u64 tmp = root->relocation_src_root; /* * root->relocation_src_root is the state that actually affects * the preallocation we do here, so set it to the root owning * the cluster we need to relocate. */ root->relocation_src_root = cluster->owning_root; ret = relocate_file_extent_cluster(inode, cluster); if (ret) return ret; cluster->nr = 0; /* And reset it back for the current extent's owning root. */ root->relocation_src_root = tmp; } if (!cluster->nr) { cluster->start = extent_key->objectid; cluster->owning_root = root->relocation_src_root; } else BUG_ON(cluster->nr >= MAX_EXTENTS); cluster->end = extent_key->objectid + extent_key->offset - 1; cluster->boundary[cluster->nr] = extent_key->objectid; cluster->nr++; if (cluster->nr >= MAX_EXTENTS) { ret = relocate_file_extent_cluster(inode, cluster); if (ret) return ret; cluster->nr = 0; } return 0; } /* * helper to add a tree block to the list. * the major work is getting the generation and level of the block */ static int add_tree_block(struct reloc_control *rc, const struct btrfs_key *extent_key, struct btrfs_path *path, struct rb_root *blocks) { struct extent_buffer *eb; struct btrfs_extent_item *ei; struct btrfs_tree_block_info *bi; struct tree_block *block; struct rb_node *rb_node; u32 item_size; int level = -1; u64 generation; u64 owner = 0; eb = path->nodes[0]; item_size = btrfs_item_size(eb, path->slots[0]); if (extent_key->type == BTRFS_METADATA_ITEM_KEY || item_size >= sizeof(*ei) + sizeof(*bi)) { unsigned long ptr = 0, end; ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); end = (unsigned long)ei + item_size; if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) { bi = (struct btrfs_tree_block_info *)(ei + 1); level = btrfs_tree_block_level(eb, bi); ptr = (unsigned long)(bi + 1); } else { level = (int)extent_key->offset; ptr = (unsigned long)(ei + 1); } generation = btrfs_extent_generation(eb, ei); /* * We're reading random blocks without knowing their owner ahead * of time. This is ok most of the time, as all reloc roots and * fs roots have the same lock type. However normal trees do * not, and the only way to know ahead of time is to read the * inline ref offset. We know it's an fs root if * * 1. There's more than one ref. * 2. There's a SHARED_DATA_REF_KEY set. * 3. FULL_BACKREF is set on the flags. * * Otherwise it's safe to assume that the ref offset == the * owner of this block, so we can use that when calling * read_tree_block. */ if (btrfs_extent_refs(eb, ei) == 1 && !(btrfs_extent_flags(eb, ei) & BTRFS_BLOCK_FLAG_FULL_BACKREF) && ptr < end) { struct btrfs_extent_inline_ref *iref; int type; iref = (struct btrfs_extent_inline_ref *)ptr; type = btrfs_get_extent_inline_ref_type(eb, iref, BTRFS_REF_TYPE_BLOCK); if (type == BTRFS_REF_TYPE_INVALID) return -EINVAL; if (type == BTRFS_TREE_BLOCK_REF_KEY) owner = btrfs_extent_inline_ref_offset(eb, iref); } } else { btrfs_print_leaf(eb); btrfs_err(rc->block_group->fs_info, "unrecognized tree backref at tree block %llu slot %u", eb->start, path->slots[0]); btrfs_release_path(path); return -EUCLEAN; } btrfs_release_path(path); BUG_ON(level == -1); block = kmalloc(sizeof(*block), GFP_NOFS); if (!block) return -ENOMEM; block->bytenr = extent_key->objectid; block->key.objectid = rc->extent_root->fs_info->nodesize; block->key.offset = generation; block->level = level; block->key_ready = false; block->owner = owner; rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node); if (rb_node) btrfs_backref_panic(rc->extent_root->fs_info, block->bytenr, -EEXIST); return 0; } /* * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY */ static int __add_tree_block(struct reloc_control *rc, u64 bytenr, u32 blocksize, struct rb_root *blocks) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; struct btrfs_path *path; struct btrfs_key key; int ret; bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA); if (tree_block_processed(bytenr, rc)) return 0; if (rb_simple_search(blocks, bytenr)) return 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; again: key.objectid = bytenr; if (skinny) { key.type = BTRFS_METADATA_ITEM_KEY; key.offset = (u64)-1; } else { key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = blocksize; } path->search_commit_root = 1; path->skip_locking = 1; ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); if (ret < 0) goto out; if (ret > 0 && skinny) { if (path->slots[0]) { path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.objectid == bytenr && (key.type == BTRFS_METADATA_ITEM_KEY || (key.type == BTRFS_EXTENT_ITEM_KEY && key.offset == blocksize))) ret = 0; } if (ret) { skinny = false; btrfs_release_path(path); goto again; } } if (ret) { ASSERT(ret == 1); btrfs_print_leaf(path->nodes[0]); btrfs_err(fs_info, "tree block extent item (%llu) is not found in extent tree", bytenr); WARN_ON(1); ret = -EINVAL; goto out; } ret = add_tree_block(rc, &key, path, blocks); out: btrfs_free_path(path); return ret; } static int delete_block_group_cache(struct btrfs_fs_info *fs_info, struct btrfs_block_group *block_group, struct inode *inode, u64 ino) { struct btrfs_root *root = fs_info->tree_root; struct btrfs_trans_handle *trans; int ret = 0; if (inode) goto truncate; inode = btrfs_iget(fs_info->sb, ino, root); if (IS_ERR(inode)) return -ENOENT; truncate: ret = btrfs_check_trunc_cache_free_space(fs_info, &fs_info->global_block_rsv); if (ret) goto out; trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out; } ret = btrfs_truncate_free_space_cache(trans, block_group, inode); btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); out: iput(inode); return ret; } /* * Locate the free space cache EXTENT_DATA in root tree leaf and delete the * cache inode, to avoid free space cache data extent blocking data relocation. */ static int delete_v1_space_cache(struct extent_buffer *leaf, struct btrfs_block_group *block_group, u64 data_bytenr) { u64 space_cache_ino; struct btrfs_file_extent_item *ei; struct btrfs_key key; bool found = false; int i; int ret; if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID) return 0; for (i = 0; i < btrfs_header_nritems(leaf); i++) { u8 type; btrfs_item_key_to_cpu(leaf, &key, i); if (key.type != BTRFS_EXTENT_DATA_KEY) continue; ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); type = btrfs_file_extent_type(leaf, ei); if ((type == BTRFS_FILE_EXTENT_REG || type == BTRFS_FILE_EXTENT_PREALLOC) && btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) { found = true; space_cache_ino = key.objectid; break; } } if (!found) return -ENOENT; ret = delete_block_group_cache(leaf->fs_info, block_group, NULL, space_cache_ino); return ret; } /* * helper to find all tree blocks that reference a given data extent */ static noinline_for_stack int add_data_references(struct reloc_control *rc, const struct btrfs_key *extent_key, struct btrfs_path *path, struct rb_root *blocks) { struct btrfs_backref_walk_ctx ctx = { 0 }; struct ulist_iterator leaf_uiter; struct ulist_node *ref_node = NULL; const u32 blocksize = rc->extent_root->fs_info->nodesize; int ret = 0; btrfs_release_path(path); ctx.bytenr = extent_key->objectid; ctx.skip_inode_ref_list = true; ctx.fs_info = rc->extent_root->fs_info; ret = btrfs_find_all_leafs(&ctx); if (ret < 0) return ret; ULIST_ITER_INIT(&leaf_uiter); while ((ref_node = ulist_next(ctx.refs, &leaf_uiter))) { struct btrfs_tree_parent_check check = { 0 }; struct extent_buffer *eb; eb = read_tree_block(ctx.fs_info, ref_node->val, &check); if (IS_ERR(eb)) { ret = PTR_ERR(eb); break; } ret = delete_v1_space_cache(eb, rc->block_group, extent_key->objectid); free_extent_buffer(eb); if (ret < 0) break; ret = __add_tree_block(rc, ref_node->val, blocksize, blocks); if (ret < 0) break; } if (ret < 0) free_block_list(blocks); ulist_free(ctx.refs); return ret; } /* * helper to find next unprocessed extent */ static noinline_for_stack int find_next_extent(struct reloc_control *rc, struct btrfs_path *path, struct btrfs_key *extent_key) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; struct btrfs_key key; struct extent_buffer *leaf; u64 start, end, last; int ret; last = rc->block_group->start + rc->block_group->length; while (1) { bool block_found; cond_resched(); if (rc->search_start >= last) { ret = 1; break; } key.objectid = rc->search_start; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = 0; path->search_commit_root = 1; path->skip_locking = 1; ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); if (ret < 0) break; next: leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(rc->extent_root, path); if (ret != 0) break; leaf = path->nodes[0]; } btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid >= last) { ret = 1; break; } if (key.type != BTRFS_EXTENT_ITEM_KEY && key.type != BTRFS_METADATA_ITEM_KEY) { path->slots[0]++; goto next; } if (key.type == BTRFS_EXTENT_ITEM_KEY && key.objectid + key.offset <= rc->search_start) { path->slots[0]++; goto next; } if (key.type == BTRFS_METADATA_ITEM_KEY && key.objectid + fs_info->nodesize <= rc->search_start) { path->slots[0]++; goto next; } block_found = find_first_extent_bit(&rc->processed_blocks, key.objectid, &start, &end, EXTENT_DIRTY, NULL); if (block_found && start <= key.objectid) { btrfs_release_path(path); rc->search_start = end + 1; } else { if (key.type == BTRFS_EXTENT_ITEM_KEY) rc->search_start = key.objectid + key.offset; else rc->search_start = key.objectid + fs_info->nodesize; memcpy(extent_key, &key, sizeof(key)); return 0; } } btrfs_release_path(path); return ret; } static void set_reloc_control(struct reloc_control *rc) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; mutex_lock(&fs_info->reloc_mutex); fs_info->reloc_ctl = rc; mutex_unlock(&fs_info->reloc_mutex); } static void unset_reloc_control(struct reloc_control *rc) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; mutex_lock(&fs_info->reloc_mutex); fs_info->reloc_ctl = NULL; mutex_unlock(&fs_info->reloc_mutex); } static noinline_for_stack int prepare_to_relocate(struct reloc_control *rc) { struct btrfs_trans_handle *trans; int ret; rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info, BTRFS_BLOCK_RSV_TEMP); if (!rc->block_rsv) return -ENOMEM; memset(&rc->cluster, 0, sizeof(rc->cluster)); rc->search_start = rc->block_group->start; rc->extents_found = 0; rc->nodes_relocated = 0; rc->merging_rsv_size = 0; rc->reserved_bytes = 0; rc->block_rsv->size = rc->extent_root->fs_info->nodesize * RELOCATION_RESERVED_NODES; ret = btrfs_block_rsv_refill(rc->extent_root->fs_info, rc->block_rsv, rc->block_rsv->size, BTRFS_RESERVE_FLUSH_ALL); if (ret) return ret; rc->create_reloc_tree = true; set_reloc_control(rc); trans = btrfs_join_transaction(rc->extent_root); if (IS_ERR(trans)) { unset_reloc_control(rc); /* * extent tree is not a ref_cow tree and has no reloc_root to * cleanup. And callers are responsible to free the above * block rsv. */ return PTR_ERR(trans); } ret = btrfs_commit_transaction(trans); if (ret) unset_reloc_control(rc); return ret; } static noinline_for_stack int relocate_block_group(struct reloc_control *rc) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; struct rb_root blocks = RB_ROOT; struct btrfs_key key; struct btrfs_trans_handle *trans = NULL; struct btrfs_path *path; struct btrfs_extent_item *ei; u64 flags; int ret; int err = 0; int progress = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; path->reada = READA_FORWARD; ret = prepare_to_relocate(rc); if (ret) { err = ret; goto out_free; } while (1) { rc->reserved_bytes = 0; ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, rc->block_rsv->size, BTRFS_RESERVE_FLUSH_ALL); if (ret) { err = ret; break; } progress++; trans = btrfs_start_transaction(rc->extent_root, 0); if (IS_ERR(trans)) { err = PTR_ERR(trans); trans = NULL; break; } restart: if (update_backref_cache(trans, &rc->backref_cache)) { btrfs_end_transaction(trans); trans = NULL; continue; } ret = find_next_extent(rc, path, &key); if (ret < 0) err = ret; if (ret != 0) break; rc->extents_found++; ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item); flags = btrfs_extent_flags(path->nodes[0], ei); /* * If we are relocating a simple quota owned extent item, we * need to note the owner on the reloc data root so that when * we allocate the replacement item, we can attribute it to the * correct eventual owner (rather than the reloc data root). */ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) { struct btrfs_root *root = BTRFS_I(rc->data_inode)->root; u64 owning_root_id = btrfs_get_extent_owner_root(fs_info, path->nodes[0], path->slots[0]); root->relocation_src_root = owning_root_id; } if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { ret = add_tree_block(rc, &key, path, &blocks); } else if (rc->stage == UPDATE_DATA_PTRS && (flags & BTRFS_EXTENT_FLAG_DATA)) { ret = add_data_references(rc, &key, path, &blocks); } else { btrfs_release_path(path); ret = 0; } if (ret < 0) { err = ret; break; } if (!RB_EMPTY_ROOT(&blocks)) { ret = relocate_tree_blocks(trans, rc, &blocks); if (ret < 0) { if (ret != -EAGAIN) { err = ret; break; } rc->extents_found--; rc->search_start = key.objectid; } } btrfs_end_transaction_throttle(trans); btrfs_btree_balance_dirty(fs_info); trans = NULL; if (rc->stage == MOVE_DATA_EXTENTS && (flags & BTRFS_EXTENT_FLAG_DATA)) { rc->found_file_extent = true; ret = relocate_data_extent(rc->data_inode, &key, &rc->cluster); if (ret < 0) { err = ret; break; } } if (btrfs_should_cancel_balance(fs_info)) { err = -ECANCELED; break; } } if (trans && progress && err == -ENOSPC) { ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags); if (ret == 1) { err = 0; progress = 0; goto restart; } } btrfs_release_path(path); clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY); if (trans) { btrfs_end_transaction_throttle(trans); btrfs_btree_balance_dirty(fs_info); } if (!err) { ret = relocate_file_extent_cluster(rc->data_inode, &rc->cluster); if (ret < 0) err = ret; } rc->create_reloc_tree = false; set_reloc_control(rc); btrfs_backref_release_cache(&rc->backref_cache); btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); /* * Even in the case when the relocation is cancelled, we should all go * through prepare_to_merge() and merge_reloc_roots(). * * For error (including cancelled balance), prepare_to_merge() will * mark all reloc trees orphan, then queue them for cleanup in * merge_reloc_roots() */ err = prepare_to_merge(rc, err); merge_reloc_roots(rc); rc->merge_reloc_tree = false; unset_reloc_control(rc); btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); /* get rid of pinned extents */ trans = btrfs_join_transaction(rc->extent_root); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out_free; } ret = btrfs_commit_transaction(trans); if (ret && !err) err = ret; out_free: ret = clean_dirty_subvols(rc); if (ret < 0 && !err) err = ret; btrfs_free_block_rsv(fs_info, rc->block_rsv); btrfs_free_path(path); return err; } static int __insert_orphan_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid) { struct btrfs_path *path; struct btrfs_inode_item *item; struct extent_buffer *leaf; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; ret = btrfs_insert_empty_inode(trans, root, path, objectid); if (ret) goto out; leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); btrfs_set_inode_generation(leaf, item, 1); btrfs_set_inode_size(leaf, item, 0); btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC); btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); return ret; } static void delete_orphan_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid) { struct btrfs_path *path; struct btrfs_key key; int ret = 0; path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } key.objectid = objectid; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret) { if (ret > 0) ret = -ENOENT; goto out; } ret = btrfs_del_item(trans, root, path); out: if (ret) btrfs_abort_transaction(trans, ret); btrfs_free_path(path); } /* * helper to create inode for data relocation. * the inode is in data relocation tree and its link count is 0 */ static noinline_for_stack struct inode *create_reloc_inode( struct btrfs_fs_info *fs_info, const struct btrfs_block_group *group) { struct inode *inode = NULL; struct btrfs_trans_handle *trans; struct btrfs_root *root; u64 objectid; int err = 0; root = btrfs_grab_root(fs_info->data_reloc_root); trans = btrfs_start_transaction(root, 6); if (IS_ERR(trans)) { btrfs_put_root(root); return ERR_CAST(trans); } err = btrfs_get_free_objectid(root, &objectid); if (err) goto out; err = __insert_orphan_inode(trans, root, objectid); if (err) goto out; inode = btrfs_iget(fs_info->sb, objectid, root); if (IS_ERR(inode)) { delete_orphan_inode(trans, root, objectid); err = PTR_ERR(inode); inode = NULL; goto out; } BTRFS_I(inode)->index_cnt = group->start; err = btrfs_orphan_add(trans, BTRFS_I(inode)); out: btrfs_put_root(root); btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); if (err) { iput(inode); inode = ERR_PTR(err); } return inode; } /* * Mark start of chunk relocation that is cancellable. Check if the cancellation * has been requested meanwhile and don't start in that case. * * Return: * 0 success * -EINPROGRESS operation is already in progress, that's probably a bug * -ECANCELED cancellation request was set before the operation started */ static int reloc_chunk_start(struct btrfs_fs_info *fs_info) { if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) { /* This should not happen */ btrfs_err(fs_info, "reloc already running, cannot start"); return -EINPROGRESS; } if (atomic_read(&fs_info->reloc_cancel_req) > 0) { btrfs_info(fs_info, "chunk relocation canceled on start"); /* * On cancel, clear all requests but let the caller mark * the end after cleanup operations. */ atomic_set(&fs_info->reloc_cancel_req, 0); return -ECANCELED; } return 0; } /* * Mark end of chunk relocation that is cancellable and wake any waiters. */ static void reloc_chunk_end(struct btrfs_fs_info *fs_info) { /* Requested after start, clear bit first so any waiters can continue */ if (atomic_read(&fs_info->reloc_cancel_req) > 0) btrfs_info(fs_info, "chunk relocation canceled during operation"); clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags); atomic_set(&fs_info->reloc_cancel_req, 0); } static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) { struct reloc_control *rc; rc = kzalloc(sizeof(*rc), GFP_NOFS); if (!rc) return NULL; INIT_LIST_HEAD(&rc->reloc_roots); INIT_LIST_HEAD(&rc->dirty_subvol_roots); btrfs_backref_init_cache(fs_info, &rc->backref_cache, true); rc->reloc_root_tree.rb_root = RB_ROOT; spin_lock_init(&rc->reloc_root_tree.lock); extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS); return rc; } static void free_reloc_control(struct reloc_control *rc) { struct mapping_node *node, *tmp; free_reloc_roots(&rc->reloc_roots); rbtree_postorder_for_each_entry_safe(node, tmp, &rc->reloc_root_tree.rb_root, rb_node) kfree(node); kfree(rc); } /* * Print the block group being relocated */ static void describe_relocation(struct btrfs_fs_info *fs_info, struct btrfs_block_group *block_group) { char buf[128] = {'\0'}; btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf)); btrfs_info(fs_info, "relocating block group %llu flags %s", block_group->start, buf); } static const char *stage_to_string(enum reloc_stage stage) { if (stage == MOVE_DATA_EXTENTS) return "move data extents"; if (stage == UPDATE_DATA_PTRS) return "update data pointers"; return "unknown"; } /* * function to relocate all extents in a block group. */ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) { struct btrfs_block_group *bg; struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start); struct reloc_control *rc; struct inode *inode; struct btrfs_path *path; int ret; int rw = 0; int err = 0; /* * This only gets set if we had a half-deleted snapshot on mount. We * cannot allow relocation to start while we're still trying to clean up * these pending deletions. */ ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE); if (ret) return ret; /* We may have been woken up by close_ctree, so bail if we're closing. */ if (btrfs_fs_closing(fs_info)) return -EINTR; bg = btrfs_lookup_block_group(fs_info, group_start); if (!bg) return -ENOENT; /* * Relocation of a data block group creates ordered extents. Without * sb_start_write(), we can freeze the filesystem while unfinished * ordered extents are left. Such ordered extents can cause a deadlock * e.g. when syncfs() is waiting for their completion but they can't * finish because they block when joining a transaction, due to the * fact that the freeze locks are being held in write mode. */ if (bg->flags & BTRFS_BLOCK_GROUP_DATA) ASSERT(sb_write_started(fs_info->sb)); if (btrfs_pinned_by_swapfile(fs_info, bg)) { btrfs_put_block_group(bg); return -ETXTBSY; } rc = alloc_reloc_control(fs_info); if (!rc) { btrfs_put_block_group(bg); return -ENOMEM; } ret = reloc_chunk_start(fs_info); if (ret < 0) { err = ret; goto out_put_bg; } rc->extent_root = extent_root; rc->block_group = bg; ret = btrfs_inc_block_group_ro(rc->block_group, true); if (ret) { err = ret; goto out; } rw = 1; path = btrfs_alloc_path(); if (!path) { err = -ENOMEM; goto out; } inode = lookup_free_space_inode(rc->block_group, path); btrfs_free_path(path); if (!IS_ERR(inode)) ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0); else ret = PTR_ERR(inode); if (ret && ret != -ENOENT) { err = ret; goto out; } rc->data_inode = create_reloc_inode(fs_info, rc->block_group); if (IS_ERR(rc->data_inode)) { err = PTR_ERR(rc->data_inode); rc->data_inode = NULL; goto out; } describe_relocation(fs_info, rc->block_group); btrfs_wait_block_group_reservations(rc->block_group); btrfs_wait_nocow_writers(rc->block_group); btrfs_wait_ordered_roots(fs_info, U64_MAX, rc->block_group->start, rc->block_group->length); ret = btrfs_zone_finish(rc->block_group); WARN_ON(ret && ret != -EAGAIN); while (1) { enum reloc_stage finishes_stage; mutex_lock(&fs_info->cleaner_mutex); ret = relocate_block_group(rc); mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) err = ret; finishes_stage = rc->stage; /* * We may have gotten ENOSPC after we already dirtied some * extents. If writeout happens while we're relocating a * different block group we could end up hitting the * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in * btrfs_reloc_cow_block. Make sure we write everything out * properly so we don't trip over this problem, and then break * out of the loop if we hit an error. */ if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { ret = btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1); if (ret) err = ret; invalidate_mapping_pages(rc->data_inode->i_mapping, 0, -1); rc->stage = UPDATE_DATA_PTRS; } if (err < 0) goto out; if (rc->extents_found == 0) break; btrfs_info(fs_info, "found %llu extents, stage: %s", rc->extents_found, stage_to_string(finishes_stage)); } WARN_ON(rc->block_group->pinned > 0); WARN_ON(rc->block_group->reserved > 0); WARN_ON(rc->block_group->used > 0); out: if (err && rw) btrfs_dec_block_group_ro(rc->block_group); iput(rc->data_inode); out_put_bg: btrfs_put_block_group(bg); reloc_chunk_end(fs_info); free_reloc_control(rc); return err; } static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; int ret, err; trans = btrfs_start_transaction(fs_info->tree_root, 0); if (IS_ERR(trans)) return PTR_ERR(trans); memset(&root->root_item.drop_progress, 0, sizeof(root->root_item.drop_progress)); btrfs_set_root_drop_level(&root->root_item, 0); btrfs_set_root_refs(&root->root_item, 0); ret = btrfs_update_root(trans, fs_info->tree_root, &root->root_key, &root->root_item); err = btrfs_end_transaction(trans); if (err) return err; return ret; } /* * recover relocation interrupted by system crash. * * this function resumes merging reloc trees with corresponding fs trees. * this is important for keeping the sharing of tree blocks */ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info) { LIST_HEAD(reloc_roots); struct btrfs_key key; struct btrfs_root *fs_root; struct btrfs_root *reloc_root; struct btrfs_path *path; struct extent_buffer *leaf; struct reloc_control *rc = NULL; struct btrfs_trans_handle *trans; int ret; int err = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; path->reada = READA_BACK; key.objectid = BTRFS_TREE_RELOC_OBJECTID; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; while (1) { ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret < 0) { err = ret; goto out; } if (ret > 0) { if (path->slots[0] == 0) break; path->slots[0]--; } leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); btrfs_release_path(path); if (key.objectid != BTRFS_TREE_RELOC_OBJECTID || key.type != BTRFS_ROOT_ITEM_KEY) break; reloc_root = btrfs_read_tree_root(fs_info->tree_root, &key); if (IS_ERR(reloc_root)) { err = PTR_ERR(reloc_root); goto out; } set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state); list_add(&reloc_root->root_list, &reloc_roots); if (btrfs_root_refs(&reloc_root->root_item) > 0) { fs_root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); if (IS_ERR(fs_root)) { ret = PTR_ERR(fs_root); if (ret != -ENOENT) { err = ret; goto out; } ret = mark_garbage_root(reloc_root); if (ret < 0) { err = ret; goto out; } } else { btrfs_put_root(fs_root); } } if (key.offset == 0) break; key.offset--; } btrfs_release_path(path); if (list_empty(&reloc_roots)) goto out; rc = alloc_reloc_control(fs_info); if (!rc) { err = -ENOMEM; goto out; } ret = reloc_chunk_start(fs_info); if (ret < 0) { err = ret; goto out_end; } rc->extent_root = btrfs_extent_root(fs_info, 0); set_reloc_control(rc); trans = btrfs_join_transaction(rc->extent_root); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out_unset; } rc->merge_reloc_tree = true; while (!list_empty(&reloc_roots)) { reloc_root = list_entry(reloc_roots.next, struct btrfs_root, root_list); list_del(&reloc_root->root_list); if (btrfs_root_refs(&reloc_root->root_item) == 0) { list_add_tail(&reloc_root->root_list, &rc->reloc_roots); continue; } fs_root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); if (IS_ERR(fs_root)) { err = PTR_ERR(fs_root); list_add_tail(&reloc_root->root_list, &reloc_roots); btrfs_end_transaction(trans); goto out_unset; } err = __add_reloc_root(reloc_root); ASSERT(err != -EEXIST); if (err) { list_add_tail(&reloc_root->root_list, &reloc_roots); btrfs_put_root(fs_root); btrfs_end_transaction(trans); goto out_unset; } fs_root->reloc_root = btrfs_grab_root(reloc_root); btrfs_put_root(fs_root); } err = btrfs_commit_transaction(trans); if (err) goto out_unset; merge_reloc_roots(rc); unset_reloc_control(rc); trans = btrfs_join_transaction(rc->extent_root); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out_clean; } err = btrfs_commit_transaction(trans); out_clean: ret = clean_dirty_subvols(rc); if (ret < 0 && !err) err = ret; out_unset: unset_reloc_control(rc); out_end: reloc_chunk_end(fs_info); free_reloc_control(rc); out: free_reloc_roots(&reloc_roots); btrfs_free_path(path); if (err == 0) { /* cleanup orphan inode in data relocation tree */ fs_root = btrfs_grab_root(fs_info->data_reloc_root); ASSERT(fs_root); err = btrfs_orphan_cleanup(fs_root); btrfs_put_root(fs_root); } return err; } /* * helper to add ordered checksum for data relocation. * * cloning checksum properly handles the nodatasum extents. * it also saves CPU time to re-calculate the checksum. */ int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered) { struct btrfs_inode *inode = BTRFS_I(ordered->inode); struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 disk_bytenr = ordered->file_offset + inode->index_cnt; struct btrfs_root *csum_root = btrfs_csum_root(fs_info, disk_bytenr); LIST_HEAD(list); int ret; ret = btrfs_lookup_csums_list(csum_root, disk_bytenr, disk_bytenr + ordered->num_bytes - 1, &list, 0, false); if (ret) return ret; while (!list_empty(&list)) { struct btrfs_ordered_sum *sums = list_entry(list.next, struct btrfs_ordered_sum, list); list_del_init(&sums->list); /* * We need to offset the new_bytenr based on where the csum is. * We need to do this because we will read in entire prealloc * extents but we may have written to say the middle of the * prealloc extent, so we need to make sure the csum goes with * the right disk offset. * * We can do this because the data reloc inode refers strictly * to the on disk bytes, so we don't have to worry about * disk_len vs real len like with real inodes since it's all * disk length. */ sums->logical = ordered->disk_bytenr + sums->logical - disk_bytenr; btrfs_add_ordered_sum(ordered, sums); } return 0; } int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct extent_buffer *buf, struct extent_buffer *cow) { struct btrfs_fs_info *fs_info = root->fs_info; struct reloc_control *rc; struct btrfs_backref_node *node; int first_cow = 0; int level; int ret = 0; rc = fs_info->reloc_ctl; if (!rc) return 0; BUG_ON(rc->stage == UPDATE_DATA_PTRS && btrfs_is_data_reloc_root(root)); level = btrfs_header_level(buf); if (btrfs_header_generation(buf) <= btrfs_root_last_snapshot(&root->root_item)) first_cow = 1; if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID && rc->create_reloc_tree) { WARN_ON(!first_cow && level == 0); node = rc->backref_cache.path[level]; BUG_ON(node->bytenr != buf->start && node->new_bytenr != buf->start); btrfs_backref_drop_node_buffer(node); atomic_inc(&cow->refs); node->eb = cow; node->new_bytenr = cow->start; if (!node->pending) { list_move_tail(&node->list, &rc->backref_cache.pending[level]); node->pending = 1; } if (first_cow) mark_block_processed(rc, node); if (first_cow && level > 0) rc->nodes_relocated += buf->len; } if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) ret = replace_file_extents(trans, rc, root, cow); return ret; } /* * called before creating snapshot. it calculates metadata reservation * required for relocating tree blocks in the snapshot */ void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, u64 *bytes_to_reserve) { struct btrfs_root *root = pending->root; struct reloc_control *rc = root->fs_info->reloc_ctl; if (!rc || !have_reloc_root(root)) return; if (!rc->merge_reloc_tree) return; root = root->reloc_root; BUG_ON(btrfs_root_refs(&root->root_item) == 0); /* * relocation is in the stage of merging trees. the space * used by merging a reloc tree is twice the size of * relocated tree nodes in the worst case. half for cowing * the reloc tree, half for cowing the fs tree. the space * used by cowing the reloc tree will be freed after the * tree is dropped. if we create snapshot, cowing the fs * tree may use more space than it frees. so we need * reserve extra space. */ *bytes_to_reserve += rc->nodes_relocated; } /* * called after snapshot is created. migrate block reservation * and create reloc root for the newly created snapshot * * This is similar to btrfs_init_reloc_root(), we come out of here with two * references held on the reloc_root, one for root->reloc_root and one for * rc->reloc_roots. */ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending) { struct btrfs_root *root = pending->root; struct btrfs_root *reloc_root; struct btrfs_root *new_root; struct reloc_control *rc = root->fs_info->reloc_ctl; int ret; if (!rc || !have_reloc_root(root)) return 0; rc = root->fs_info->reloc_ctl; rc->merging_rsv_size += rc->nodes_relocated; if (rc->merge_reloc_tree) { ret = btrfs_block_rsv_migrate(&pending->block_rsv, rc->block_rsv, rc->nodes_relocated, true); if (ret) return ret; } new_root = pending->snap; reloc_root = create_reloc_root(trans, root->reloc_root, new_root->root_key.objectid); if (IS_ERR(reloc_root)) return PTR_ERR(reloc_root); ret = __add_reloc_root(reloc_root); ASSERT(ret != -EEXIST); if (ret) { /* Pairs with create_reloc_root */ btrfs_put_root(reloc_root); return ret; } new_root->reloc_root = btrfs_grab_root(reloc_root); if (rc->create_reloc_tree) ret = clone_backref_node(trans, rc, root, reloc_root); return ret; } /* * Get the current bytenr for the block group which is being relocated. * * Return U64_MAX if no running relocation. */ u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info) { u64 logical = U64_MAX; lockdep_assert_held(&fs_info->reloc_mutex); if (fs_info->reloc_ctl && fs_info->reloc_ctl->block_group) logical = fs_info->reloc_ctl->block_group->start; return logical; }
1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 /* * linux/fs/nls/nls_cp852.c * * Charset cp852 translation tables. * Generated automatically from the Unicode and charset * tables from the Unicode Organization (www.unicode.org). * The Unicode to charset table has only exact mappings. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00*/ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10*/ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20*/ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30*/ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40*/ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50*/ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60*/ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70*/ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80*/ 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x016f, 0x0107, 0x00e7, 0x0142, 0x00eb, 0x0150, 0x0151, 0x00ee, 0x0179, 0x00c4, 0x0106, /* 0x90*/ 0x00c9, 0x0139, 0x013a, 0x00f4, 0x00f6, 0x013d, 0x013e, 0x015a, 0x015b, 0x00d6, 0x00dc, 0x0164, 0x0165, 0x0141, 0x00d7, 0x010d, /* 0xa0*/ 0x00e1, 0x00ed, 0x00f3, 0x00fa, 0x0104, 0x0105, 0x017d, 0x017e, 0x0118, 0x0119, 0x00ac, 0x017a, 0x010c, 0x015f, 0x00ab, 0x00bb, /* 0xb0*/ 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00c1, 0x00c2, 0x011a, 0x015e, 0x2563, 0x2551, 0x2557, 0x255d, 0x017b, 0x017c, 0x2510, /* 0xc0*/ 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x0102, 0x0103, 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4, /* 0xd0*/ 0x0111, 0x0110, 0x010e, 0x00cb, 0x010f, 0x0147, 0x00cd, 0x00ce, 0x011b, 0x2518, 0x250c, 0x2588, 0x2584, 0x0162, 0x016e, 0x2580, /* 0xe0*/ 0x00d3, 0x00df, 0x00d4, 0x0143, 0x0144, 0x0148, 0x0160, 0x0161, 0x0154, 0x00da, 0x0155, 0x0170, 0x00fd, 0x00dd, 0x0163, 0x00b4, /* 0xf0*/ 0x00ad, 0x02dd, 0x02db, 0x02c7, 0x02d8, 0x00a7, 0x00f7, 0x00b8, 0x00b0, 0x00a8, 0x02d9, 0x0171, 0x0158, 0x0159, 0x25a0, 0x00a0, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xff, 0x00, 0x00, 0x00, 0xcf, 0x00, 0x00, 0xf5, /* 0xa0-0xa7 */ 0xf9, 0x00, 0x00, 0xae, 0xaa, 0xf0, 0x00, 0x00, /* 0xa8-0xaf */ 0xf8, 0x00, 0x00, 0x00, 0xef, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0xf7, 0x00, 0x00, 0xaf, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0xb5, 0xb6, 0x00, 0x8e, 0x00, 0x00, 0x80, /* 0xc0-0xc7 */ 0x00, 0x90, 0x00, 0xd3, 0x00, 0xd6, 0xd7, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0xe0, 0xe2, 0x00, 0x99, 0x9e, /* 0xd0-0xd7 */ 0x00, 0x00, 0xe9, 0x00, 0x9a, 0xed, 0x00, 0xe1, /* 0xd8-0xdf */ 0x00, 0xa0, 0x83, 0x00, 0x84, 0x00, 0x00, 0x87, /* 0xe0-0xe7 */ 0x00, 0x82, 0x00, 0x89, 0x00, 0xa1, 0x8c, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0xa2, 0x93, 0x00, 0x94, 0xf6, /* 0xf0-0xf7 */ 0x00, 0x00, 0xa3, 0x00, 0x81, 0xec, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page01[256] = { 0x00, 0x00, 0xc6, 0xc7, 0xa4, 0xa5, 0x8f, 0x86, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0xac, 0x9f, 0xd2, 0xd4, /* 0x08-0x0f */ 0xd1, 0xd0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0xa8, 0xa9, 0xb7, 0xd8, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x91, 0x92, 0x00, 0x00, 0x95, 0x96, 0x00, /* 0x38-0x3f */ 0x00, 0x9d, 0x88, 0xe3, 0xe4, 0x00, 0x00, 0xd5, /* 0x40-0x47 */ 0xe5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x8a, 0x8b, 0x00, 0x00, 0xe8, 0xea, 0x00, 0x00, /* 0x50-0x57 */ 0xfc, 0xfd, 0x97, 0x98, 0x00, 0x00, 0xb8, 0xad, /* 0x58-0x5f */ 0xe6, 0xe7, 0xdd, 0xee, 0x9b, 0x9c, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xde, 0x85, /* 0x68-0x6f */ 0xeb, 0xfb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x8d, 0xab, 0xbd, 0xbe, 0xa6, 0xa7, 0x00, /* 0x78-0x7f */ }; static const unsigned char page02[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf3, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0xf4, 0xfa, 0x00, 0xf2, 0x00, 0xf1, 0x00, 0x00, /* 0xd8-0xdf */ }; static const unsigned char page25[256] = { 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0xcd, 0xba, 0x00, 0x00, 0xc9, 0x00, 0x00, 0xbb, /* 0x50-0x57 */ 0x00, 0x00, 0xc8, 0x00, 0x00, 0xbc, 0x00, 0x00, /* 0x58-0x5f */ 0xcc, 0x00, 0x00, 0xb9, 0x00, 0x00, 0xcb, 0x00, /* 0x60-0x67 */ 0x00, 0xca, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0xdb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ }; static const unsigned char *const page_uni2charset[256] = { page00, page01, page02, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, page25, NULL, NULL, }; static const unsigned char charset2lower[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8b, 0x8b, 0x8c, 0xab, 0x84, 0x86, /* 0x88-0x8f */ 0x82, 0x92, 0x92, 0x93, 0x94, 0x96, 0x96, 0x98, /* 0x90-0x97 */ 0x98, 0x94, 0x81, 0x9c, 0x9c, 0x88, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa5, 0xa5, 0xa7, 0xa7, /* 0xa0-0xa7 */ 0xa9, 0xa9, 0xaa, 0xab, 0x9f, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xa0, 0x83, 0xd8, /* 0xb0-0xb7 */ 0xad, 0xb9, 0xba, 0xbb, 0xbc, 0xbe, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc7, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd0, 0xd4, 0x89, 0xd4, 0xe5, 0xa1, 0x8c, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xee, 0x85, 0xdf, /* 0xd8-0xdf */ 0xa2, 0xe1, 0x93, 0xe4, 0xe4, 0xe5, 0xe7, 0xe7, /* 0xe0-0xe7 */ 0xea, 0xa3, 0xea, 0xfb, 0xec, 0xec, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfd, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x9a, 0x90, 0xb6, 0x8e, 0xde, 0x8f, 0x80, /* 0x80-0x87 */ 0x9d, 0xd3, 0x8a, 0x8a, 0xd7, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x91, 0xe2, 0x99, 0x95, 0x95, 0x97, /* 0x90-0x97 */ 0x97, 0x99, 0x9a, 0x9b, 0x9b, 0x9d, 0x9e, 0xac, /* 0x98-0x9f */ 0xb5, 0xd6, 0xe0, 0xe9, 0xa4, 0xa4, 0xa6, 0xa6, /* 0xa0-0xa7 */ 0xa8, 0xa8, 0xaa, 0x8d, 0xac, 0xb8, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbd, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc6, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd1, 0xd1, 0xd2, 0xd3, 0xd2, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xb7, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe3, 0xd5, 0xe6, 0xe6, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xe8, 0xeb, 0xed, 0xed, 0xdd, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xeb, 0xfc, 0xfc, 0xfe, 0xff, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "cp852", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_cp852(void) { return register_nls(&table); } static void __exit exit_nls_cp852(void) { unregister_nls(&table); } module_init(init_nls_cp852) module_exit(exit_nls_cp852) MODULE_LICENSE("Dual BSD/GPL");
95 5 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _KBD_KERN_H #define _KBD_KERN_H #include <linux/tty.h> #include <linux/interrupt.h> #include <linux/keyboard.h> extern char *func_table[MAX_NR_FUNC]; /* * kbd->xxx contains the VC-local things (flag settings etc..) * * Note: externally visible are LED_SCR, LED_NUM, LED_CAP defined in kd.h * The code in KDGETLED / KDSETLED depends on the internal and * external order being the same. * * Note: lockstate is used as index in the array key_map. */ struct kbd_struct { unsigned char lockstate; /* 8 modifiers - the names do not have any meaning at all; they can be associated to arbitrarily chosen keys */ #define VC_SHIFTLOCK KG_SHIFT /* shift lock mode */ #define VC_ALTGRLOCK KG_ALTGR /* altgr lock mode */ #define VC_CTRLLOCK KG_CTRL /* control lock mode */ #define VC_ALTLOCK KG_ALT /* alt lock mode */ #define VC_SHIFTLLOCK KG_SHIFTL /* shiftl lock mode */ #define VC_SHIFTRLOCK KG_SHIFTR /* shiftr lock mode */ #define VC_CTRLLLOCK KG_CTRLL /* ctrll lock mode */ #define VC_CTRLRLOCK KG_CTRLR /* ctrlr lock mode */ unsigned char slockstate; /* for `sticky' Shift, Ctrl, etc. */ unsigned char ledmode:1; #define LED_SHOW_FLAGS 0 /* traditional state */ #define LED_SHOW_IOCTL 1 /* only change leds upon ioctl */ unsigned char ledflagstate:4; /* flags, not lights */ unsigned char default_ledflagstate:4; #define VC_SCROLLOCK 0 /* scroll-lock mode */ #define VC_NUMLOCK 1 /* numeric lock mode */ #define VC_CAPSLOCK 2 /* capslock mode */ #define VC_KANALOCK 3 /* kanalock mode */ unsigned char kbdmode:3; /* one 3-bit value */ #define VC_XLATE 0 /* translate keycodes using keymap */ #define VC_MEDIUMRAW 1 /* medium raw (keycode) mode */ #define VC_RAW 2 /* raw (scancode) mode */ #define VC_UNICODE 3 /* Unicode mode */ #define VC_OFF 4 /* disabled mode */ unsigned char modeflags:5; #define VC_APPLIC 0 /* application key mode */ #define VC_CKMODE 1 /* cursor key mode */ #define VC_REPEAT 2 /* keyboard repeat */ #define VC_CRLF 3 /* 0 - enter sends CR, 1 - enter sends CRLF */ #define VC_META 4 /* 0 - meta, 1 - meta=prefix with ESC */ }; extern int kbd_init(void); extern void setledstate(struct kbd_struct *kbd, unsigned int led); extern int do_poke_blanked_console; extern void (*kbd_ledfunc)(unsigned int led); extern int set_console(int nr); extern void schedule_console_callback(void); static inline int vc_kbd_mode(struct kbd_struct * kbd, int flag) { return ((kbd->modeflags >> flag) & 1); } static inline int vc_kbd_led(struct kbd_struct * kbd, int flag) { return ((kbd->ledflagstate >> flag) & 1); } static inline void set_vc_kbd_mode(struct kbd_struct * kbd, int flag) { kbd->modeflags |= 1 << flag; } static inline void set_vc_kbd_led(struct kbd_struct * kbd, int flag) { kbd->ledflagstate |= 1 << flag; } static inline void clr_vc_kbd_mode(struct kbd_struct * kbd, int flag) { kbd->modeflags &= ~(1 << flag); } static inline void clr_vc_kbd_led(struct kbd_struct * kbd, int flag) { kbd->ledflagstate &= ~(1 << flag); } static inline void chg_vc_kbd_lock(struct kbd_struct * kbd, int flag) { kbd->lockstate ^= 1 << flag; } static inline void chg_vc_kbd_slock(struct kbd_struct * kbd, int flag) { kbd->slockstate ^= 1 << flag; } static inline void chg_vc_kbd_mode(struct kbd_struct * kbd, int flag) { kbd->modeflags ^= 1 << flag; } static inline void chg_vc_kbd_led(struct kbd_struct * kbd, int flag) { kbd->ledflagstate ^= 1 << flag; } #define U(x) ((x) ^ 0xf000) #define BRL_UC_ROW 0x2800 /* keyboard.c */ struct console; void vt_set_leds_compute_shiftstate(void); /* defkeymap.c */ extern unsigned int keymap_count; #endif
51 51 51 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 // SPDX-License-Identifier: GPL-2.0-or-later /* * Intel SMP support routines. * * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk> * (c) 1998-99, 2000, 2009 Ingo Molnar <mingo@redhat.com> * (c) 2002,2003 Andi Kleen, SuSE Labs. * * i386 and x86_64 integration by Glauber Costa <gcosta@redhat.com> */ #include <linux/init.h> #include <linux/mm.h> #include <linux/delay.h> #include <linux/spinlock.h> #include <linux/export.h> #include <linux/kernel_stat.h> #include <linux/mc146818rtc.h> #include <linux/cache.h> #include <linux/interrupt.h> #include <linux/cpu.h> #include <linux/gfp.h> #include <linux/kexec.h> #include <asm/mtrr.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> #include <asm/proto.h> #include <asm/apic.h> #include <asm/cpu.h> #include <asm/idtentry.h> #include <asm/nmi.h> #include <asm/mce.h> #include <asm/trace/irq_vectors.h> #include <asm/kexec.h> #include <asm/reboot.h> /* * Some notes on x86 processor bugs affecting SMP operation: * * Pentium, Pentium Pro, II, III (and all CPUs) have bugs. * The Linux implications for SMP are handled as follows: * * Pentium III / [Xeon] * None of the E1AP-E3AP errata are visible to the user. * * E1AP. see PII A1AP * E2AP. see PII A2AP * E3AP. see PII A3AP * * Pentium II / [Xeon] * None of the A1AP-A3AP errata are visible to the user. * * A1AP. see PPro 1AP * A2AP. see PPro 2AP * A3AP. see PPro 7AP * * Pentium Pro * None of 1AP-9AP errata are visible to the normal user, * except occasional delivery of 'spurious interrupt' as trap #15. * This is very rare and a non-problem. * * 1AP. Linux maps APIC as non-cacheable * 2AP. worked around in hardware * 3AP. fixed in C0 and above steppings microcode update. * Linux does not use excessive STARTUP_IPIs. * 4AP. worked around in hardware * 5AP. symmetric IO mode (normal Linux operation) not affected. * 'noapic' mode has vector 0xf filled out properly. * 6AP. 'noapic' mode might be affected - fixed in later steppings * 7AP. We do not assume writes to the LVT deasserting IRQs * 8AP. We do not enable low power mode (deep sleep) during MP bootup * 9AP. We do not use mixed mode * * Pentium * There is a marginal case where REP MOVS on 100MHz SMP * machines with B stepping processors can fail. XXX should provide * an L1cache=Writethrough or L1cache=off option. * * B stepping CPUs may hang. There are hardware work arounds * for this. We warn about it in case your board doesn't have the work * arounds. Basically that's so I can tell anyone with a B stepping * CPU and SMP problems "tough". * * Specific items [From Pentium Processor Specification Update] * * 1AP. Linux doesn't use remote read * 2AP. Linux doesn't trust APIC errors * 3AP. We work around this * 4AP. Linux never generated 3 interrupts of the same priority * to cause a lost local interrupt. * 5AP. Remote read is never used * 6AP. not affected - worked around in hardware * 7AP. not affected - worked around in hardware * 8AP. worked around in hardware - we get explicit CS errors if not * 9AP. only 'noapic' mode affected. Might generate spurious * interrupts, we log only the first one and count the * rest silently. * 10AP. not affected - worked around in hardware * 11AP. Linux reads the APIC between writes to avoid this, as per * the documentation. Make sure you preserve this as it affects * the C stepping chips too. * 12AP. not affected - worked around in hardware * 13AP. not affected - worked around in hardware * 14AP. we always deassert INIT during bootup * 15AP. not affected - worked around in hardware * 16AP. not affected - worked around in hardware * 17AP. not affected - worked around in hardware * 18AP. not affected - worked around in hardware * 19AP. not affected - worked around in BIOS * * If this sounds worrying believe me these bugs are either ___RARE___, * or are signal timing bugs worked around in hardware and there's * about nothing of note with C stepping upwards. */ static atomic_t stopping_cpu = ATOMIC_INIT(-1); static bool smp_no_nmi_ipi = false; static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) { /* We are registered on stopping cpu too, avoid spurious NMI */ if (raw_smp_processor_id() == atomic_read(&stopping_cpu)) return NMI_HANDLED; cpu_emergency_disable_virtualization(); stop_this_cpu(NULL); return NMI_HANDLED; } /* * this function calls the 'stop' function on all other CPUs in the system. */ DEFINE_IDTENTRY_SYSVEC(sysvec_reboot) { apic_eoi(); cpu_emergency_disable_virtualization(); stop_this_cpu(NULL); } static int register_stop_handler(void) { return register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback, NMI_FLAG_FIRST, "smp_stop"); } static void native_stop_other_cpus(int wait) { unsigned int old_cpu, this_cpu; unsigned long flags, timeout; if (reboot_force) return; /* Only proceed if this is the first CPU to reach this code */ old_cpu = -1; this_cpu = smp_processor_id(); if (!atomic_try_cmpxchg(&stopping_cpu, &old_cpu, this_cpu)) return; /* For kexec, ensure that offline CPUs are out of MWAIT and in HLT */ if (kexec_in_progress) smp_kick_mwait_play_dead(); /* * 1) Send an IPI on the reboot vector to all other CPUs. * * The other CPUs should react on it after leaving critical * sections and re-enabling interrupts. They might still hold * locks, but there is nothing which can be done about that. * * 2) Wait for all other CPUs to report that they reached the * HLT loop in stop_this_cpu() * * 3) If #2 timed out send an NMI to the CPUs which did not * yet report * * 4) Wait for all other CPUs to report that they reached the * HLT loop in stop_this_cpu() * * #3 can obviously race against a CPU reaching the HLT loop late. * That CPU will have reported already and the "have all CPUs * reached HLT" condition will be true despite the fact that the * other CPU is still handling the NMI. Again, there is no * protection against that as "disabled" APICs still respond to * NMIs. */ cpumask_copy(&cpus_stop_mask, cpu_online_mask); cpumask_clear_cpu(this_cpu, &cpus_stop_mask); if (!cpumask_empty(&cpus_stop_mask)) { apic_send_IPI_allbutself(REBOOT_VECTOR); /* * Don't wait longer than a second for IPI completion. The * wait request is not checked here because that would * prevent an NMI shutdown attempt in case that not all * CPUs reach shutdown state. */ timeout = USEC_PER_SEC; while (!cpumask_empty(&cpus_stop_mask) && timeout--) udelay(1); } /* if the REBOOT_VECTOR didn't work, try with the NMI */ if (!cpumask_empty(&cpus_stop_mask)) { /* * If NMI IPI is enabled, try to register the stop handler * and send the IPI. In any case try to wait for the other * CPUs to stop. */ if (!smp_no_nmi_ipi && !register_stop_handler()) { unsigned int cpu; pr_emerg("Shutting down cpus with NMI\n"); for_each_cpu(cpu, &cpus_stop_mask) __apic_send_IPI(cpu, NMI_VECTOR); } /* * Don't wait longer than 10 ms if the caller didn't * request it. If wait is true, the machine hangs here if * one or more CPUs do not reach shutdown state. */ timeout = USEC_PER_MSEC * 10; while (!cpumask_empty(&cpus_stop_mask) && (wait || timeout--)) udelay(1); } local_irq_save(flags); disable_local_APIC(); mcheck_cpu_clear(this_cpu_ptr(&cpu_info)); local_irq_restore(flags); /* * Ensure that the cpus_stop_mask cache lines are invalidated on * the other CPUs. See comment vs. SME in stop_this_cpu(). */ cpumask_clear(&cpus_stop_mask); } /* * Reschedule call back. KVM uses this interrupt to force a cpu out of * guest mode. */ DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_reschedule_ipi) { apic_eoi(); trace_reschedule_entry(RESCHEDULE_VECTOR); inc_irq_stat(irq_resched_count); scheduler_ipi(); trace_reschedule_exit(RESCHEDULE_VECTOR); } DEFINE_IDTENTRY_SYSVEC(sysvec_call_function) { apic_eoi(); trace_call_function_entry(CALL_FUNCTION_VECTOR); inc_irq_stat(irq_call_count); generic_smp_call_function_interrupt(); trace_call_function_exit(CALL_FUNCTION_VECTOR); } DEFINE_IDTENTRY_SYSVEC(sysvec_call_function_single) { apic_eoi(); trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); inc_irq_stat(irq_call_count); generic_smp_call_function_single_interrupt(); trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR); } static int __init nonmi_ipi_setup(char *str) { smp_no_nmi_ipi = true; return 1; } __setup("nonmi_ipi", nonmi_ipi_setup); struct smp_ops smp_ops = { .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, .smp_prepare_cpus = native_smp_prepare_cpus, .smp_cpus_done = native_smp_cpus_done, .stop_other_cpus = native_stop_other_cpus, #if defined(CONFIG_CRASH_DUMP) .crash_stop_other_cpus = kdump_nmi_shootdown_cpus, #endif .smp_send_reschedule = native_smp_send_reschedule, .kick_ap_alive = native_kick_ap, .cpu_disable = native_cpu_disable, .play_dead = native_play_dead, .send_call_func_ipi = native_send_call_func_ipi, .send_call_func_single_ipi = native_send_call_func_single_ipi, }; EXPORT_SYMBOL_GPL(smp_ops);
5 5 1 2 2 2 2 3 1 3 1 4 4 12 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 // SPDX-License-Identifier: GPL-2.0-or-later /* * inode.c - basic inode and dentry operations. * * Based on sysfs: * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel * * configfs Copyright (C) 2005 Oracle. All rights reserved. * * Please see Documentation/filesystems/configfs.rst for more * information. */ #undef DEBUG #include <linux/pagemap.h> #include <linux/namei.h> #include <linux/backing-dev.h> #include <linux/capability.h> #include <linux/sched.h> #include <linux/lockdep.h> #include <linux/slab.h> #include <linux/configfs.h> #include "configfs_internal.h" #ifdef CONFIG_LOCKDEP static struct lock_class_key default_group_class[MAX_LOCK_DEPTH]; #endif static const struct inode_operations configfs_inode_operations ={ .setattr = configfs_setattr, }; int configfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode * inode = d_inode(dentry); struct configfs_dirent * sd = dentry->d_fsdata; struct iattr * sd_iattr; unsigned int ia_valid = iattr->ia_valid; int error; if (!sd) return -EINVAL; sd_iattr = sd->s_iattr; if (!sd_iattr) { /* setting attributes for the first time, allocate now */ sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL); if (!sd_iattr) return -ENOMEM; /* assign default attributes */ sd_iattr->ia_mode = sd->s_mode; sd_iattr->ia_uid = GLOBAL_ROOT_UID; sd_iattr->ia_gid = GLOBAL_ROOT_GID; sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = current_time(inode); sd->s_iattr = sd_iattr; } /* attributes were changed atleast once in past */ error = simple_setattr(idmap, dentry, iattr); if (error) return error; if (ia_valid & ATTR_UID) sd_iattr->ia_uid = iattr->ia_uid; if (ia_valid & ATTR_GID) sd_iattr->ia_gid = iattr->ia_gid; if (ia_valid & ATTR_ATIME) sd_iattr->ia_atime = iattr->ia_atime; if (ia_valid & ATTR_MTIME) sd_iattr->ia_mtime = iattr->ia_mtime; if (ia_valid & ATTR_CTIME) sd_iattr->ia_ctime = iattr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = iattr->ia_mode; if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) mode &= ~S_ISGID; sd_iattr->ia_mode = sd->s_mode = mode; } return error; } static inline void set_default_inode_attr(struct inode * inode, umode_t mode) { inode->i_mode = mode; simple_inode_init_ts(inode); } static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) { inode->i_mode = iattr->ia_mode; inode->i_uid = iattr->ia_uid; inode->i_gid = iattr->ia_gid; inode_set_atime_to_ts(inode, iattr->ia_atime); inode_set_mtime_to_ts(inode, iattr->ia_mtime); inode_set_ctime_to_ts(inode, iattr->ia_ctime); } struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent *sd, struct super_block *s) { struct inode * inode = new_inode(s); if (inode) { inode->i_ino = get_next_ino(); inode->i_mapping->a_ops = &ram_aops; inode->i_op = &configfs_inode_operations; if (sd->s_iattr) { /* sysfs_dirent has non-default attributes * get them for the new inode from persistent copy * in sysfs_dirent */ set_inode_attr(inode, sd->s_iattr); } else set_default_inode_attr(inode, mode); } return inode; } #ifdef CONFIG_LOCKDEP static void configfs_set_inode_lock_class(struct configfs_dirent *sd, struct inode *inode) { int depth = sd->s_depth; if (depth > 0) { if (depth <= ARRAY_SIZE(default_group_class)) { lockdep_set_class(&inode->i_rwsem, &default_group_class[depth - 1]); } else { /* * In practice the maximum level of locking depth is * already reached. Just inform about possible reasons. */ pr_info("Too many levels of inodes for the locking correctness validator.\n"); pr_info("Spurious warnings may appear.\n"); } } } #else /* CONFIG_LOCKDEP */ static void configfs_set_inode_lock_class(struct configfs_dirent *sd, struct inode *inode) { } #endif /* CONFIG_LOCKDEP */ struct inode *configfs_create(struct dentry *dentry, umode_t mode) { struct inode *inode = NULL; struct configfs_dirent *sd; struct inode *p_inode; if (!dentry) return ERR_PTR(-ENOENT); if (d_really_is_positive(dentry)) return ERR_PTR(-EEXIST); sd = dentry->d_fsdata; inode = configfs_new_inode(mode, sd, dentry->d_sb); if (!inode) return ERR_PTR(-ENOMEM); p_inode = d_inode(dentry->d_parent); inode_set_mtime_to_ts(p_inode, inode_set_ctime_current(p_inode)); configfs_set_inode_lock_class(sd, inode); return inode; } /* * Get the name for corresponding element represented by the given configfs_dirent */ const unsigned char * configfs_get_name(struct configfs_dirent *sd) { struct configfs_attribute *attr; BUG_ON(!sd || !sd->s_element); /* These always have a dentry, so use that */ if (sd->s_type & (CONFIGFS_DIR | CONFIGFS_ITEM_LINK)) return sd->s_dentry->d_name.name; if (sd->s_type & (CONFIGFS_ITEM_ATTR | CONFIGFS_ITEM_BIN_ATTR)) { attr = sd->s_element; return attr->ca_name; } return NULL; } /* * Unhashes the dentry corresponding to given configfs_dirent * Called with parent inode's i_mutex held. */ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent) { struct dentry * dentry = sd->s_dentry; if (dentry) { spin_lock(&dentry->d_lock); if (simple_positive(dentry)) { dget_dlock(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); simple_unlink(d_inode(parent), dentry); } else spin_unlock(&dentry->d_lock); } }
716 716 713 3 3 716 715 716 716 716 716 716 716 713 713 711 713 12 11 11 12 12 12 12 11 12 12 12 11 12 1 11 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 // SPDX-License-Identifier: GPL-2.0-only /* * AppArmor security module * * This file contains AppArmor label definitions * * Copyright 2017 Canonical Ltd. */ #include <linux/audit.h> #include <linux/seq_file.h> #include <linux/sort.h> #include "include/apparmor.h" #include "include/cred.h" #include "include/label.h" #include "include/policy.h" #include "include/secid.h" /* * the aa_label represents the set of profiles confining an object * * Labels maintain a reference count to the set of pointers they reference * Labels are ref counted by * tasks and object via the security field/security context off the field * code - will take a ref count on a label if it needs the label * beyond what is possible with an rcu_read_lock. * profiles - each profile is a label * secids - a pinned secid will keep a refcount of the label it is * referencing * objects - inode, files, sockets, ... * * Labels are not ref counted by the label set, so they maybe removed and * freed when no longer in use. * */ #define PROXY_POISON 97 #define LABEL_POISON 100 static void free_proxy(struct aa_proxy *proxy) { if (proxy) { /* p->label will not updated any more as p is dead */ aa_put_label(rcu_dereference_protected(proxy->label, true)); memset(proxy, 0, sizeof(*proxy)); RCU_INIT_POINTER(proxy->label, (struct aa_label *)PROXY_POISON); kfree(proxy); } } void aa_proxy_kref(struct kref *kref) { struct aa_proxy *proxy = container_of(kref, struct aa_proxy, count); free_proxy(proxy); } struct aa_proxy *aa_alloc_proxy(struct aa_label *label, gfp_t gfp) { struct aa_proxy *new; new = kzalloc(sizeof(struct aa_proxy), gfp); if (new) { kref_init(&new->count); rcu_assign_pointer(new->label, aa_get_label(label)); } return new; } /* requires profile list write lock held */ void __aa_proxy_redirect(struct aa_label *orig, struct aa_label *new) { struct aa_label *tmp; AA_BUG(!orig); AA_BUG(!new); lockdep_assert_held_write(&labels_set(orig)->lock); tmp = rcu_dereference_protected(orig->proxy->label, &labels_ns(orig)->lock); rcu_assign_pointer(orig->proxy->label, aa_get_label(new)); orig->flags |= FLAG_STALE; aa_put_label(tmp); } static void __proxy_share(struct aa_label *old, struct aa_label *new) { struct aa_proxy *proxy = new->proxy; new->proxy = aa_get_proxy(old->proxy); __aa_proxy_redirect(old, new); aa_put_proxy(proxy); } /** * ns_cmp - compare ns for label set ordering * @a: ns to compare (NOT NULL) * @b: ns to compare (NOT NULL) * * Returns: <0 if a < b * ==0 if a == b * >0 if a > b */ static int ns_cmp(struct aa_ns *a, struct aa_ns *b) { int res; AA_BUG(!a); AA_BUG(!b); AA_BUG(!a->base.hname); AA_BUG(!b->base.hname); if (a == b) return 0; res = a->level - b->level; if (res) return res; return strcmp(a->base.hname, b->base.hname); } /** * profile_cmp - profile comparison for set ordering * @a: profile to compare (NOT NULL) * @b: profile to compare (NOT NULL) * * Returns: <0 if a < b * ==0 if a == b * >0 if a > b */ static int profile_cmp(struct aa_profile *a, struct aa_profile *b) { int res; AA_BUG(!a); AA_BUG(!b); AA_BUG(!a->ns); AA_BUG(!b->ns); AA_BUG(!a->base.hname); AA_BUG(!b->base.hname); if (a == b || a->base.hname == b->base.hname) return 0; res = ns_cmp(a->ns, b->ns); if (res) return res; return strcmp(a->base.hname, b->base.hname); } /** * vec_cmp - label comparison for set ordering * @a: aa_profile to compare (NOT NULL) * @an: length of @a * @b: aa_profile to compare (NOT NULL) * @bn: length of @b * * Returns: <0 if @a < @b * ==0 if @a == @b * >0 if @a > @b */ static int vec_cmp(struct aa_profile **a, int an, struct aa_profile **b, int bn) { int i; AA_BUG(!a); AA_BUG(!*a); AA_BUG(!b); AA_BUG(!*b); AA_BUG(an <= 0); AA_BUG(bn <= 0); for (i = 0; i < an && i < bn; i++) { int res = profile_cmp(a[i], b[i]); if (res != 0) return res; } return an - bn; } static bool vec_is_stale(struct aa_profile **vec, int n) { int i; AA_BUG(!vec); for (i = 0; i < n; i++) { if (profile_is_stale(vec[i])) return true; } return false; } static long accum_vec_flags(struct aa_profile **vec, int n) { long u = FLAG_UNCONFINED; int i; AA_BUG(!vec); for (i = 0; i < n; i++) { u |= vec[i]->label.flags & (FLAG_DEBUG1 | FLAG_DEBUG2 | FLAG_STALE); if (!(u & vec[i]->label.flags & FLAG_UNCONFINED)) u &= ~FLAG_UNCONFINED; } return u; } static int sort_cmp(const void *a, const void *b) { return profile_cmp(*(struct aa_profile **)a, *(struct aa_profile **)b); } /* * assumes vec is sorted * Assumes @vec has null terminator at vec[n], and will null terminate * vec[n - dups] */ static inline int unique(struct aa_profile **vec, int n) { int i, pos, dups = 0; AA_BUG(n < 1); AA_BUG(!vec); pos = 0; for (i = 1; i < n; i++) { int res = profile_cmp(vec[pos], vec[i]); AA_BUG(res > 0, "vec not sorted"); if (res == 0) { /* drop duplicate */ aa_put_profile(vec[i]); dups++; continue; } pos++; if (dups) vec[pos] = vec[i]; } AA_BUG(dups < 0); return dups; } /** * aa_vec_unique - canonical sort and unique a list of profiles * @n: number of refcounted profiles in the list (@n > 0) * @vec: list of profiles to sort and merge * @flags: null terminator flags of @vec * * Returns: the number of duplicates eliminated == references put * * If @flags & VEC_FLAG_TERMINATE @vec has null terminator at vec[n], and will * null terminate vec[n - dups] */ int aa_vec_unique(struct aa_profile **vec, int n, int flags) { int i, dups = 0; AA_BUG(n < 1); AA_BUG(!vec); /* vecs are usually small and inorder, have a fallback for larger */ if (n > 8) { sort(vec, n, sizeof(struct aa_profile *), sort_cmp, NULL); dups = unique(vec, n); goto out; } /* insertion sort + unique in one */ for (i = 1; i < n; i++) { struct aa_profile *tmp = vec[i]; int pos, j; for (pos = i - 1 - dups; pos >= 0; pos--) { int res = profile_cmp(vec[pos], tmp); if (res == 0) { /* drop duplicate entry */ aa_put_profile(tmp); dups++; goto continue_outer; } else if (res < 0) break; } /* pos is at entry < tmp, or index -1. Set to insert pos */ pos++; for (j = i - dups; j > pos; j--) vec[j] = vec[j - 1]; vec[pos] = tmp; continue_outer: ; } AA_BUG(dups < 0); out: if (flags & VEC_FLAG_TERMINATE) vec[n - dups] = NULL; return dups; } void aa_label_destroy(struct aa_label *label) { AA_BUG(!label); if (!label_isprofile(label)) { struct aa_profile *profile; struct label_it i; aa_put_str(label->hname); label_for_each(i, label, profile) { aa_put_profile(profile); label->vec[i.i] = (struct aa_profile *) (LABEL_POISON + (long) i.i); } } if (label->proxy) { if (rcu_dereference_protected(label->proxy->label, true) == label) rcu_assign_pointer(label->proxy->label, NULL); aa_put_proxy(label->proxy); } aa_free_secid(label->secid); label->proxy = (struct aa_proxy *) PROXY_POISON + 1; } void aa_label_free(struct aa_label *label) { if (!label) return; aa_label_destroy(label); kfree(label); } static void label_free_switch(struct aa_label *label) { if (label->flags & FLAG_NS_COUNT) aa_free_ns(labels_ns(label)); else if (label_isprofile(label)) aa_free_profile(labels_profile(label)); else aa_label_free(label); } static void label_free_rcu(struct rcu_head *head) { struct aa_label *label = container_of(head, struct aa_label, rcu); if (label->flags & FLAG_IN_TREE) (void) aa_label_remove(label); label_free_switch(label); } void aa_label_kref(struct kref *kref) { struct aa_label *label = container_of(kref, struct aa_label, count); struct aa_ns *ns = labels_ns(label); if (!ns) { /* never live, no rcu callback needed, just using the fn */ label_free_switch(label); return; } /* TODO: update labels_profile macro so it works here */ AA_BUG(label_isprofile(label) && on_list_rcu(&label->vec[0]->base.profiles)); AA_BUG(label_isprofile(label) && on_list_rcu(&label->vec[0]->base.list)); /* TODO: if compound label and not stale add to reclaim cache */ call_rcu(&label->rcu, label_free_rcu); } static void label_free_or_put_new(struct aa_label *label, struct aa_label *new) { if (label != new) /* need to free directly to break circular ref with proxy */ aa_label_free(new); else aa_put_label(new); } bool aa_label_init(struct aa_label *label, int size, gfp_t gfp) { AA_BUG(!label); AA_BUG(size < 1); if (aa_alloc_secid(label, gfp) < 0) return false; label->size = size; /* doesn't include null */ label->vec[size] = NULL; /* null terminate */ kref_init(&label->count); RB_CLEAR_NODE(&label->node); return true; } /** * aa_label_alloc - allocate a label with a profile vector of @size length * @size: size of profile vector in the label * @proxy: proxy to use OR null if to allocate a new one * @gfp: memory allocation type * * Returns: new label * else NULL if failed */ struct aa_label *aa_label_alloc(int size, struct aa_proxy *proxy, gfp_t gfp) { struct aa_label *new; AA_BUG(size < 1); /* + 1 for null terminator entry on vec */ new = kzalloc(struct_size(new, vec, size + 1), gfp); AA_DEBUG("%s (%p)\n", __func__, new); if (!new) goto fail; if (!aa_label_init(new, size, gfp)) goto fail; if (!proxy) { proxy = aa_alloc_proxy(new, gfp); if (!proxy) goto fail; } else aa_get_proxy(proxy); /* just set new's proxy, don't redirect proxy here if it was passed in*/ new->proxy = proxy; return new; fail: kfree(new); return NULL; } /** * label_cmp - label comparison for set ordering * @a: label to compare (NOT NULL) * @b: label to compare (NOT NULL) * * Returns: <0 if a < b * ==0 if a == b * >0 if a > b */ static int label_cmp(struct aa_label *a, struct aa_label *b) { AA_BUG(!b); if (a == b) return 0; return vec_cmp(a->vec, a->size, b->vec, b->size); } /* helper fn for label_for_each_confined */ int aa_label_next_confined(struct aa_label *label, int i) { AA_BUG(!label); AA_BUG(i < 0); for (; i < label->size; i++) { if (!profile_unconfined(label->vec[i])) return i; } return i; } /** * __aa_label_next_not_in_set - return the next profile of @sub not in @set * @I: label iterator * @set: label to test against * @sub: label to if is subset of @set * * Returns: profile in @sub that is not in @set, with iterator set pos after * else NULL if @sub is a subset of @set */ struct aa_profile *__aa_label_next_not_in_set(struct label_it *I, struct aa_label *set, struct aa_label *sub) { AA_BUG(!set); AA_BUG(!I); AA_BUG(I->i < 0); AA_BUG(I->i > set->size); AA_BUG(!sub); AA_BUG(I->j < 0); AA_BUG(I->j > sub->size); while (I->j < sub->size && I->i < set->size) { int res = profile_cmp(sub->vec[I->j], set->vec[I->i]); if (res == 0) { (I->j)++; (I->i)++; } else if (res > 0) (I->i)++; else return sub->vec[(I->j)++]; } if (I->j < sub->size) return sub->vec[(I->j)++]; return NULL; } /** * aa_label_is_subset - test if @sub is a subset of @set * @set: label to test against * @sub: label to test if is subset of @set * * Returns: true if @sub is subset of @set * else false */ bool aa_label_is_subset(struct aa_label *set, struct aa_label *sub) { struct label_it i = { }; AA_BUG(!set); AA_BUG(!sub); if (sub == set) return true; return __aa_label_next_not_in_set(&i, set, sub) == NULL; } /** * aa_label_is_unconfined_subset - test if @sub is a subset of @set * @set: label to test against * @sub: label to test if is subset of @set * * This checks for subset but taking into account unconfined. IF * @sub contains an unconfined profile that does not have a matching * unconfined in @set then this will not cause the test to fail. * Conversely we don't care about an unconfined in @set that is not in * @sub * * Returns: true if @sub is special_subset of @set * else false */ bool aa_label_is_unconfined_subset(struct aa_label *set, struct aa_label *sub) { struct label_it i = { }; struct aa_profile *p; AA_BUG(!set); AA_BUG(!sub); if (sub == set) return true; do { p = __aa_label_next_not_in_set(&i, set, sub); if (p && !profile_unconfined(p)) break; } while (p); return p == NULL; } /** * __label_remove - remove @label from the label set * @label: label to remove * @new: label to redirect to * * Requires: labels_set(@label)->lock write_lock * Returns: true if the label was in the tree and removed */ static bool __label_remove(struct aa_label *label, struct aa_label *new) { struct aa_labelset *ls = labels_set(label); AA_BUG(!ls); AA_BUG(!label); lockdep_assert_held_write(&ls->lock); if (new) __aa_proxy_redirect(label, new); if (!label_is_stale(label)) __label_make_stale(label); if (label->flags & FLAG_IN_TREE) { rb_erase(&label->node, &ls->root); label->flags &= ~FLAG_IN_TREE; return true; } return false; } /** * __label_replace - replace @old with @new in label set * @old: label to remove from label set * @new: label to replace @old with * * Requires: labels_set(@old)->lock write_lock * valid ref count be held on @new * Returns: true if @old was in set and replaced by @new * * Note: current implementation requires label set be order in such a way * that @new directly replaces @old position in the set (ie. * using pointer comparison of the label address would not work) */ static bool __label_replace(struct aa_label *old, struct aa_label *new) { struct aa_labelset *ls = labels_set(old); AA_BUG(!ls); AA_BUG(!old); AA_BUG(!new); lockdep_assert_held_write(&ls->lock); AA_BUG(new->flags & FLAG_IN_TREE); if (!label_is_stale(old)) __label_make_stale(old); if (old->flags & FLAG_IN_TREE) { rb_replace_node(&old->node, &new->node, &ls->root); old->flags &= ~FLAG_IN_TREE; new->flags |= FLAG_IN_TREE; return true; } return false; } /** * __label_insert - attempt to insert @l into a label set * @ls: set of labels to insert @l into (NOT NULL) * @label: new label to insert (NOT NULL) * @replace: whether insertion should replace existing entry that is not stale * * Requires: @ls->lock * caller to hold a valid ref on l * if @replace is true l has a preallocated proxy associated * Returns: @l if successful in inserting @l - with additional refcount * else ref counted equivalent label that is already in the set, * the else condition only happens if @replace is false */ static struct aa_label *__label_insert(struct aa_labelset *ls, struct aa_label *label, bool replace) { struct rb_node **new, *parent = NULL; AA_BUG(!ls); AA_BUG(!label); AA_BUG(labels_set(label) != ls); lockdep_assert_held_write(&ls->lock); AA_BUG(label->flags & FLAG_IN_TREE); /* Figure out where to put new node */ new = &ls->root.rb_node; while (*new) { struct aa_label *this = rb_entry(*new, struct aa_label, node); int result = label_cmp(label, this); parent = *new; if (result == 0) { /* !__aa_get_label means queued for destruction, * so replace in place, however the label has * died before the replacement so do not share * the proxy */ if (!replace && !label_is_stale(this)) { if (__aa_get_label(this)) return this; } else __proxy_share(this, label); AA_BUG(!__label_replace(this, label)); return aa_get_label(label); } else if (result < 0) new = &((*new)->rb_left); else /* (result > 0) */ new = &((*new)->rb_right); } /* Add new node and rebalance tree. */ rb_link_node(&label->node, parent, new); rb_insert_color(&label->node, &ls->root); label->flags |= FLAG_IN_TREE; return aa_get_label(label); } /** * __vec_find - find label that matches @vec in label set * @vec: vec of profiles to find matching label for (NOT NULL) * @n: length of @vec * * Requires: @vec_labelset(vec) lock held * caller to hold a valid ref on l * * Returns: ref counted @label if matching label is in tree * ref counted label that is equiv to @l in tree * else NULL if @vec equiv is not in tree */ static struct aa_label *__vec_find(struct aa_profile **vec, int n) { struct rb_node *node; AA_BUG(!vec); AA_BUG(!*vec); AA_BUG(n <= 0); node = vec_labelset(vec, n)->root.rb_node; while (node) { struct aa_label *this = rb_entry(node, struct aa_label, node); int result = vec_cmp(this->vec, this->size, vec, n); if (result > 0) node = node->rb_left; else if (result < 0) node = node->rb_right; else return __aa_get_label(this); } return NULL; } /** * __label_find - find label @label in label set * @label: label to find (NOT NULL) * * Requires: labels_set(@label)->lock held * caller to hold a valid ref on l * * Returns: ref counted @label if @label is in tree OR * ref counted label that is equiv to @label in tree * else NULL if @label or equiv is not in tree */ static struct aa_label *__label_find(struct aa_label *label) { AA_BUG(!label); return __vec_find(label->vec, label->size); } /** * aa_label_remove - remove a label from the labelset * @label: label to remove * * Returns: true if @label was removed from the tree * else @label was not in tree so it could not be removed */ bool aa_label_remove(struct aa_label *label) { struct aa_labelset *ls = labels_set(label); unsigned long flags; bool res; AA_BUG(!ls); write_lock_irqsave(&ls->lock, flags); res = __label_remove(label, ns_unconfined(labels_ns(label))); write_unlock_irqrestore(&ls->lock, flags); return res; } /** * aa_label_replace - replace a label @old with a new version @new * @old: label to replace * @new: label replacing @old * * Returns: true if @old was in tree and replaced * else @old was not in tree, and @new was not inserted */ bool aa_label_replace(struct aa_label *old, struct aa_label *new) { unsigned long flags; bool res; if (name_is_shared(old, new) && labels_ns(old) == labels_ns(new)) { write_lock_irqsave(&labels_set(old)->lock, flags); if (old->proxy != new->proxy) __proxy_share(old, new); else __aa_proxy_redirect(old, new); res = __label_replace(old, new); write_unlock_irqrestore(&labels_set(old)->lock, flags); } else { struct aa_label *l; struct aa_labelset *ls = labels_set(old); write_lock_irqsave(&ls->lock, flags); res = __label_remove(old, new); if (labels_ns(old) != labels_ns(new)) { write_unlock_irqrestore(&ls->lock, flags); ls = labels_set(new); write_lock_irqsave(&ls->lock, flags); } l = __label_insert(ls, new, true); res = (l == new); write_unlock_irqrestore(&ls->lock, flags); aa_put_label(l); } return res; } /** * vec_find - find label @l in label set * @vec: array of profiles to find equiv label for (NOT NULL) * @n: length of @vec * * Returns: refcounted label if @vec equiv is in tree * else NULL if @vec equiv is not in tree */ static struct aa_label *vec_find(struct aa_profile **vec, int n) { struct aa_labelset *ls; struct aa_label *label; unsigned long flags; AA_BUG(!vec); AA_BUG(!*vec); AA_BUG(n <= 0); ls = vec_labelset(vec, n); read_lock_irqsave(&ls->lock, flags); label = __vec_find(vec, n); read_unlock_irqrestore(&ls->lock, flags); return label; } /* requires sort and merge done first */ static struct aa_label *vec_create_and_insert_label(struct aa_profile **vec, int len, gfp_t gfp) { struct aa_label *label = NULL; struct aa_labelset *ls; unsigned long flags; struct aa_label *new; int i; AA_BUG(!vec); if (len == 1) return aa_get_label(&vec[0]->label); ls = labels_set(&vec[len - 1]->label); /* TODO: enable when read side is lockless * check if label exists before taking locks */ new = aa_label_alloc(len, NULL, gfp); if (!new) return NULL; for (i = 0; i < len; i++) new->vec[i] = aa_get_profile(vec[i]); write_lock_irqsave(&ls->lock, flags); label = __label_insert(ls, new, false); write_unlock_irqrestore(&ls->lock, flags); label_free_or_put_new(label, new); return label; } struct aa_label *aa_vec_find_or_create_label(struct aa_profile **vec, int len, gfp_t gfp) { struct aa_label *label = vec_find(vec, len); if (label) return label; return vec_create_and_insert_label(vec, len, gfp); } /** * aa_label_find - find label @label in label set * @label: label to find (NOT NULL) * * Requires: caller to hold a valid ref on l * * Returns: refcounted @label if @label is in tree * refcounted label that is equiv to @label in tree * else NULL if @label or equiv is not in tree */ struct aa_label *aa_label_find(struct aa_label *label) { AA_BUG(!label); return vec_find(label->vec, label->size); } /** * aa_label_insert - insert label @label into @ls or return existing label * @ls: labelset to insert @label into * @label: label to insert * * Requires: caller to hold a valid ref on @label * * Returns: ref counted @label if successful in inserting @label * else ref counted equivalent label that is already in the set */ struct aa_label *aa_label_insert(struct aa_labelset *ls, struct aa_label *label) { struct aa_label *l; unsigned long flags; AA_BUG(!ls); AA_BUG(!label); /* check if label exists before taking lock */ if (!label_is_stale(label)) { read_lock_irqsave(&ls->lock, flags); l = __label_find(label); read_unlock_irqrestore(&ls->lock, flags); if (l) return l; } write_lock_irqsave(&ls->lock, flags); l = __label_insert(ls, label, false); write_unlock_irqrestore(&ls->lock, flags); return l; } /** * aa_label_next_in_merge - find the next profile when merging @a and @b * @I: label iterator * @a: label to merge * @b: label to merge * * Returns: next profile * else null if no more profiles */ struct aa_profile *aa_label_next_in_merge(struct label_it *I, struct aa_label *a, struct aa_label *b) { AA_BUG(!a); AA_BUG(!b); AA_BUG(!I); AA_BUG(I->i < 0); AA_BUG(I->i > a->size); AA_BUG(I->j < 0); AA_BUG(I->j > b->size); if (I->i < a->size) { if (I->j < b->size) { int res = profile_cmp(a->vec[I->i], b->vec[I->j]); if (res > 0) return b->vec[(I->j)++]; if (res == 0) (I->j)++; } return a->vec[(I->i)++]; } if (I->j < b->size) return b->vec[(I->j)++]; return NULL; } /** * label_merge_cmp - cmp of @a merging with @b against @z for set ordering * @a: label to merge then compare (NOT NULL) * @b: label to merge then compare (NOT NULL) * @z: label to compare merge against (NOT NULL) * * Assumes: using the most recent versions of @a, @b, and @z * * Returns: <0 if a < b * ==0 if a == b * >0 if a > b */ static int label_merge_cmp(struct aa_label *a, struct aa_label *b, struct aa_label *z) { struct aa_profile *p = NULL; struct label_it i = { }; int k; AA_BUG(!a); AA_BUG(!b); AA_BUG(!z); for (k = 0; k < z->size && (p = aa_label_next_in_merge(&i, a, b)); k++) { int res = profile_cmp(p, z->vec[k]); if (res != 0) return res; } if (p) return 1; else if (k < z->size) return -1; return 0; } /** * label_merge_insert - create a new label by merging @a and @b * @new: preallocated label to merge into (NOT NULL) * @a: label to merge with @b (NOT NULL) * @b: label to merge with @a (NOT NULL) * * Requires: preallocated proxy * * Returns: ref counted label either @new if merge is unique * @a if @b is a subset of @a * @b if @a is a subset of @b * * NOTE: will not use @new if the merge results in @new == @a or @b * * Must be used within labelset write lock to avoid racing with * setting labels stale. */ static struct aa_label *label_merge_insert(struct aa_label *new, struct aa_label *a, struct aa_label *b) { struct aa_label *label; struct aa_labelset *ls; struct aa_profile *next; struct label_it i; unsigned long flags; int k = 0, invcount = 0; bool stale = false; AA_BUG(!a); AA_BUG(a->size < 0); AA_BUG(!b); AA_BUG(b->size < 0); AA_BUG(!new); AA_BUG(new->size < a->size + b->size); label_for_each_in_merge(i, a, b, next) { AA_BUG(!next); if (profile_is_stale(next)) { new->vec[k] = aa_get_newest_profile(next); AA_BUG(!new->vec[k]->label.proxy); AA_BUG(!new->vec[k]->label.proxy->label); if (next->label.proxy != new->vec[k]->label.proxy) invcount++; k++; stale = true; } else new->vec[k++] = aa_get_profile(next); } /* set to actual size which is <= allocated len */ new->size = k; new->vec[k] = NULL; if (invcount) { new->size -= aa_vec_unique(&new->vec[0], new->size, VEC_FLAG_TERMINATE); /* TODO: deal with reference labels */ if (new->size == 1) { label = aa_get_label(&new->vec[0]->label); return label; } } else if (!stale) { /* * merge could be same as a || b, note: it is not possible * for new->size == a->size == b->size unless a == b */ if (k == a->size) return aa_get_label(a); else if (k == b->size) return aa_get_label(b); } new->flags |= accum_vec_flags(new->vec, new->size); ls = labels_set(new); write_lock_irqsave(&ls->lock, flags); label = __label_insert(labels_set(new), new, false); write_unlock_irqrestore(&ls->lock, flags); return label; } /** * labelset_of_merge - find which labelset a merged label should be inserted * @a: label to merge and insert * @b: label to merge and insert * * Returns: labelset that the merged label should be inserted into */ static struct aa_labelset *labelset_of_merge(struct aa_label *a, struct aa_label *b) { struct aa_ns *nsa = labels_ns(a); struct aa_ns *nsb = labels_ns(b); if (ns_cmp(nsa, nsb) <= 0) return &nsa->labels; return &nsb->labels; } /** * __label_find_merge - find label that is equiv to merge of @a and @b * @ls: set of labels to search (NOT NULL) * @a: label to merge with @b (NOT NULL) * @b: label to merge with @a (NOT NULL) * * Requires: ls->lock read_lock held * * Returns: ref counted label that is equiv to merge of @a and @b * else NULL if merge of @a and @b is not in set */ static struct aa_label *__label_find_merge(struct aa_labelset *ls, struct aa_label *a, struct aa_label *b) { struct rb_node *node; AA_BUG(!ls); AA_BUG(!a); AA_BUG(!b); if (a == b) return __label_find(a); node = ls->root.rb_node; while (node) { struct aa_label *this = container_of(node, struct aa_label, node); int result = label_merge_cmp(a, b, this); if (result < 0) node = node->rb_left; else if (result > 0) node = node->rb_right; else return __aa_get_label(this); } return NULL; } /** * aa_label_find_merge - find label that is equiv to merge of @a and @b * @a: label to merge with @b (NOT NULL) * @b: label to merge with @a (NOT NULL) * * Requires: labels be fully constructed with a valid ns * * Returns: ref counted label that is equiv to merge of @a and @b * else NULL if merge of @a and @b is not in set */ struct aa_label *aa_label_find_merge(struct aa_label *a, struct aa_label *b) { struct aa_labelset *ls; struct aa_label *label, *ar = NULL, *br = NULL; unsigned long flags; AA_BUG(!a); AA_BUG(!b); if (label_is_stale(a)) a = ar = aa_get_newest_label(a); if (label_is_stale(b)) b = br = aa_get_newest_label(b); ls = labelset_of_merge(a, b); read_lock_irqsave(&ls->lock, flags); label = __label_find_merge(ls, a, b); read_unlock_irqrestore(&ls->lock, flags); aa_put_label(ar); aa_put_label(br); return label; } /** * aa_label_merge - attempt to insert new merged label of @a and @b * @a: label to merge with @b (NOT NULL) * @b: label to merge with @a (NOT NULL) * @gfp: memory allocation type * * Requires: caller to hold valid refs on @a and @b * labels be fully constructed with a valid ns * * Returns: ref counted new label if successful in inserting merge of a & b * else ref counted equivalent label that is already in the set. * else NULL if could not create label (-ENOMEM) */ struct aa_label *aa_label_merge(struct aa_label *a, struct aa_label *b, gfp_t gfp) { struct aa_label *label = NULL; AA_BUG(!a); AA_BUG(!b); if (a == b) return aa_get_newest_label(a); /* TODO: enable when read side is lockless * check if label exists before taking locks if (!label_is_stale(a) && !label_is_stale(b)) label = aa_label_find_merge(a, b); */ if (!label) { struct aa_label *new; a = aa_get_newest_label(a); b = aa_get_newest_label(b); /* could use label_merge_len(a, b), but requires double * comparison for small savings */ new = aa_label_alloc(a->size + b->size, NULL, gfp); if (!new) goto out; label = label_merge_insert(new, a, b); label_free_or_put_new(label, new); out: aa_put_label(a); aa_put_label(b); } return label; } /* match a profile and its associated ns component if needed * Assumes visibility test has already been done. * If a subns profile is not to be matched should be prescreened with * visibility test. */ static inline aa_state_t match_component(struct aa_profile *profile, struct aa_ruleset *rules, struct aa_profile *tp, aa_state_t state) { const char *ns_name; if (profile->ns == tp->ns) return aa_dfa_match(rules->policy->dfa, state, tp->base.hname); /* try matching with namespace name and then profile */ ns_name = aa_ns_name(profile->ns, tp->ns, true); state = aa_dfa_match_len(rules->policy->dfa, state, ":", 1); state = aa_dfa_match(rules->policy->dfa, state, ns_name); state = aa_dfa_match_len(rules->policy->dfa, state, ":", 1); return aa_dfa_match(rules->policy->dfa, state, tp->base.hname); } /** * label_compound_match - find perms for full compound label * @profile: profile to find perms for * @rules: ruleset to search * @label: label to check access permissions for * @state: state to start match in * @subns: whether to do permission checks on components in a subns * @request: permissions to request * @perms: perms struct to set * * Returns: 0 on success else ERROR * * For the label A//&B//&C this does the perm match for A//&B//&C * @perms should be preinitialized with allperms OR a previous permission * check to be stacked. */ static int label_compound_match(struct aa_profile *profile, struct aa_ruleset *rules, struct aa_label *label, aa_state_t state, bool subns, u32 request, struct aa_perms *perms) { struct aa_profile *tp; struct label_it i; /* find first subcomponent that is visible */ label_for_each(i, label, tp) { if (!aa_ns_visible(profile->ns, tp->ns, subns)) continue; state = match_component(profile, rules, tp, state); if (!state) goto fail; goto next; } /* no component visible */ *perms = allperms; return 0; next: label_for_each_cont(i, label, tp) { if (!aa_ns_visible(profile->ns, tp->ns, subns)) continue; state = aa_dfa_match(rules->policy->dfa, state, "//&"); state = match_component(profile, rules, tp, state); if (!state) goto fail; } *perms = *aa_lookup_perms(rules->policy, state); aa_apply_modes_to_perms(profile, perms); if ((perms->allow & request) != request) return -EACCES; return 0; fail: *perms = nullperms; return state; } /** * label_components_match - find perms for all subcomponents of a label * @profile: profile to find perms for * @rules: ruleset to search * @label: label to check access permissions for * @start: state to start match in * @subns: whether to do permission checks on components in a subns * @request: permissions to request * @perms: an initialized perms struct to add accumulation to * * Returns: 0 on success else ERROR * * For the label A//&B//&C this does the perm match for each of A and B and C * @perms should be preinitialized with allperms OR a previous permission * check to be stacked. */ static int label_components_match(struct aa_profile *profile, struct aa_ruleset *rules, struct aa_label *label, aa_state_t start, bool subns, u32 request, struct aa_perms *perms) { struct aa_profile *tp; struct label_it i; struct aa_perms tmp; aa_state_t state = 0; /* find first subcomponent to test */ label_for_each(i, label, tp) { if (!aa_ns_visible(profile->ns, tp->ns, subns)) continue; state = match_component(profile, rules, tp, start); if (!state) goto fail; goto next; } /* no subcomponents visible - no change in perms */ return 0; next: tmp = *aa_lookup_perms(rules->policy, state); aa_apply_modes_to_perms(profile, &tmp); aa_perms_accum(perms, &tmp); label_for_each_cont(i, label, tp) { if (!aa_ns_visible(profile->ns, tp->ns, subns)) continue; state = match_component(profile, rules, tp, start); if (!state) goto fail; tmp = *aa_lookup_perms(rules->policy, state); aa_apply_modes_to_perms(profile, &tmp); aa_perms_accum(perms, &tmp); } if ((perms->allow & request) != request) return -EACCES; return 0; fail: *perms = nullperms; return -EACCES; } /** * aa_label_match - do a multi-component label match * @profile: profile to match against (NOT NULL) * @rules: ruleset to search * @label: label to match (NOT NULL) * @state: state to start in * @subns: whether to match subns components * @request: permission request * @perms: Returns computed perms (NOT NULL) * * Returns: the state the match finished in, may be the none matching state */ int aa_label_match(struct aa_profile *profile, struct aa_ruleset *rules, struct aa_label *label, aa_state_t state, bool subns, u32 request, struct aa_perms *perms) { int error = label_compound_match(profile, rules, label, state, subns, request, perms); if (!error) return error; *perms = allperms; return label_components_match(profile, rules, label, state, subns, request, perms); } /** * aa_update_label_name - update a label to have a stored name * @ns: ns being viewed from (NOT NULL) * @label: label to update (NOT NULL) * @gfp: type of memory allocation * * Requires: labels_set(label) not locked in caller * * note: only updates the label name if it does not have a name already * and if it is in the labelset */ bool aa_update_label_name(struct aa_ns *ns, struct aa_label *label, gfp_t gfp) { struct aa_labelset *ls; unsigned long flags; char __counted *name; bool res = false; AA_BUG(!ns); AA_BUG(!label); if (label->hname || labels_ns(label) != ns) return res; if (aa_label_acntsxprint(&name, ns, label, FLAGS_NONE, gfp) < 0) return res; ls = labels_set(label); write_lock_irqsave(&ls->lock, flags); if (!label->hname && label->flags & FLAG_IN_TREE) { label->hname = name; res = true; } else aa_put_str(name); write_unlock_irqrestore(&ls->lock, flags); return res; } /* * cached label name is present and visible * @label->hname only exists if label is namespace hierachical */ static inline bool use_label_hname(struct aa_ns *ns, struct aa_label *label, int flags) { if (label->hname && (!ns || labels_ns(label) == ns) && !(flags & ~FLAG_SHOW_MODE)) return true; return false; } /* helper macro for snprint routines */ #define update_for_len(total, len, size, str) \ do { \ size_t ulen = len; \ \ AA_BUG(len < 0); \ total += ulen; \ ulen = min(ulen, size); \ size -= ulen; \ str += ulen; \ } while (0) /** * aa_profile_snxprint - print a profile name to a buffer * @str: buffer to write to. (MAY BE NULL if @size == 0) * @size: size of buffer * @view: namespace profile is being viewed from * @profile: profile to view (NOT NULL) * @flags: whether to include the mode string * @prev_ns: last ns printed when used in compound print * * Returns: size of name written or would be written if larger than * available buffer * * Note: will not print anything if the profile is not visible */ static int aa_profile_snxprint(char *str, size_t size, struct aa_ns *view, struct aa_profile *profile, int flags, struct aa_ns **prev_ns) { const char *ns_name = NULL; AA_BUG(!str && size != 0); AA_BUG(!profile); if (!view) view = profiles_ns(profile); if (view != profile->ns && (!prev_ns || (*prev_ns != profile->ns))) { if (prev_ns) *prev_ns = profile->ns; ns_name = aa_ns_name(view, profile->ns, flags & FLAG_VIEW_SUBNS); if (ns_name == aa_hidden_ns_name) { if (flags & FLAG_HIDDEN_UNCONFINED) return snprintf(str, size, "%s", "unconfined"); return snprintf(str, size, "%s", ns_name); } } if ((flags & FLAG_SHOW_MODE) && profile != profile->ns->unconfined) { const char *modestr = aa_profile_mode_names[profile->mode]; if (ns_name) return snprintf(str, size, ":%s:%s (%s)", ns_name, profile->base.hname, modestr); return snprintf(str, size, "%s (%s)", profile->base.hname, modestr); } if (ns_name) return snprintf(str, size, ":%s:%s", ns_name, profile->base.hname); return snprintf(str, size, "%s", profile->base.hname); } static const char *label_modename(struct aa_ns *ns, struct aa_label *label, int flags) { struct aa_profile *profile; struct label_it i; int mode = -1, count = 0; label_for_each(i, label, profile) { if (aa_ns_visible(ns, profile->ns, flags & FLAG_VIEW_SUBNS)) { count++; if (profile == profile->ns->unconfined) /* special case unconfined so stacks with * unconfined don't report as mixed. ie. * profile_foo//&:ns1:unconfined (mixed) */ continue; if (mode == -1) mode = profile->mode; else if (mode != profile->mode) return "mixed"; } } if (count == 0) return "-"; if (mode == -1) /* everything was unconfined */ mode = APPARMOR_UNCONFINED; return aa_profile_mode_names[mode]; } /* if any visible label is not unconfined the display_mode returns true */ static inline bool display_mode(struct aa_ns *ns, struct aa_label *label, int flags) { if ((flags & FLAG_SHOW_MODE)) { struct aa_profile *profile; struct label_it i; label_for_each(i, label, profile) { if (aa_ns_visible(ns, profile->ns, flags & FLAG_VIEW_SUBNS) && profile != profile->ns->unconfined) return true; } /* only ns->unconfined in set of profiles in ns */ return false; } return false; } /** * aa_label_snxprint - print a label name to a string buffer * @str: buffer to write to. (MAY BE NULL if @size == 0) * @size: size of buffer * @ns: namespace profile is being viewed from * @label: label to view (NOT NULL) * @flags: whether to include the mode string * * Returns: size of name written or would be written if larger than * available buffer * * Note: labels do not have to be strictly hierarchical to the ns as * objects may be shared across different namespaces and thus * pickup labeling from each ns. If a particular part of the * label is not visible it will just be excluded. And if none * of the label is visible "---" will be used. */ int aa_label_snxprint(char *str, size_t size, struct aa_ns *ns, struct aa_label *label, int flags) { struct aa_profile *profile; struct aa_ns *prev_ns = NULL; struct label_it i; int count = 0, total = 0; ssize_t len; AA_BUG(!str && size != 0); AA_BUG(!label); if (AA_DEBUG_LABEL && (flags & FLAG_ABS_ROOT)) { ns = root_ns; len = snprintf(str, size, "_"); update_for_len(total, len, size, str); } else if (!ns) { ns = labels_ns(label); } label_for_each(i, label, profile) { if (aa_ns_visible(ns, profile->ns, flags & FLAG_VIEW_SUBNS)) { if (count > 0) { len = snprintf(str, size, "//&"); update_for_len(total, len, size, str); } len = aa_profile_snxprint(str, size, ns, profile, flags & FLAG_VIEW_SUBNS, &prev_ns); update_for_len(total, len, size, str); count++; } } if (count == 0) { if (flags & FLAG_HIDDEN_UNCONFINED) return snprintf(str, size, "%s", "unconfined"); return snprintf(str, size, "%s", aa_hidden_ns_name); } /* count == 1 && ... is for backwards compat where the mode * is not displayed for 'unconfined' in the current ns */ if (display_mode(ns, label, flags)) { len = snprintf(str, size, " (%s)", label_modename(ns, label, flags)); update_for_len(total, len, size, str); } return total; } #undef update_for_len /** * aa_label_asxprint - allocate a string buffer and print label into it * @strp: Returns - the allocated buffer with the label name. (NOT NULL) * @ns: namespace profile is being viewed from * @label: label to view (NOT NULL) * @flags: flags controlling what label info is printed * @gfp: kernel memory allocation type * * Returns: size of name written or would be written if larger than * available buffer */ int aa_label_asxprint(char **strp, struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp) { int size; AA_BUG(!strp); AA_BUG(!label); size = aa_label_snxprint(NULL, 0, ns, label, flags); if (size < 0) return size; *strp = kmalloc(size + 1, gfp); if (!*strp) return -ENOMEM; return aa_label_snxprint(*strp, size + 1, ns, label, flags); } /** * aa_label_acntsxprint - allocate a __counted string buffer and print label * @strp: buffer to write to. * @ns: namespace profile is being viewed from * @label: label to view (NOT NULL) * @flags: flags controlling what label info is printed * @gfp: kernel memory allocation type * * Returns: size of name written or would be written if larger than * available buffer */ int aa_label_acntsxprint(char __counted **strp, struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp) { int size; AA_BUG(!strp); AA_BUG(!label); size = aa_label_snxprint(NULL, 0, ns, label, flags); if (size < 0) return size; *strp = aa_str_alloc(size + 1, gfp); if (!*strp) return -ENOMEM; return aa_label_snxprint(*strp, size + 1, ns, label, flags); } void aa_label_xaudit(struct audit_buffer *ab, struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp) { const char *str; char *name = NULL; int len; AA_BUG(!ab); AA_BUG(!label); if (!use_label_hname(ns, label, flags) || display_mode(ns, label, flags)) { len = aa_label_asxprint(&name, ns, label, flags, gfp); if (len < 0) { AA_DEBUG("label print error"); return; } str = name; } else { str = (char *) label->hname; len = strlen(str); } if (audit_string_contains_control(str, len)) audit_log_n_hex(ab, str, len); else audit_log_n_string(ab, str, len); kfree(name); } void aa_label_seq_xprint(struct seq_file *f, struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp) { AA_BUG(!f); AA_BUG(!label); if (!use_label_hname(ns, label, flags)) { char *str; int len; len = aa_label_asxprint(&str, ns, label, flags, gfp); if (len < 0) { AA_DEBUG("label print error"); return; } seq_puts(f, str); kfree(str); } else if (display_mode(ns, label, flags)) seq_printf(f, "%s (%s)", label->hname, label_modename(ns, label, flags)); else seq_puts(f, label->hname); } void aa_label_xprintk(struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp) { AA_BUG(!label); if (!use_label_hname(ns, label, flags)) { char *str; int len; len = aa_label_asxprint(&str, ns, label, flags, gfp); if (len < 0) { AA_DEBUG("label print error"); return; } pr_info("%s", str); kfree(str); } else if (display_mode(ns, label, flags)) pr_info("%s (%s)", label->hname, label_modename(ns, label, flags)); else pr_info("%s", label->hname); } void aa_label_audit(struct audit_buffer *ab, struct aa_label *label, gfp_t gfp) { struct aa_ns *ns = aa_get_current_ns(); aa_label_xaudit(ab, ns, label, FLAG_VIEW_SUBNS, gfp); aa_put_ns(ns); } void aa_label_seq_print(struct seq_file *f, struct aa_label *label, gfp_t gfp) { struct aa_ns *ns = aa_get_current_ns(); aa_label_seq_xprint(f, ns, label, FLAG_VIEW_SUBNS, gfp); aa_put_ns(ns); } void aa_label_printk(struct aa_label *label, gfp_t gfp) { struct aa_ns *ns = aa_get_current_ns(); aa_label_xprintk(ns, label, FLAG_VIEW_SUBNS, gfp); aa_put_ns(ns); } static int label_count_strn_entries(const char *str, size_t n) { const char *end = str + n; const char *split; int count = 1; AA_BUG(!str); for (split = aa_label_strn_split(str, end - str); split; split = aa_label_strn_split(str, end - str)) { count++; str = split + 3; } return count; } /* * ensure stacks with components like * :ns:A//&B * have :ns: applied to both 'A' and 'B' by making the lookup relative * to the base if the lookup specifies an ns, else making the stacked lookup * relative to the last embedded ns in the string. */ static struct aa_profile *fqlookupn_profile(struct aa_label *base, struct aa_label *currentbase, const char *str, size_t n) { const char *first = skipn_spaces(str, n); if (first && *first == ':') return aa_fqlookupn_profile(base, str, n); return aa_fqlookupn_profile(currentbase, str, n); } /** * aa_label_strn_parse - parse, validate and convert a text string to a label * @base: base label to use for lookups (NOT NULL) * @str: null terminated text string (NOT NULL) * @n: length of str to parse, will stop at \0 if encountered before n * @gfp: allocation type * @create: true if should create compound labels if they don't exist * @force_stack: true if should stack even if no leading & * * Returns: the matching refcounted label if present * else ERRPTR */ struct aa_label *aa_label_strn_parse(struct aa_label *base, const char *str, size_t n, gfp_t gfp, bool create, bool force_stack) { DEFINE_VEC(profile, vec); struct aa_label *label, *currbase = base; int i, len, stack = 0, error; const char *end = str + n; const char *split; AA_BUG(!base); AA_BUG(!str); str = skipn_spaces(str, n); if (str == NULL || (AA_DEBUG_LABEL && *str == '_' && base != &root_ns->unconfined->label)) return ERR_PTR(-EINVAL); len = label_count_strn_entries(str, end - str); if (*str == '&' || force_stack) { /* stack on top of base */ stack = base->size; len += stack; if (*str == '&') str++; } error = vec_setup(profile, vec, len, gfp); if (error) return ERR_PTR(error); for (i = 0; i < stack; i++) vec[i] = aa_get_profile(base->vec[i]); for (split = aa_label_strn_split(str, end - str), i = stack; split && i < len; i++) { vec[i] = fqlookupn_profile(base, currbase, str, split - str); if (!vec[i]) goto fail; /* * if component specified a new ns it becomes the new base * so that subsequent lookups are relative to it */ if (vec[i]->ns != labels_ns(currbase)) currbase = &vec[i]->label; str = split + 3; split = aa_label_strn_split(str, end - str); } /* last element doesn't have a split */ if (i < len) { vec[i] = fqlookupn_profile(base, currbase, str, end - str); if (!vec[i]) goto fail; } if (len == 1) /* no need to free vec as len < LOCAL_VEC_ENTRIES */ return &vec[0]->label; len -= aa_vec_unique(vec, len, VEC_FLAG_TERMINATE); /* TODO: deal with reference labels */ if (len == 1) { label = aa_get_label(&vec[0]->label); goto out; } if (create) label = aa_vec_find_or_create_label(vec, len, gfp); else label = vec_find(vec, len); if (!label) goto fail; out: /* use adjusted len from after vec_unique, not original */ vec_cleanup(profile, vec, len); return label; fail: label = ERR_PTR(-ENOENT); goto out; } struct aa_label *aa_label_parse(struct aa_label *base, const char *str, gfp_t gfp, bool create, bool force_stack) { return aa_label_strn_parse(base, str, strlen(str), gfp, create, force_stack); } /** * aa_labelset_destroy - remove all labels from the label set * @ls: label set to cleanup (NOT NULL) * * Labels that are removed from the set may still exist beyond the set * being destroyed depending on their reference counting */ void aa_labelset_destroy(struct aa_labelset *ls) { struct rb_node *node; unsigned long flags; AA_BUG(!ls); write_lock_irqsave(&ls->lock, flags); for (node = rb_first(&ls->root); node; node = rb_first(&ls->root)) { struct aa_label *this = rb_entry(node, struct aa_label, node); if (labels_ns(this) != root_ns) __label_remove(this, ns_unconfined(labels_ns(this)->parent)); else __label_remove(this, NULL); } write_unlock_irqrestore(&ls->lock, flags); } /* * @ls: labelset to init (NOT NULL) */ void aa_labelset_init(struct aa_labelset *ls) { AA_BUG(!ls); rwlock_init(&ls->lock); ls->root = RB_ROOT; } static struct aa_label *labelset_next_stale(struct aa_labelset *ls) { struct aa_label *label; struct rb_node *node; unsigned long flags; AA_BUG(!ls); read_lock_irqsave(&ls->lock, flags); __labelset_for_each(ls, node) { label = rb_entry(node, struct aa_label, node); if ((label_is_stale(label) || vec_is_stale(label->vec, label->size)) && __aa_get_label(label)) goto out; } label = NULL; out: read_unlock_irqrestore(&ls->lock, flags); return label; } /** * __label_update - insert updated version of @label into labelset * @label: the label to update/replace * * Returns: new label that is up to date * else NULL on failure * * Requires: @ns lock be held * * Note: worst case is the stale @label does not get updated and has * to be updated at a later time. */ static struct aa_label *__label_update(struct aa_label *label) { struct aa_label *new, *tmp; struct aa_labelset *ls; unsigned long flags; int i, invcount = 0; AA_BUG(!label); AA_BUG(!mutex_is_locked(&labels_ns(label)->lock)); new = aa_label_alloc(label->size, label->proxy, GFP_KERNEL); if (!new) return NULL; /* * while holding the ns_lock will stop profile replacement, removal, * and label updates, label merging and removal can be occurring */ ls = labels_set(label); write_lock_irqsave(&ls->lock, flags); for (i = 0; i < label->size; i++) { AA_BUG(!label->vec[i]); new->vec[i] = aa_get_newest_profile(label->vec[i]); AA_BUG(!new->vec[i]); AA_BUG(!new->vec[i]->label.proxy); AA_BUG(!new->vec[i]->label.proxy->label); if (new->vec[i]->label.proxy != label->vec[i]->label.proxy) invcount++; } /* updated stale label by being removed/renamed from labelset */ if (invcount) { new->size -= aa_vec_unique(&new->vec[0], new->size, VEC_FLAG_TERMINATE); /* TODO: deal with reference labels */ if (new->size == 1) { tmp = aa_get_label(&new->vec[0]->label); AA_BUG(tmp == label); goto remove; } if (labels_set(label) != labels_set(new)) { write_unlock_irqrestore(&ls->lock, flags); tmp = aa_label_insert(labels_set(new), new); write_lock_irqsave(&ls->lock, flags); goto remove; } } else AA_BUG(labels_ns(label) != labels_ns(new)); tmp = __label_insert(labels_set(label), new, true); remove: /* ensure label is removed, and redirected correctly */ __label_remove(label, tmp); write_unlock_irqrestore(&ls->lock, flags); label_free_or_put_new(tmp, new); return tmp; } /** * __labelset_update - update labels in @ns * @ns: namespace to update labels in (NOT NULL) * * Requires: @ns lock be held * * Walk the labelset ensuring that all labels are up to date and valid * Any label that has a stale component is marked stale and replaced and * by an updated version. * * If failures happen due to memory pressures then stale labels will * be left in place until the next pass. */ static void __labelset_update(struct aa_ns *ns) { struct aa_label *label; AA_BUG(!ns); AA_BUG(!mutex_is_locked(&ns->lock)); do { label = labelset_next_stale(&ns->labels); if (label) { struct aa_label *l = __label_update(label); aa_put_label(l); aa_put_label(label); } } while (label); } /** * __aa_labelset_update_subtree - update all labels with a stale component * @ns: ns to start update at (NOT NULL) * * Requires: @ns lock be held * * Invalidates labels based on @p in @ns and any children namespaces. */ void __aa_labelset_update_subtree(struct aa_ns *ns) { struct aa_ns *child; AA_BUG(!ns); AA_BUG(!mutex_is_locked(&ns->lock)); __labelset_update(ns); list_for_each_entry(child, &ns->sub_ns, base.list) { mutex_lock_nested(&child->lock, child->level); __aa_labelset_update_subtree(child); mutex_unlock(&child->lock); } }
157 12 12 5 8 68 68 68 10 5 6 10 10 59 59 59 23 22 2 23 10 10 10 3 7 10 10 55 26 26 26 26 26 25 2 48 48 2 2 10 19 16 3 6 2 6 13 10 3 50 50 24 1 1 4 4 52 1007 36 1 1 10 22 3 5 30 8 43 11 4 6 3 6 25 25 9 12 2 3 14 6 6 6 6 7 7 5 2 2 2 1 4 1 3 7 14 14 1 12 6 7 3 6 4 2 5 3 1 13 1 32 6 6 3 4 11 52 52 59 2 58 58 3 1 45 12 50 3 51 4 52 4 59 12 47 16 16 8 8 10 4 4 3 16 16 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 // SPDX-License-Identifier: GPL-2.0 /* * linux/ipc/shm.c * Copyright (C) 1992, 1993 Krishna Balasubramanian * Many improvements/fixes by Bruno Haible. * Replaced `struct shm_desc' by `struct vm_area_struct', July 1994. * Fixed the shm swap deallocation (shm_unuse()), August 1998 Andrea Arcangeli. * * /proc/sysvipc/shm support (c) 1999 Dragos Acostachioaie <dragos@iname.com> * BIGMEM support, Andrea Arcangeli <andrea@suse.de> * SMP thread shm, Jean-Luc Boyard <jean-luc.boyard@siemens.fr> * HIGHMEM support, Ingo Molnar <mingo@redhat.com> * Make shmmax, shmall, shmmni sysctl'able, Christoph Rohland <cr@sap.com> * Shared /dev/zero support, Kanoj Sarcar <kanoj@sgi.com> * Move the mm functionality over to mm/shmem.c, Christoph Rohland <cr@sap.com> * * support for audit of ipc object properties and permission changes * Dustin Kirkland <dustin.kirkland@us.ibm.com> * * namespaces support * OpenVZ, SWsoft Inc. * Pavel Emelianov <xemul@openvz.org> * * Better ipc lock (kern_ipc_perm.lock) handling * Davidlohr Bueso <davidlohr.bueso@hp.com>, June 2013. */ #include <linux/slab.h> #include <linux/mm.h> #include <linux/hugetlb.h> #include <linux/shm.h> #include <uapi/linux/shm.h> #include <linux/init.h> #include <linux/file.h> #include <linux/mman.h> #include <linux/shmem_fs.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/audit.h> #include <linux/capability.h> #include <linux/ptrace.h> #include <linux/seq_file.h> #include <linux/rwsem.h> #include <linux/nsproxy.h> #include <linux/mount.h> #include <linux/ipc_namespace.h> #include <linux/rhashtable.h> #include <linux/uaccess.h> #include "util.h" struct shmid_kernel /* private to the kernel */ { struct kern_ipc_perm shm_perm; struct file *shm_file; unsigned long shm_nattch; unsigned long shm_segsz; time64_t shm_atim; time64_t shm_dtim; time64_t shm_ctim; struct pid *shm_cprid; struct pid *shm_lprid; struct ucounts *mlock_ucounts; /* * The task created the shm object, for * task_lock(shp->shm_creator) */ struct task_struct *shm_creator; /* * List by creator. task_lock(->shm_creator) required for read/write. * If list_empty(), then the creator is dead already. */ struct list_head shm_clist; struct ipc_namespace *ns; } __randomize_layout; /* shm_mode upper byte flags */ #define SHM_DEST 01000 /* segment will be destroyed on last detach */ #define SHM_LOCKED 02000 /* segment will not be swapped */ struct shm_file_data { int id; struct ipc_namespace *ns; struct file *file; const struct vm_operations_struct *vm_ops; }; #define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data)) static const struct file_operations shm_file_operations; static const struct vm_operations_struct shm_vm_ops; #define shm_ids(ns) ((ns)->ids[IPC_SHM_IDS]) #define shm_unlock(shp) \ ipc_unlock(&(shp)->shm_perm) static int newseg(struct ipc_namespace *, struct ipc_params *); static void shm_open(struct vm_area_struct *vma); static void shm_close(struct vm_area_struct *vma); static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp); #ifdef CONFIG_PROC_FS static int sysvipc_shm_proc_show(struct seq_file *s, void *it); #endif void shm_init_ns(struct ipc_namespace *ns) { ns->shm_ctlmax = SHMMAX; ns->shm_ctlall = SHMALL; ns->shm_ctlmni = SHMMNI; ns->shm_rmid_forced = 0; ns->shm_tot = 0; ipc_init_ids(&shm_ids(ns)); } /* * Called with shm_ids.rwsem (writer) and the shp structure locked. * Only shm_ids.rwsem remains locked on exit. */ static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) { struct shmid_kernel *shp; shp = container_of(ipcp, struct shmid_kernel, shm_perm); WARN_ON(ns != shp->ns); if (shp->shm_nattch) { shp->shm_perm.mode |= SHM_DEST; /* Do not find it any more */ ipc_set_key_private(&shm_ids(ns), &shp->shm_perm); shm_unlock(shp); } else shm_destroy(ns, shp); } #ifdef CONFIG_IPC_NS void shm_exit_ns(struct ipc_namespace *ns) { free_ipcs(ns, &shm_ids(ns), do_shm_rmid); idr_destroy(&ns->ids[IPC_SHM_IDS].ipcs_idr); rhashtable_destroy(&ns->ids[IPC_SHM_IDS].key_ht); } #endif static int __init ipc_ns_init(void) { shm_init_ns(&init_ipc_ns); return 0; } pure_initcall(ipc_ns_init); void __init shm_init(void) { ipc_init_proc_interface("sysvipc/shm", #if BITS_PER_LONG <= 32 " key shmid perms size cpid lpid nattch uid gid cuid cgid atime dtime ctime rss swap\n", #else " key shmid perms size cpid lpid nattch uid gid cuid cgid atime dtime ctime rss swap\n", #endif IPC_SHM_IDS, sysvipc_shm_proc_show); } static inline struct shmid_kernel *shm_obtain_object(struct ipc_namespace *ns, int id) { struct kern_ipc_perm *ipcp = ipc_obtain_object_idr(&shm_ids(ns), id); if (IS_ERR(ipcp)) return ERR_CAST(ipcp); return container_of(ipcp, struct shmid_kernel, shm_perm); } static inline struct shmid_kernel *shm_obtain_object_check(struct ipc_namespace *ns, int id) { struct kern_ipc_perm *ipcp = ipc_obtain_object_check(&shm_ids(ns), id); if (IS_ERR(ipcp)) return ERR_CAST(ipcp); return container_of(ipcp, struct shmid_kernel, shm_perm); } /* * shm_lock_(check_) routines are called in the paths where the rwsem * is not necessarily held. */ static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id) { struct kern_ipc_perm *ipcp; rcu_read_lock(); ipcp = ipc_obtain_object_idr(&shm_ids(ns), id); if (IS_ERR(ipcp)) goto err; ipc_lock_object(ipcp); /* * ipc_rmid() may have already freed the ID while ipc_lock_object() * was spinning: here verify that the structure is still valid. * Upon races with RMID, return -EIDRM, thus indicating that * the ID points to a removed identifier. */ if (ipc_valid_object(ipcp)) { /* return a locked ipc object upon success */ return container_of(ipcp, struct shmid_kernel, shm_perm); } ipc_unlock_object(ipcp); ipcp = ERR_PTR(-EIDRM); err: rcu_read_unlock(); /* * Callers of shm_lock() must validate the status of the returned ipc * object pointer and error out as appropriate. */ return ERR_CAST(ipcp); } static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp) { rcu_read_lock(); ipc_lock_object(&ipcp->shm_perm); } static void shm_rcu_free(struct rcu_head *head) { struct kern_ipc_perm *ptr = container_of(head, struct kern_ipc_perm, rcu); struct shmid_kernel *shp = container_of(ptr, struct shmid_kernel, shm_perm); security_shm_free(&shp->shm_perm); kfree(shp); } /* * It has to be called with shp locked. * It must be called before ipc_rmid() */ static inline void shm_clist_rm(struct shmid_kernel *shp) { struct task_struct *creator; /* ensure that shm_creator does not disappear */ rcu_read_lock(); /* * A concurrent exit_shm may do a list_del_init() as well. * Just do nothing if exit_shm already did the work */ if (!list_empty(&shp->shm_clist)) { /* * shp->shm_creator is guaranteed to be valid *only* * if shp->shm_clist is not empty. */ creator = shp->shm_creator; task_lock(creator); /* * list_del_init() is a nop if the entry was already removed * from the list. */ list_del_init(&shp->shm_clist); task_unlock(creator); } rcu_read_unlock(); } static inline void shm_rmid(struct shmid_kernel *s) { shm_clist_rm(s); ipc_rmid(&shm_ids(s->ns), &s->shm_perm); } static int __shm_open(struct shm_file_data *sfd) { struct shmid_kernel *shp; shp = shm_lock(sfd->ns, sfd->id); if (IS_ERR(shp)) return PTR_ERR(shp); if (shp->shm_file != sfd->file) { /* ID was reused */ shm_unlock(shp); return -EINVAL; } shp->shm_atim = ktime_get_real_seconds(); ipc_update_pid(&shp->shm_lprid, task_tgid(current)); shp->shm_nattch++; shm_unlock(shp); return 0; } /* This is called by fork, once for every shm attach. */ static void shm_open(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct shm_file_data *sfd = shm_file_data(file); int err; /* Always call underlying open if present */ if (sfd->vm_ops->open) sfd->vm_ops->open(vma); err = __shm_open(sfd); /* * We raced in the idr lookup or with shm_destroy(). * Either way, the ID is busted. */ WARN_ON_ONCE(err); } /* * shm_destroy - free the struct shmid_kernel * * @ns: namespace * @shp: struct to free * * It has to be called with shp and shm_ids.rwsem (writer) locked, * but returns with shp unlocked and freed. */ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) { struct file *shm_file; shm_file = shp->shm_file; shp->shm_file = NULL; ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; shm_rmid(shp); shm_unlock(shp); if (!is_file_hugepages(shm_file)) shmem_lock(shm_file, 0, shp->mlock_ucounts); fput(shm_file); ipc_update_pid(&shp->shm_cprid, NULL); ipc_update_pid(&shp->shm_lprid, NULL); ipc_rcu_putref(&shp->shm_perm, shm_rcu_free); } /* * shm_may_destroy - identifies whether shm segment should be destroyed now * * Returns true if and only if there are no active users of the segment and * one of the following is true: * * 1) shmctl(id, IPC_RMID, NULL) was called for this shp * * 2) sysctl kernel.shm_rmid_forced is set to 1. */ static bool shm_may_destroy(struct shmid_kernel *shp) { return (shp->shm_nattch == 0) && (shp->ns->shm_rmid_forced || (shp->shm_perm.mode & SHM_DEST)); } /* * remove the attach descriptor vma. * free memory for segment if it is marked destroyed. * The descriptor has already been removed from the current->mm->mmap list * and will later be kfree()d. */ static void __shm_close(struct shm_file_data *sfd) { struct shmid_kernel *shp; struct ipc_namespace *ns = sfd->ns; down_write(&shm_ids(ns).rwsem); /* remove from the list of attaches of the shm segment */ shp = shm_lock(ns, sfd->id); /* * We raced in the idr lookup or with shm_destroy(). * Either way, the ID is busted. */ if (WARN_ON_ONCE(IS_ERR(shp))) goto done; /* no-op */ ipc_update_pid(&shp->shm_lprid, task_tgid(current)); shp->shm_dtim = ktime_get_real_seconds(); shp->shm_nattch--; if (shm_may_destroy(shp)) shm_destroy(ns, shp); else shm_unlock(shp); done: up_write(&shm_ids(ns).rwsem); } static void shm_close(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct shm_file_data *sfd = shm_file_data(file); /* Always call underlying close if present */ if (sfd->vm_ops->close) sfd->vm_ops->close(vma); __shm_close(sfd); } /* Called with ns->shm_ids(ns).rwsem locked */ static int shm_try_destroy_orphaned(int id, void *p, void *data) { struct ipc_namespace *ns = data; struct kern_ipc_perm *ipcp = p; struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm); /* * We want to destroy segments without users and with already * exit'ed originating process. * * As shp->* are changed under rwsem, it's safe to skip shp locking. */ if (!list_empty(&shp->shm_clist)) return 0; if (shm_may_destroy(shp)) { shm_lock_by_ptr(shp); shm_destroy(ns, shp); } return 0; } void shm_destroy_orphaned(struct ipc_namespace *ns) { down_write(&shm_ids(ns).rwsem); if (shm_ids(ns).in_use) idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns); up_write(&shm_ids(ns).rwsem); } /* Locking assumes this will only be called with task == current */ void exit_shm(struct task_struct *task) { for (;;) { struct shmid_kernel *shp; struct ipc_namespace *ns; task_lock(task); if (list_empty(&task->sysvshm.shm_clist)) { task_unlock(task); break; } shp = list_first_entry(&task->sysvshm.shm_clist, struct shmid_kernel, shm_clist); /* * 1) Get pointer to the ipc namespace. It is worth to say * that this pointer is guaranteed to be valid because * shp lifetime is always shorter than namespace lifetime * in which shp lives. * We taken task_lock it means that shp won't be freed. */ ns = shp->ns; /* * 2) If kernel.shm_rmid_forced is not set then only keep track of * which shmids are orphaned, so that a later set of the sysctl * can clean them up. */ if (!ns->shm_rmid_forced) goto unlink_continue; /* * 3) get a reference to the namespace. * The refcount could be already 0. If it is 0, then * the shm objects will be free by free_ipc_work(). */ ns = get_ipc_ns_not_zero(ns); if (!ns) { unlink_continue: list_del_init(&shp->shm_clist); task_unlock(task); continue; } /* * 4) get a reference to shp. * This cannot fail: shm_clist_rm() is called before * ipc_rmid(), thus the refcount cannot be 0. */ WARN_ON(!ipc_rcu_getref(&shp->shm_perm)); /* * 5) unlink the shm segment from the list of segments * created by current. * This must be done last. After unlinking, * only the refcounts obtained above prevent IPC_RMID * from destroying the segment or the namespace. */ list_del_init(&shp->shm_clist); task_unlock(task); /* * 6) we have all references * Thus lock & if needed destroy shp. */ down_write(&shm_ids(ns).rwsem); shm_lock_by_ptr(shp); /* * rcu_read_lock was implicitly taken in shm_lock_by_ptr, it's * safe to call ipc_rcu_putref here */ ipc_rcu_putref(&shp->shm_perm, shm_rcu_free); if (ipc_valid_object(&shp->shm_perm)) { if (shm_may_destroy(shp)) shm_destroy(ns, shp); else shm_unlock(shp); } else { /* * Someone else deleted the shp from namespace * idr/kht while we have waited. * Just unlock and continue. */ shm_unlock(shp); } up_write(&shm_ids(ns).rwsem); put_ipc_ns(ns); /* paired with get_ipc_ns_not_zero */ } } static vm_fault_t shm_fault(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct shm_file_data *sfd = shm_file_data(file); return sfd->vm_ops->fault(vmf); } static int shm_may_split(struct vm_area_struct *vma, unsigned long addr) { struct file *file = vma->vm_file; struct shm_file_data *sfd = shm_file_data(file); if (sfd->vm_ops->may_split) return sfd->vm_ops->may_split(vma, addr); return 0; } static unsigned long shm_pagesize(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct shm_file_data *sfd = shm_file_data(file); if (sfd->vm_ops->pagesize) return sfd->vm_ops->pagesize(vma); return PAGE_SIZE; } #ifdef CONFIG_NUMA static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) { struct shm_file_data *sfd = shm_file_data(vma->vm_file); int err = 0; if (sfd->vm_ops->set_policy) err = sfd->vm_ops->set_policy(vma, mpol); return err; } static struct mempolicy *shm_get_policy(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx) { struct shm_file_data *sfd = shm_file_data(vma->vm_file); struct mempolicy *mpol = vma->vm_policy; if (sfd->vm_ops->get_policy) mpol = sfd->vm_ops->get_policy(vma, addr, ilx); return mpol; } #endif static int shm_mmap(struct file *file, struct vm_area_struct *vma) { struct shm_file_data *sfd = shm_file_data(file); int ret; /* * In case of remap_file_pages() emulation, the file can represent an * IPC ID that was removed, and possibly even reused by another shm * segment already. Propagate this case as an error to caller. */ ret = __shm_open(sfd); if (ret) return ret; ret = call_mmap(sfd->file, vma); if (ret) { __shm_close(sfd); return ret; } sfd->vm_ops = vma->vm_ops; #ifdef CONFIG_MMU WARN_ON(!sfd->vm_ops->fault); #endif vma->vm_ops = &shm_vm_ops; return 0; } static int shm_release(struct inode *ino, struct file *file) { struct shm_file_data *sfd = shm_file_data(file); put_ipc_ns(sfd->ns); fput(sfd->file); shm_file_data(file) = NULL; kfree(sfd); return 0; } static int shm_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct shm_file_data *sfd = shm_file_data(file); if (!sfd->file->f_op->fsync) return -EINVAL; return sfd->file->f_op->fsync(sfd->file, start, end, datasync); } static long shm_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct shm_file_data *sfd = shm_file_data(file); if (!sfd->file->f_op->fallocate) return -EOPNOTSUPP; return sfd->file->f_op->fallocate(file, mode, offset, len); } static unsigned long shm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct shm_file_data *sfd = shm_file_data(file); return sfd->file->f_op->get_unmapped_area(sfd->file, addr, len, pgoff, flags); } static const struct file_operations shm_file_operations = { .mmap = shm_mmap, .fsync = shm_fsync, .release = shm_release, .get_unmapped_area = shm_get_unmapped_area, .llseek = noop_llseek, .fallocate = shm_fallocate, }; /* * shm_file_operations_huge is now identical to shm_file_operations, * but we keep it distinct for the sake of is_file_shm_hugepages(). */ static const struct file_operations shm_file_operations_huge = { .mmap = shm_mmap, .fsync = shm_fsync, .release = shm_release, .get_unmapped_area = shm_get_unmapped_area, .llseek = noop_llseek, .fallocate = shm_fallocate, }; bool is_file_shm_hugepages(struct file *file) { return file->f_op == &shm_file_operations_huge; } static const struct vm_operations_struct shm_vm_ops = { .open = shm_open, /* callback for a new vm-area open */ .close = shm_close, /* callback for when the vm-area is released */ .fault = shm_fault, .may_split = shm_may_split, .pagesize = shm_pagesize, #if defined(CONFIG_NUMA) .set_policy = shm_set_policy, .get_policy = shm_get_policy, #endif }; /** * newseg - Create a new shared memory segment * @ns: namespace * @params: ptr to the structure that contains key, size and shmflg * * Called with shm_ids.rwsem held as a writer. */ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) { key_t key = params->key; int shmflg = params->flg; size_t size = params->u.size; int error; struct shmid_kernel *shp; size_t numpages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; struct file *file; char name[13]; vm_flags_t acctflag = 0; if (size < SHMMIN || size > ns->shm_ctlmax) return -EINVAL; if (numpages << PAGE_SHIFT < size) return -ENOSPC; if (ns->shm_tot + numpages < ns->shm_tot || ns->shm_tot + numpages > ns->shm_ctlall) return -ENOSPC; shp = kmalloc(sizeof(*shp), GFP_KERNEL_ACCOUNT); if (unlikely(!shp)) return -ENOMEM; shp->shm_perm.key = key; shp->shm_perm.mode = (shmflg & S_IRWXUGO); shp->mlock_ucounts = NULL; shp->shm_perm.security = NULL; error = security_shm_alloc(&shp->shm_perm); if (error) { kfree(shp); return error; } sprintf(name, "SYSV%08x", key); if (shmflg & SHM_HUGETLB) { struct hstate *hs; size_t hugesize; hs = hstate_sizelog((shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); if (!hs) { error = -EINVAL; goto no_file; } hugesize = ALIGN(size, huge_page_size(hs)); /* hugetlb_file_setup applies strict accounting */ if (shmflg & SHM_NORESERVE) acctflag = VM_NORESERVE; file = hugetlb_file_setup(name, hugesize, acctflag, HUGETLB_SHMFS_INODE, (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); } else { /* * Do not allow no accounting for OVERCOMMIT_NEVER, even * if it's asked for. */ if ((shmflg & SHM_NORESERVE) && sysctl_overcommit_memory != OVERCOMMIT_NEVER) acctflag = VM_NORESERVE; file = shmem_kernel_file_setup(name, size, acctflag); } error = PTR_ERR(file); if (IS_ERR(file)) goto no_file; shp->shm_cprid = get_pid(task_tgid(current)); shp->shm_lprid = NULL; shp->shm_atim = shp->shm_dtim = 0; shp->shm_ctim = ktime_get_real_seconds(); shp->shm_segsz = size; shp->shm_nattch = 0; shp->shm_file = file; shp->shm_creator = current; /* ipc_addid() locks shp upon success. */ error = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni); if (error < 0) goto no_id; shp->ns = ns; task_lock(current); list_add(&shp->shm_clist, &current->sysvshm.shm_clist); task_unlock(current); /* * shmid gets reported as "inode#" in /proc/pid/maps. * proc-ps tools use this. Changing this will break them. */ file_inode(file)->i_ino = shp->shm_perm.id; ns->shm_tot += numpages; error = shp->shm_perm.id; ipc_unlock_object(&shp->shm_perm); rcu_read_unlock(); return error; no_id: ipc_update_pid(&shp->shm_cprid, NULL); ipc_update_pid(&shp->shm_lprid, NULL); fput(file); ipc_rcu_putref(&shp->shm_perm, shm_rcu_free); return error; no_file: call_rcu(&shp->shm_perm.rcu, shm_rcu_free); return error; } /* * Called with shm_ids.rwsem and ipcp locked. */ static int shm_more_checks(struct kern_ipc_perm *ipcp, struct ipc_params *params) { struct shmid_kernel *shp; shp = container_of(ipcp, struct shmid_kernel, shm_perm); if (shp->shm_segsz < params->u.size) return -EINVAL; return 0; } long ksys_shmget(key_t key, size_t size, int shmflg) { struct ipc_namespace *ns; static const struct ipc_ops shm_ops = { .getnew = newseg, .associate = security_shm_associate, .more_checks = shm_more_checks, }; struct ipc_params shm_params; ns = current->nsproxy->ipc_ns; shm_params.key = key; shm_params.flg = shmflg; shm_params.u.size = size; return ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params); } SYSCALL_DEFINE3(shmget, key_t, key, size_t, size, int, shmflg) { return ksys_shmget(key, size, shmflg); } static inline unsigned long copy_shmid_to_user(void __user *buf, struct shmid64_ds *in, int version) { switch (version) { case IPC_64: return copy_to_user(buf, in, sizeof(*in)); case IPC_OLD: { struct shmid_ds out; memset(&out, 0, sizeof(out)); ipc64_perm_to_ipc_perm(&in->shm_perm, &out.shm_perm); out.shm_segsz = in->shm_segsz; out.shm_atime = in->shm_atime; out.shm_dtime = in->shm_dtime; out.shm_ctime = in->shm_ctime; out.shm_cpid = in->shm_cpid; out.shm_lpid = in->shm_lpid; out.shm_nattch = in->shm_nattch; return copy_to_user(buf, &out, sizeof(out)); } default: return -EINVAL; } } static inline unsigned long copy_shmid_from_user(struct shmid64_ds *out, void __user *buf, int version) { switch (version) { case IPC_64: if (copy_from_user(out, buf, sizeof(*out))) return -EFAULT; return 0; case IPC_OLD: { struct shmid_ds tbuf_old; if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old))) return -EFAULT; out->shm_perm.uid = tbuf_old.shm_perm.uid; out->shm_perm.gid = tbuf_old.shm_perm.gid; out->shm_perm.mode = tbuf_old.shm_perm.mode; return 0; } default: return -EINVAL; } } static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminfo64 *in, int version) { switch (version) { case IPC_64: return copy_to_user(buf, in, sizeof(*in)); case IPC_OLD: { struct shminfo out; if (in->shmmax > INT_MAX) out.shmmax = INT_MAX; else out.shmmax = (int)in->shmmax; out.shmmin = in->shmmin; out.shmmni = in->shmmni; out.shmseg = in->shmseg; out.shmall = in->shmall; return copy_to_user(buf, &out, sizeof(out)); } default: return -EINVAL; } } /* * Calculate and add used RSS and swap pages of a shm. * Called with shm_ids.rwsem held as a reader */ static void shm_add_rss_swap(struct shmid_kernel *shp, unsigned long *rss_add, unsigned long *swp_add) { struct inode *inode; inode = file_inode(shp->shm_file); if (is_file_hugepages(shp->shm_file)) { struct address_space *mapping = inode->i_mapping; struct hstate *h = hstate_file(shp->shm_file); *rss_add += pages_per_huge_page(h) * mapping->nrpages; } else { #ifdef CONFIG_SHMEM struct shmem_inode_info *info = SHMEM_I(inode); spin_lock_irq(&info->lock); *rss_add += inode->i_mapping->nrpages; *swp_add += info->swapped; spin_unlock_irq(&info->lock); #else *rss_add += inode->i_mapping->nrpages; #endif } } /* * Called with shm_ids.rwsem held as a reader */ static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss, unsigned long *swp) { int next_id; int total, in_use; *rss = 0; *swp = 0; in_use = shm_ids(ns).in_use; for (total = 0, next_id = 0; total < in_use; next_id++) { struct kern_ipc_perm *ipc; struct shmid_kernel *shp; ipc = idr_find(&shm_ids(ns).ipcs_idr, next_id); if (ipc == NULL) continue; shp = container_of(ipc, struct shmid_kernel, shm_perm); shm_add_rss_swap(shp, rss, swp); total++; } } /* * This function handles some shmctl commands which require the rwsem * to be held in write mode. * NOTE: no locks must be held, the rwsem is taken inside this function. */ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, struct shmid64_ds *shmid64) { struct kern_ipc_perm *ipcp; struct shmid_kernel *shp; int err; down_write(&shm_ids(ns).rwsem); rcu_read_lock(); ipcp = ipcctl_obtain_check(ns, &shm_ids(ns), shmid, cmd, &shmid64->shm_perm, 0); if (IS_ERR(ipcp)) { err = PTR_ERR(ipcp); goto out_unlock1; } shp = container_of(ipcp, struct shmid_kernel, shm_perm); err = security_shm_shmctl(&shp->shm_perm, cmd); if (err) goto out_unlock1; switch (cmd) { case IPC_RMID: ipc_lock_object(&shp->shm_perm); /* do_shm_rmid unlocks the ipc object and rcu */ do_shm_rmid(ns, ipcp); goto out_up; case IPC_SET: ipc_lock_object(&shp->shm_perm); err = ipc_update_perm(&shmid64->shm_perm, ipcp); if (err) goto out_unlock0; shp->shm_ctim = ktime_get_real_seconds(); break; default: err = -EINVAL; goto out_unlock1; } out_unlock0: ipc_unlock_object(&shp->shm_perm); out_unlock1: rcu_read_unlock(); out_up: up_write(&shm_ids(ns).rwsem); return err; } static int shmctl_ipc_info(struct ipc_namespace *ns, struct shminfo64 *shminfo) { int err = security_shm_shmctl(NULL, IPC_INFO); if (!err) { memset(shminfo, 0, sizeof(*shminfo)); shminfo->shmmni = shminfo->shmseg = ns->shm_ctlmni; shminfo->shmmax = ns->shm_ctlmax; shminfo->shmall = ns->shm_ctlall; shminfo->shmmin = SHMMIN; down_read(&shm_ids(ns).rwsem); err = ipc_get_maxidx(&shm_ids(ns)); up_read(&shm_ids(ns).rwsem); if (err < 0) err = 0; } return err; } static int shmctl_shm_info(struct ipc_namespace *ns, struct shm_info *shm_info) { int err = security_shm_shmctl(NULL, SHM_INFO); if (!err) { memset(shm_info, 0, sizeof(*shm_info)); down_read(&shm_ids(ns).rwsem); shm_info->used_ids = shm_ids(ns).in_use; shm_get_stat(ns, &shm_info->shm_rss, &shm_info->shm_swp); shm_info->shm_tot = ns->shm_tot; shm_info->swap_attempts = 0; shm_info->swap_successes = 0; err = ipc_get_maxidx(&shm_ids(ns)); up_read(&shm_ids(ns).rwsem); if (err < 0) err = 0; } return err; } static int shmctl_stat(struct ipc_namespace *ns, int shmid, int cmd, struct shmid64_ds *tbuf) { struct shmid_kernel *shp; int err; memset(tbuf, 0, sizeof(*tbuf)); rcu_read_lock(); if (cmd == SHM_STAT || cmd == SHM_STAT_ANY) { shp = shm_obtain_object(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); goto out_unlock; } } else { /* IPC_STAT */ shp = shm_obtain_object_check(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); goto out_unlock; } } /* * Semantically SHM_STAT_ANY ought to be identical to * that functionality provided by the /proc/sysvipc/ * interface. As such, only audit these calls and * do not do traditional S_IRUGO permission checks on * the ipc object. */ if (cmd == SHM_STAT_ANY) audit_ipc_obj(&shp->shm_perm); else { err = -EACCES; if (ipcperms(ns, &shp->shm_perm, S_IRUGO)) goto out_unlock; } err = security_shm_shmctl(&shp->shm_perm, cmd); if (err) goto out_unlock; ipc_lock_object(&shp->shm_perm); if (!ipc_valid_object(&shp->shm_perm)) { ipc_unlock_object(&shp->shm_perm); err = -EIDRM; goto out_unlock; } kernel_to_ipc64_perm(&shp->shm_perm, &tbuf->shm_perm); tbuf->shm_segsz = shp->shm_segsz; tbuf->shm_atime = shp->shm_atim; tbuf->shm_dtime = shp->shm_dtim; tbuf->shm_ctime = shp->shm_ctim; #ifndef CONFIG_64BIT tbuf->shm_atime_high = shp->shm_atim >> 32; tbuf->shm_dtime_high = shp->shm_dtim >> 32; tbuf->shm_ctime_high = shp->shm_ctim >> 32; #endif tbuf->shm_cpid = pid_vnr(shp->shm_cprid); tbuf->shm_lpid = pid_vnr(shp->shm_lprid); tbuf->shm_nattch = shp->shm_nattch; if (cmd == IPC_STAT) { /* * As defined in SUS: * Return 0 on success */ err = 0; } else { /* * SHM_STAT and SHM_STAT_ANY (both Linux specific) * Return the full id, including the sequence number */ err = shp->shm_perm.id; } ipc_unlock_object(&shp->shm_perm); out_unlock: rcu_read_unlock(); return err; } static int shmctl_do_lock(struct ipc_namespace *ns, int shmid, int cmd) { struct shmid_kernel *shp; struct file *shm_file; int err; rcu_read_lock(); shp = shm_obtain_object_check(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); goto out_unlock1; } audit_ipc_obj(&(shp->shm_perm)); err = security_shm_shmctl(&shp->shm_perm, cmd); if (err) goto out_unlock1; ipc_lock_object(&shp->shm_perm); /* check if shm_destroy() is tearing down shp */ if (!ipc_valid_object(&shp->shm_perm)) { err = -EIDRM; goto out_unlock0; } if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) { kuid_t euid = current_euid(); if (!uid_eq(euid, shp->shm_perm.uid) && !uid_eq(euid, shp->shm_perm.cuid)) { err = -EPERM; goto out_unlock0; } if (cmd == SHM_LOCK && !rlimit(RLIMIT_MEMLOCK)) { err = -EPERM; goto out_unlock0; } } shm_file = shp->shm_file; if (is_file_hugepages(shm_file)) goto out_unlock0; if (cmd == SHM_LOCK) { struct ucounts *ucounts = current_ucounts(); err = shmem_lock(shm_file, 1, ucounts); if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) { shp->shm_perm.mode |= SHM_LOCKED; shp->mlock_ucounts = ucounts; } goto out_unlock0; } /* SHM_UNLOCK */ if (!(shp->shm_perm.mode & SHM_LOCKED)) goto out_unlock0; shmem_lock(shm_file, 0, shp->mlock_ucounts); shp->shm_perm.mode &= ~SHM_LOCKED; shp->mlock_ucounts = NULL; get_file(shm_file); ipc_unlock_object(&shp->shm_perm); rcu_read_unlock(); shmem_unlock_mapping(shm_file->f_mapping); fput(shm_file); return err; out_unlock0: ipc_unlock_object(&shp->shm_perm); out_unlock1: rcu_read_unlock(); return err; } static long ksys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf, int version) { int err; struct ipc_namespace *ns; struct shmid64_ds sem64; if (cmd < 0 || shmid < 0) return -EINVAL; ns = current->nsproxy->ipc_ns; switch (cmd) { case IPC_INFO: { struct shminfo64 shminfo; err = shmctl_ipc_info(ns, &shminfo); if (err < 0) return err; if (copy_shminfo_to_user(buf, &shminfo, version)) err = -EFAULT; return err; } case SHM_INFO: { struct shm_info shm_info; err = shmctl_shm_info(ns, &shm_info); if (err < 0) return err; if (copy_to_user(buf, &shm_info, sizeof(shm_info))) err = -EFAULT; return err; } case SHM_STAT: case SHM_STAT_ANY: case IPC_STAT: { err = shmctl_stat(ns, shmid, cmd, &sem64); if (err < 0) return err; if (copy_shmid_to_user(buf, &sem64, version)) err = -EFAULT; return err; } case IPC_SET: if (copy_shmid_from_user(&sem64, buf, version)) return -EFAULT; fallthrough; case IPC_RMID: return shmctl_down(ns, shmid, cmd, &sem64); case SHM_LOCK: case SHM_UNLOCK: return shmctl_do_lock(ns, shmid, cmd); default: return -EINVAL; } } SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) { return ksys_shmctl(shmid, cmd, buf, IPC_64); } #ifdef CONFIG_ARCH_WANT_IPC_PARSE_VERSION long ksys_old_shmctl(int shmid, int cmd, struct shmid_ds __user *buf) { int version = ipc_parse_version(&cmd); return ksys_shmctl(shmid, cmd, buf, version); } SYSCALL_DEFINE3(old_shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) { return ksys_old_shmctl(shmid, cmd, buf); } #endif #ifdef CONFIG_COMPAT struct compat_shmid_ds { struct compat_ipc_perm shm_perm; int shm_segsz; old_time32_t shm_atime; old_time32_t shm_dtime; old_time32_t shm_ctime; compat_ipc_pid_t shm_cpid; compat_ipc_pid_t shm_lpid; unsigned short shm_nattch; unsigned short shm_unused; compat_uptr_t shm_unused2; compat_uptr_t shm_unused3; }; struct compat_shminfo64 { compat_ulong_t shmmax; compat_ulong_t shmmin; compat_ulong_t shmmni; compat_ulong_t shmseg; compat_ulong_t shmall; compat_ulong_t __unused1; compat_ulong_t __unused2; compat_ulong_t __unused3; compat_ulong_t __unused4; }; struct compat_shm_info { compat_int_t used_ids; compat_ulong_t shm_tot, shm_rss, shm_swp; compat_ulong_t swap_attempts, swap_successes; }; static int copy_compat_shminfo_to_user(void __user *buf, struct shminfo64 *in, int version) { if (in->shmmax > INT_MAX) in->shmmax = INT_MAX; if (version == IPC_64) { struct compat_shminfo64 info; memset(&info, 0, sizeof(info)); info.shmmax = in->shmmax; info.shmmin = in->shmmin; info.shmmni = in->shmmni; info.shmseg = in->shmseg; info.shmall = in->shmall; return copy_to_user(buf, &info, sizeof(info)); } else { struct shminfo info; memset(&info, 0, sizeof(info)); info.shmmax = in->shmmax; info.shmmin = in->shmmin; info.shmmni = in->shmmni; info.shmseg = in->shmseg; info.shmall = in->shmall; return copy_to_user(buf, &info, sizeof(info)); } } static int put_compat_shm_info(struct shm_info *ip, struct compat_shm_info __user *uip) { struct compat_shm_info info; memset(&info, 0, sizeof(info)); info.used_ids = ip->used_ids; info.shm_tot = ip->shm_tot; info.shm_rss = ip->shm_rss; info.shm_swp = ip->shm_swp; info.swap_attempts = ip->swap_attempts; info.swap_successes = ip->swap_successes; return copy_to_user(uip, &info, sizeof(info)); } static int copy_compat_shmid_to_user(void __user *buf, struct shmid64_ds *in, int version) { if (version == IPC_64) { struct compat_shmid64_ds v; memset(&v, 0, sizeof(v)); to_compat_ipc64_perm(&v.shm_perm, &in->shm_perm); v.shm_atime = lower_32_bits(in->shm_atime); v.shm_atime_high = upper_32_bits(in->shm_atime); v.shm_dtime = lower_32_bits(in->shm_dtime); v.shm_dtime_high = upper_32_bits(in->shm_dtime); v.shm_ctime = lower_32_bits(in->shm_ctime); v.shm_ctime_high = upper_32_bits(in->shm_ctime); v.shm_segsz = in->shm_segsz; v.shm_nattch = in->shm_nattch; v.shm_cpid = in->shm_cpid; v.shm_lpid = in->shm_lpid; return copy_to_user(buf, &v, sizeof(v)); } else { struct compat_shmid_ds v; memset(&v, 0, sizeof(v)); to_compat_ipc_perm(&v.shm_perm, &in->shm_perm); v.shm_perm.key = in->shm_perm.key; v.shm_atime = in->shm_atime; v.shm_dtime = in->shm_dtime; v.shm_ctime = in->shm_ctime; v.shm_segsz = in->shm_segsz; v.shm_nattch = in->shm_nattch; v.shm_cpid = in->shm_cpid; v.shm_lpid = in->shm_lpid; return copy_to_user(buf, &v, sizeof(v)); } } static int copy_compat_shmid_from_user(struct shmid64_ds *out, void __user *buf, int version) { memset(out, 0, sizeof(*out)); if (version == IPC_64) { struct compat_shmid64_ds __user *p = buf; return get_compat_ipc64_perm(&out->shm_perm, &p->shm_perm); } else { struct compat_shmid_ds __user *p = buf; return get_compat_ipc_perm(&out->shm_perm, &p->shm_perm); } } static long compat_ksys_shmctl(int shmid, int cmd, void __user *uptr, int version) { struct ipc_namespace *ns; struct shmid64_ds sem64; int err; ns = current->nsproxy->ipc_ns; if (cmd < 0 || shmid < 0) return -EINVAL; switch (cmd) { case IPC_INFO: { struct shminfo64 shminfo; err = shmctl_ipc_info(ns, &shminfo); if (err < 0) return err; if (copy_compat_shminfo_to_user(uptr, &shminfo, version)) err = -EFAULT; return err; } case SHM_INFO: { struct shm_info shm_info; err = shmctl_shm_info(ns, &shm_info); if (err < 0) return err; if (put_compat_shm_info(&shm_info, uptr)) err = -EFAULT; return err; } case IPC_STAT: case SHM_STAT_ANY: case SHM_STAT: err = shmctl_stat(ns, shmid, cmd, &sem64); if (err < 0) return err; if (copy_compat_shmid_to_user(uptr, &sem64, version)) err = -EFAULT; return err; case IPC_SET: if (copy_compat_shmid_from_user(&sem64, uptr, version)) return -EFAULT; fallthrough; case IPC_RMID: return shmctl_down(ns, shmid, cmd, &sem64); case SHM_LOCK: case SHM_UNLOCK: return shmctl_do_lock(ns, shmid, cmd); default: return -EINVAL; } return err; } COMPAT_SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, void __user *, uptr) { return compat_ksys_shmctl(shmid, cmd, uptr, IPC_64); } #ifdef CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION long compat_ksys_old_shmctl(int shmid, int cmd, void __user *uptr) { int version = compat_ipc_parse_version(&cmd); return compat_ksys_shmctl(shmid, cmd, uptr, version); } COMPAT_SYSCALL_DEFINE3(old_shmctl, int, shmid, int, cmd, void __user *, uptr) { return compat_ksys_old_shmctl(shmid, cmd, uptr); } #endif #endif /* * Fix shmaddr, allocate descriptor, map shm, add attach descriptor to lists. * * NOTE! Despite the name, this is NOT a direct system call entrypoint. The * "raddr" thing points to kernel space, and there has to be a wrapper around * this. */ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, unsigned long shmlba) { struct shmid_kernel *shp; unsigned long addr = (unsigned long)shmaddr; unsigned long size; struct file *file, *base; int err; unsigned long flags = MAP_SHARED; unsigned long prot; int acc_mode; struct ipc_namespace *ns; struct shm_file_data *sfd; int f_flags; unsigned long populate = 0; err = -EINVAL; if (shmid < 0) goto out; if (addr) { if (addr & (shmlba - 1)) { if (shmflg & SHM_RND) { addr &= ~(shmlba - 1); /* round down */ /* * Ensure that the round-down is non-nil * when remapping. This can happen for * cases when addr < shmlba. */ if (!addr && (shmflg & SHM_REMAP)) goto out; } else #ifndef __ARCH_FORCE_SHMLBA if (addr & ~PAGE_MASK) #endif goto out; } flags |= MAP_FIXED; } else if ((shmflg & SHM_REMAP)) goto out; if (shmflg & SHM_RDONLY) { prot = PROT_READ; acc_mode = S_IRUGO; f_flags = O_RDONLY; } else { prot = PROT_READ | PROT_WRITE; acc_mode = S_IRUGO | S_IWUGO; f_flags = O_RDWR; } if (shmflg & SHM_EXEC) { prot |= PROT_EXEC; acc_mode |= S_IXUGO; } /* * We cannot rely on the fs check since SYSV IPC does have an * additional creator id... */ ns = current->nsproxy->ipc_ns; rcu_read_lock(); shp = shm_obtain_object_check(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); goto out_unlock; } err = -EACCES; if (ipcperms(ns, &shp->shm_perm, acc_mode)) goto out_unlock; err = security_shm_shmat(&shp->shm_perm, shmaddr, shmflg); if (err) goto out_unlock; ipc_lock_object(&shp->shm_perm); /* check if shm_destroy() is tearing down shp */ if (!ipc_valid_object(&shp->shm_perm)) { ipc_unlock_object(&shp->shm_perm); err = -EIDRM; goto out_unlock; } /* * We need to take a reference to the real shm file to prevent the * pointer from becoming stale in cases where the lifetime of the outer * file extends beyond that of the shm segment. It's not usually * possible, but it can happen during remap_file_pages() emulation as * that unmaps the memory, then does ->mmap() via file reference only. * We'll deny the ->mmap() if the shm segment was since removed, but to * detect shm ID reuse we need to compare the file pointers. */ base = get_file(shp->shm_file); shp->shm_nattch++; size = i_size_read(file_inode(base)); ipc_unlock_object(&shp->shm_perm); rcu_read_unlock(); err = -ENOMEM; sfd = kzalloc(sizeof(*sfd), GFP_KERNEL); if (!sfd) { fput(base); goto out_nattch; } file = alloc_file_clone(base, f_flags, is_file_hugepages(base) ? &shm_file_operations_huge : &shm_file_operations); err = PTR_ERR(file); if (IS_ERR(file)) { kfree(sfd); fput(base); goto out_nattch; } sfd->id = shp->shm_perm.id; sfd->ns = get_ipc_ns(ns); sfd->file = base; sfd->vm_ops = NULL; file->private_data = sfd; err = security_mmap_file(file, prot, flags); if (err) goto out_fput; if (mmap_write_lock_killable(current->mm)) { err = -EINTR; goto out_fput; } if (addr && !(shmflg & SHM_REMAP)) { err = -EINVAL; if (addr + size < addr) goto invalid; if (find_vma_intersection(current->mm, addr, addr + size)) goto invalid; } addr = do_mmap(file, addr, size, prot, flags, 0, 0, &populate, NULL); *raddr = addr; err = 0; if (IS_ERR_VALUE(addr)) err = (long)addr; invalid: mmap_write_unlock(current->mm); if (populate) mm_populate(addr, populate); out_fput: fput(file); out_nattch: down_write(&shm_ids(ns).rwsem); shp = shm_lock(ns, shmid); shp->shm_nattch--; if (shm_may_destroy(shp)) shm_destroy(ns, shp); else shm_unlock(shp); up_write(&shm_ids(ns).rwsem); return err; out_unlock: rcu_read_unlock(); out: return err; } SYSCALL_DEFINE3(shmat, int, shmid, char __user *, shmaddr, int, shmflg) { unsigned long ret; long err; err = do_shmat(shmid, shmaddr, shmflg, &ret, SHMLBA); if (err) return err; force_successful_syscall_return(); return (long)ret; } #ifdef CONFIG_COMPAT #ifndef COMPAT_SHMLBA #define COMPAT_SHMLBA SHMLBA #endif COMPAT_SYSCALL_DEFINE3(shmat, int, shmid, compat_uptr_t, shmaddr, int, shmflg) { unsigned long ret; long err; err = do_shmat(shmid, compat_ptr(shmaddr), shmflg, &ret, COMPAT_SHMLBA); if (err) return err; force_successful_syscall_return(); return (long)ret; } #endif /* * detach and kill segment if marked destroyed. * The work is done in shm_close. */ long ksys_shmdt(char __user *shmaddr) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long addr = (unsigned long)shmaddr; int retval = -EINVAL; #ifdef CONFIG_MMU loff_t size = 0; struct file *file; VMA_ITERATOR(vmi, mm, addr); #endif if (addr & ~PAGE_MASK) return retval; if (mmap_write_lock_killable(mm)) return -EINTR; /* * This function tries to be smart and unmap shm segments that * were modified by partial mlock or munmap calls: * - It first determines the size of the shm segment that should be * unmapped: It searches for a vma that is backed by shm and that * started at address shmaddr. It records it's size and then unmaps * it. * - Then it unmaps all shm vmas that started at shmaddr and that * are within the initially determined size and that are from the * same shm segment from which we determined the size. * Errors from do_munmap are ignored: the function only fails if * it's called with invalid parameters or if it's called to unmap * a part of a vma. Both calls in this function are for full vmas, * the parameters are directly copied from the vma itself and always * valid - therefore do_munmap cannot fail. (famous last words?) */ /* * If it had been mremap()'d, the starting address would not * match the usual checks anyway. So assume all vma's are * above the starting address given. */ #ifdef CONFIG_MMU for_each_vma(vmi, vma) { /* * Check if the starting address would match, i.e. it's * a fragment created by mprotect() and/or munmap(), or it * otherwise it starts at this address with no hassles. */ if ((vma->vm_ops == &shm_vm_ops) && (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) { /* * Record the file of the shm segment being * unmapped. With mremap(), someone could place * page from another segment but with equal offsets * in the range we are unmapping. */ file = vma->vm_file; size = i_size_read(file_inode(vma->vm_file)); do_vma_munmap(&vmi, vma, vma->vm_start, vma->vm_end, NULL, false); /* * We discovered the size of the shm segment, so * break out of here and fall through to the next * loop that uses the size information to stop * searching for matching vma's. */ retval = 0; vma = vma_next(&vmi); break; } } /* * We need look no further than the maximum address a fragment * could possibly have landed at. Also cast things to loff_t to * prevent overflows and make comparisons vs. equal-width types. */ size = PAGE_ALIGN(size); while (vma && (loff_t)(vma->vm_end - addr) <= size) { /* finding a matching vma now does not alter retval */ if ((vma->vm_ops == &shm_vm_ops) && ((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) && (vma->vm_file == file)) { do_vma_munmap(&vmi, vma, vma->vm_start, vma->vm_end, NULL, false); } vma = vma_next(&vmi); } #else /* CONFIG_MMU */ vma = vma_lookup(mm, addr); /* under NOMMU conditions, the exact address to be destroyed must be * given */ if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) { do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL); retval = 0; } #endif mmap_write_unlock(mm); return retval; } SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) { return ksys_shmdt(shmaddr); } #ifdef CONFIG_PROC_FS static int sysvipc_shm_proc_show(struct seq_file *s, void *it) { struct pid_namespace *pid_ns = ipc_seq_pid_ns(s); struct user_namespace *user_ns = seq_user_ns(s); struct kern_ipc_perm *ipcp = it; struct shmid_kernel *shp; unsigned long rss = 0, swp = 0; shp = container_of(ipcp, struct shmid_kernel, shm_perm); shm_add_rss_swap(shp, &rss, &swp); #if BITS_PER_LONG <= 32 #define SIZE_SPEC "%10lu" #else #define SIZE_SPEC "%21lu" #endif seq_printf(s, "%10d %10d %4o " SIZE_SPEC " %5u %5u " "%5lu %5u %5u %5u %5u %10llu %10llu %10llu " SIZE_SPEC " " SIZE_SPEC "\n", shp->shm_perm.key, shp->shm_perm.id, shp->shm_perm.mode, shp->shm_segsz, pid_nr_ns(shp->shm_cprid, pid_ns), pid_nr_ns(shp->shm_lprid, pid_ns), shp->shm_nattch, from_kuid_munged(user_ns, shp->shm_perm.uid), from_kgid_munged(user_ns, shp->shm_perm.gid), from_kuid_munged(user_ns, shp->shm_perm.cuid), from_kgid_munged(user_ns, shp->shm_perm.cgid), shp->shm_atim, shp->shm_dtim, shp->shm_ctim, rss * PAGE_SIZE, swp * PAGE_SIZE); return 0; } #endif
59 1 4 1864 1 1 1 1 1 4 4 4 4 1865 1864 1865 5 4 4 1 1 1 1 1 1 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 // SPDX-License-Identifier: GPL-2.0-only /* * CAIF Interface registration. * Copyright (C) ST-Ericsson AB 2010 * Author: Sjur Brendeland * * Borrowed heavily from file: pn_dev.c. Thanks to Remi Denis-Courmont * and Sakari Ailus <sakari.ailus@nokia.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/kernel.h> #include <linux/if_arp.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/mutex.h> #include <linux/module.h> #include <linux/spinlock.h> #include <net/netns/generic.h> #include <net/net_namespace.h> #include <net/pkt_sched.h> #include <net/caif/caif_device.h> #include <net/caif/caif_layer.h> #include <net/caif/caif_dev.h> #include <net/caif/cfpkt.h> #include <net/caif/cfcnfg.h> #include <net/caif/cfserl.h> MODULE_DESCRIPTION("ST-Ericsson CAIF modem protocol support"); MODULE_LICENSE("GPL"); /* Used for local tracking of the CAIF net devices */ struct caif_device_entry { struct cflayer layer; struct list_head list; struct net_device *netdev; int __percpu *pcpu_refcnt; spinlock_t flow_lock; struct sk_buff *xoff_skb; void (*xoff_skb_dtor)(struct sk_buff *skb); bool xoff; }; struct caif_device_entry_list { struct list_head list; /* Protects simulanous deletes in list */ struct mutex lock; }; struct caif_net { struct cfcnfg *cfg; struct caif_device_entry_list caifdevs; }; static unsigned int caif_net_id; static int q_high = 50; /* Percent */ struct cfcnfg *get_cfcnfg(struct net *net) { struct caif_net *caifn; caifn = net_generic(net, caif_net_id); return caifn->cfg; } EXPORT_SYMBOL(get_cfcnfg); static struct caif_device_entry_list *caif_device_list(struct net *net) { struct caif_net *caifn; caifn = net_generic(net, caif_net_id); return &caifn->caifdevs; } static void caifd_put(struct caif_device_entry *e) { this_cpu_dec(*e->pcpu_refcnt); } static void caifd_hold(struct caif_device_entry *e) { this_cpu_inc(*e->pcpu_refcnt); } static int caifd_refcnt_read(struct caif_device_entry *e) { int i, refcnt = 0; for_each_possible_cpu(i) refcnt += *per_cpu_ptr(e->pcpu_refcnt, i); return refcnt; } /* Allocate new CAIF device. */ static struct caif_device_entry *caif_device_alloc(struct net_device *dev) { struct caif_device_entry *caifd; caifd = kzalloc(sizeof(*caifd), GFP_KERNEL); if (!caifd) return NULL; caifd->pcpu_refcnt = alloc_percpu(int); if (!caifd->pcpu_refcnt) { kfree(caifd); return NULL; } caifd->netdev = dev; dev_hold(dev); return caifd; } static struct caif_device_entry *caif_get(struct net_device *dev) { struct caif_device_entry_list *caifdevs = caif_device_list(dev_net(dev)); struct caif_device_entry *caifd; list_for_each_entry_rcu(caifd, &caifdevs->list, list, lockdep_rtnl_is_held()) { if (caifd->netdev == dev) return caifd; } return NULL; } static void caif_flow_cb(struct sk_buff *skb) { struct caif_device_entry *caifd; void (*dtor)(struct sk_buff *skb) = NULL; bool send_xoff; WARN_ON(skb->dev == NULL); rcu_read_lock(); caifd = caif_get(skb->dev); WARN_ON(caifd == NULL); if (!caifd) { rcu_read_unlock(); return; } caifd_hold(caifd); rcu_read_unlock(); spin_lock_bh(&caifd->flow_lock); send_xoff = caifd->xoff; caifd->xoff = false; dtor = caifd->xoff_skb_dtor; if (WARN_ON(caifd->xoff_skb != skb)) skb = NULL; caifd->xoff_skb = NULL; caifd->xoff_skb_dtor = NULL; spin_unlock_bh(&caifd->flow_lock); if (dtor && skb) dtor(skb); if (send_xoff) caifd->layer.up-> ctrlcmd(caifd->layer.up, _CAIF_CTRLCMD_PHYIF_FLOW_ON_IND, caifd->layer.id); caifd_put(caifd); } static int transmit(struct cflayer *layer, struct cfpkt *pkt) { int err, high = 0, qlen = 0; struct caif_device_entry *caifd = container_of(layer, struct caif_device_entry, layer); struct sk_buff *skb; struct netdev_queue *txq; rcu_read_lock_bh(); skb = cfpkt_tonative(pkt); skb->dev = caifd->netdev; skb_reset_network_header(skb); skb->protocol = htons(ETH_P_CAIF); /* Check if we need to handle xoff */ if (likely(caifd->netdev->priv_flags & IFF_NO_QUEUE)) goto noxoff; if (unlikely(caifd->xoff)) goto noxoff; if (likely(!netif_queue_stopped(caifd->netdev))) { struct Qdisc *sch; /* If we run with a TX queue, check if the queue is too long*/ txq = netdev_get_tx_queue(skb->dev, 0); sch = rcu_dereference_bh(txq->qdisc); if (likely(qdisc_is_empty(sch))) goto noxoff; /* can check for explicit qdisc len value only !NOLOCK, * always set flow off otherwise */ high = (caifd->netdev->tx_queue_len * q_high) / 100; if (!(sch->flags & TCQ_F_NOLOCK) && likely(sch->q.qlen < high)) goto noxoff; } /* Hold lock while accessing xoff */ spin_lock_bh(&caifd->flow_lock); if (caifd->xoff) { spin_unlock_bh(&caifd->flow_lock); goto noxoff; } /* * Handle flow off, we do this by temporary hi-jacking this * skb's destructor function, and replace it with our own * flow-on callback. The callback will set flow-on and call * the original destructor. */ pr_debug("queue has stopped(%d) or is full (%d > %d)\n", netif_queue_stopped(caifd->netdev), qlen, high); caifd->xoff = true; caifd->xoff_skb = skb; caifd->xoff_skb_dtor = skb->destructor; skb->destructor = caif_flow_cb; spin_unlock_bh(&caifd->flow_lock); caifd->layer.up->ctrlcmd(caifd->layer.up, _CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND, caifd->layer.id); noxoff: rcu_read_unlock_bh(); err = dev_queue_xmit(skb); if (err > 0) err = -EIO; return err; } /* * Stuff received packets into the CAIF stack. * On error, returns non-zero and releases the skb. */ static int receive(struct sk_buff *skb, struct net_device *dev, struct packet_type *pkttype, struct net_device *orig_dev) { struct cfpkt *pkt; struct caif_device_entry *caifd; int err; pkt = cfpkt_fromnative(CAIF_DIR_IN, skb); rcu_read_lock(); caifd = caif_get(dev); if (!caifd || !caifd->layer.up || !caifd->layer.up->receive || !netif_oper_up(caifd->netdev)) { rcu_read_unlock(); kfree_skb(skb); return NET_RX_DROP; } /* Hold reference to netdevice while using CAIF stack */ caifd_hold(caifd); rcu_read_unlock(); err = caifd->layer.up->receive(caifd->layer.up, pkt); /* For -EILSEQ the packet is not freed so free it now */ if (err == -EILSEQ) cfpkt_destroy(pkt); /* Release reference to stack upwards */ caifd_put(caifd); if (err != 0) err = NET_RX_DROP; return err; } static struct packet_type caif_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_CAIF), .func = receive, }; static void dev_flowctrl(struct net_device *dev, int on) { struct caif_device_entry *caifd; rcu_read_lock(); caifd = caif_get(dev); if (!caifd || !caifd->layer.up || !caifd->layer.up->ctrlcmd) { rcu_read_unlock(); return; } caifd_hold(caifd); rcu_read_unlock(); caifd->layer.up->ctrlcmd(caifd->layer.up, on ? _CAIF_CTRLCMD_PHYIF_FLOW_ON_IND : _CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND, caifd->layer.id); caifd_put(caifd); } int caif_enroll_dev(struct net_device *dev, struct caif_dev_common *caifdev, struct cflayer *link_support, int head_room, struct cflayer **layer, int (**rcv_func)(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *)) { struct caif_device_entry *caifd; enum cfcnfg_phy_preference pref; struct cfcnfg *cfg = get_cfcnfg(dev_net(dev)); struct caif_device_entry_list *caifdevs; int res; caifdevs = caif_device_list(dev_net(dev)); caifd = caif_device_alloc(dev); if (!caifd) return -ENOMEM; *layer = &caifd->layer; spin_lock_init(&caifd->flow_lock); switch (caifdev->link_select) { case CAIF_LINK_HIGH_BANDW: pref = CFPHYPREF_HIGH_BW; break; case CAIF_LINK_LOW_LATENCY: pref = CFPHYPREF_LOW_LAT; break; default: pref = CFPHYPREF_HIGH_BW; break; } mutex_lock(&caifdevs->lock); list_add_rcu(&caifd->list, &caifdevs->list); strscpy(caifd->layer.name, dev->name, sizeof(caifd->layer.name)); caifd->layer.transmit = transmit; res = cfcnfg_add_phy_layer(cfg, dev, &caifd->layer, pref, link_support, caifdev->use_fcs, head_room); mutex_unlock(&caifdevs->lock); if (rcv_func) *rcv_func = receive; return res; } EXPORT_SYMBOL(caif_enroll_dev); /* notify Caif of device events */ static int caif_device_notify(struct notifier_block *me, unsigned long what, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct caif_device_entry *caifd = NULL; struct caif_dev_common *caifdev; struct cfcnfg *cfg; struct cflayer *layer, *link_support; int head_room = 0; struct caif_device_entry_list *caifdevs; int res; cfg = get_cfcnfg(dev_net(dev)); caifdevs = caif_device_list(dev_net(dev)); caifd = caif_get(dev); if (caifd == NULL && dev->type != ARPHRD_CAIF) return 0; switch (what) { case NETDEV_REGISTER: if (caifd != NULL) break; caifdev = netdev_priv(dev); link_support = NULL; if (caifdev->use_frag) { head_room = 1; link_support = cfserl_create(dev->ifindex, caifdev->use_stx); if (!link_support) { pr_warn("Out of memory\n"); break; } } res = caif_enroll_dev(dev, caifdev, link_support, head_room, &layer, NULL); if (res) cfserl_release(link_support); caifdev->flowctrl = dev_flowctrl; break; case NETDEV_UP: rcu_read_lock(); caifd = caif_get(dev); if (caifd == NULL) { rcu_read_unlock(); break; } caifd->xoff = false; cfcnfg_set_phy_state(cfg, &caifd->layer, true); rcu_read_unlock(); break; case NETDEV_DOWN: rcu_read_lock(); caifd = caif_get(dev); if (!caifd || !caifd->layer.up || !caifd->layer.up->ctrlcmd) { rcu_read_unlock(); return -EINVAL; } cfcnfg_set_phy_state(cfg, &caifd->layer, false); caifd_hold(caifd); rcu_read_unlock(); caifd->layer.up->ctrlcmd(caifd->layer.up, _CAIF_CTRLCMD_PHYIF_DOWN_IND, caifd->layer.id); spin_lock_bh(&caifd->flow_lock); /* * Replace our xoff-destructor with original destructor. * We trust that skb->destructor *always* is called before * the skb reference is invalid. The hijacked SKB destructor * takes the flow_lock so manipulating the skb->destructor here * should be safe. */ if (caifd->xoff_skb_dtor != NULL && caifd->xoff_skb != NULL) caifd->xoff_skb->destructor = caifd->xoff_skb_dtor; caifd->xoff = false; caifd->xoff_skb_dtor = NULL; caifd->xoff_skb = NULL; spin_unlock_bh(&caifd->flow_lock); caifd_put(caifd); break; case NETDEV_UNREGISTER: mutex_lock(&caifdevs->lock); caifd = caif_get(dev); if (caifd == NULL) { mutex_unlock(&caifdevs->lock); break; } list_del_rcu(&caifd->list); /* * NETDEV_UNREGISTER is called repeatedly until all reference * counts for the net-device are released. If references to * caifd is taken, simply ignore NETDEV_UNREGISTER and wait for * the next call to NETDEV_UNREGISTER. * * If any packets are in flight down the CAIF Stack, * cfcnfg_del_phy_layer will return nonzero. * If no packets are in flight, the CAIF Stack associated * with the net-device un-registering is freed. */ if (caifd_refcnt_read(caifd) != 0 || cfcnfg_del_phy_layer(cfg, &caifd->layer) != 0) { pr_info("Wait for device inuse\n"); /* Enrole device if CAIF Stack is still in use */ list_add_rcu(&caifd->list, &caifdevs->list); mutex_unlock(&caifdevs->lock); break; } synchronize_rcu(); dev_put(caifd->netdev); free_percpu(caifd->pcpu_refcnt); kfree(caifd); mutex_unlock(&caifdevs->lock); break; } return 0; } static struct notifier_block caif_device_notifier = { .notifier_call = caif_device_notify, .priority = 0, }; /* Per-namespace Caif devices handling */ static int caif_init_net(struct net *net) { struct caif_net *caifn = net_generic(net, caif_net_id); INIT_LIST_HEAD(&caifn->caifdevs.list); mutex_init(&caifn->caifdevs.lock); caifn->cfg = cfcnfg_create(); if (!caifn->cfg) return -ENOMEM; return 0; } static void caif_exit_net(struct net *net) { struct caif_device_entry *caifd, *tmp; struct caif_device_entry_list *caifdevs = caif_device_list(net); struct cfcnfg *cfg = get_cfcnfg(net); rtnl_lock(); mutex_lock(&caifdevs->lock); list_for_each_entry_safe(caifd, tmp, &caifdevs->list, list) { int i = 0; list_del_rcu(&caifd->list); cfcnfg_set_phy_state(cfg, &caifd->layer, false); while (i < 10 && (caifd_refcnt_read(caifd) != 0 || cfcnfg_del_phy_layer(cfg, &caifd->layer) != 0)) { pr_info("Wait for device inuse\n"); msleep(250); i++; } synchronize_rcu(); dev_put(caifd->netdev); free_percpu(caifd->pcpu_refcnt); kfree(caifd); } cfcnfg_remove(cfg); mutex_unlock(&caifdevs->lock); rtnl_unlock(); } static struct pernet_operations caif_net_ops = { .init = caif_init_net, .exit = caif_exit_net, .id = &caif_net_id, .size = sizeof(struct caif_net), }; /* Initialize Caif devices list */ static int __init caif_device_init(void) { int result; result = register_pernet_subsys(&caif_net_ops); if (result) return result; register_netdevice_notifier(&caif_device_notifier); dev_add_pack(&caif_packet_type); return result; } static void __exit caif_device_exit(void) { unregister_netdevice_notifier(&caif_device_notifier); dev_remove_pack(&caif_packet_type); unregister_pernet_subsys(&caif_net_ops); } module_init(caif_device_init); module_exit(caif_device_exit);
616 616 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 // SPDX-License-Identifier: GPL-2.0 /* * tracing clocks * * Copyright (C) 2009 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * * Implements 3 trace clock variants, with differing scalability/precision * tradeoffs: * * - local: CPU-local trace clock * - medium: scalable global clock with some jitter * - global: globally monotonic, serialized clock * * Tracer plugins will chose a default from these clocks. */ #include <linux/spinlock.h> #include <linux/irqflags.h> #include <linux/hardirq.h> #include <linux/module.h> #include <linux/percpu.h> #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/ktime.h> #include <linux/trace_clock.h> /* * trace_clock_local(): the simplest and least coherent tracing clock. * * Useful for tracing that does not cross to other CPUs nor * does it go through idle events. */ u64 notrace trace_clock_local(void) { u64 clock; /* * sched_clock() is an architecture implemented, fast, scalable, * lockless clock. It is not guaranteed to be coherent across * CPUs, nor across CPU idle events. */ preempt_disable_notrace(); clock = sched_clock(); preempt_enable_notrace(); return clock; } EXPORT_SYMBOL_GPL(trace_clock_local); /* * trace_clock(): 'between' trace clock. Not completely serialized, * but not completely incorrect when crossing CPUs either. * * This is based on cpu_clock(), which will allow at most ~1 jiffy of * jitter between CPUs. So it's a pretty scalable clock, but there * can be offsets in the trace data. */ u64 notrace trace_clock(void) { return local_clock(); } EXPORT_SYMBOL_GPL(trace_clock); /* * trace_jiffy_clock(): Simply use jiffies as a clock counter. * Note that this use of jiffies_64 is not completely safe on * 32-bit systems. But the window is tiny, and the effect if * we are affected is that we will have an obviously bogus * timestamp on a trace event - i.e. not life threatening. */ u64 notrace trace_clock_jiffies(void) { return jiffies_64_to_clock_t(jiffies_64 - INITIAL_JIFFIES); } EXPORT_SYMBOL_GPL(trace_clock_jiffies); /* * trace_clock_global(): special globally coherent trace clock * * It has higher overhead than the other trace clocks but is still * an order of magnitude faster than GTOD derived hardware clocks. * * Used by plugins that need globally coherent timestamps. */ /* keep prev_time and lock in the same cacheline. */ static struct { u64 prev_time; arch_spinlock_t lock; } trace_clock_struct ____cacheline_aligned_in_smp = { .lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED, }; u64 notrace trace_clock_global(void) { unsigned long flags; int this_cpu; u64 now, prev_time; raw_local_irq_save(flags); this_cpu = raw_smp_processor_id(); /* * The global clock "guarantees" that the events are ordered * between CPUs. But if two events on two different CPUS call * trace_clock_global at roughly the same time, it really does * not matter which one gets the earlier time. Just make sure * that the same CPU will always show a monotonic clock. * * Use a read memory barrier to get the latest written * time that was recorded. */ smp_rmb(); prev_time = READ_ONCE(trace_clock_struct.prev_time); now = sched_clock_cpu(this_cpu); /* Make sure that now is always greater than or equal to prev_time */ if ((s64)(now - prev_time) < 0) now = prev_time; /* * If in an NMI context then dont risk lockups and simply return * the current time. */ if (unlikely(in_nmi())) goto out; /* Tracing can cause strange recursion, always use a try lock */ if (arch_spin_trylock(&trace_clock_struct.lock)) { /* Reread prev_time in case it was already updated */ prev_time = READ_ONCE(trace_clock_struct.prev_time); if ((s64)(now - prev_time) < 0) now = prev_time; trace_clock_struct.prev_time = now; /* The unlock acts as the wmb for the above rmb */ arch_spin_unlock(&trace_clock_struct.lock); } out: raw_local_irq_restore(flags); return now; } EXPORT_SYMBOL_GPL(trace_clock_global); static atomic64_t trace_counter; /* * trace_clock_counter(): simply an atomic counter. * Use the trace_counter "counter" for cases where you do not care * about timings, but are interested in strict ordering. */ u64 notrace trace_clock_counter(void) { return atomic64_add_return(1, &trace_counter); }
53 53 53 21 21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2008-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2021-2023 Intel Corporation */ #include <linux/export.h> #include <linux/etherdevice.h> #include <net/mac80211.h> #include <asm/unaligned.h> #include "ieee80211_i.h" #include "rate.h" #include "mesh.h" #include "led.h" #include "wme.h" void ieee80211_tx_status_irqsafe(struct ieee80211_hw *hw, struct sk_buff *skb) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); int tmp; skb->pkt_type = IEEE80211_TX_STATUS_MSG; skb_queue_tail(info->flags & IEEE80211_TX_CTL_REQ_TX_STATUS ? &local->skb_queue : &local->skb_queue_unreliable, skb); tmp = skb_queue_len(&local->skb_queue) + skb_queue_len(&local->skb_queue_unreliable); while (tmp > IEEE80211_IRQSAFE_QUEUE_LIMIT && (skb = skb_dequeue(&local->skb_queue_unreliable))) { ieee80211_free_txskb(hw, skb); tmp--; I802_DEBUG_INC(local->tx_status_drop); } tasklet_schedule(&local->tasklet); } EXPORT_SYMBOL(ieee80211_tx_status_irqsafe); static void ieee80211_handle_filtered_frame(struct ieee80211_local *local, struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (void *)skb->data; int ac; if (info->flags & (IEEE80211_TX_CTL_NO_PS_BUFFER | IEEE80211_TX_CTL_AMPDU | IEEE80211_TX_CTL_HW_80211_ENCAP)) { ieee80211_free_txskb(&local->hw, skb); return; } /* * This skb 'survived' a round-trip through the driver, and * hopefully the driver didn't mangle it too badly. However, * we can definitely not rely on the control information * being correct. Clear it so we don't get junk there, and * indicate that it needs new processing, but must not be * modified/encrypted again. */ memset(&info->control, 0, sizeof(info->control)); info->control.jiffies = jiffies; info->control.vif = &sta->sdata->vif; info->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING; info->flags |= IEEE80211_TX_INTFL_RETRANSMISSION; info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS; sta->deflink.status_stats.filtered++; /* * Clear more-data bit on filtered frames, it might be set * but later frames might time out so it might have to be * clear again ... It's all rather unlikely (this frame * should time out first, right?) but let's not confuse * peers unnecessarily. */ if (hdr->frame_control & cpu_to_le16(IEEE80211_FCTL_MOREDATA)) hdr->frame_control &= ~cpu_to_le16(IEEE80211_FCTL_MOREDATA); if (ieee80211_is_data_qos(hdr->frame_control)) { u8 *p = ieee80211_get_qos_ctl(hdr); int tid = *p & IEEE80211_QOS_CTL_TID_MASK; /* * Clear EOSP if set, this could happen e.g. * if an absence period (us being a P2P GO) * shortens the SP. */ if (*p & IEEE80211_QOS_CTL_EOSP) *p &= ~IEEE80211_QOS_CTL_EOSP; ac = ieee80211_ac_from_tid(tid); } else { ac = IEEE80211_AC_BE; } /* * Clear the TX filter mask for this STA when sending the next * packet. If the STA went to power save mode, this will happen * when it wakes up for the next time. */ set_sta_flag(sta, WLAN_STA_CLEAR_PS_FILT); ieee80211_clear_fast_xmit(sta); /* * This code races in the following way: * * (1) STA sends frame indicating it will go to sleep and does so * (2) hardware/firmware adds STA to filter list, passes frame up * (3) hardware/firmware processes TX fifo and suppresses a frame * (4) we get TX status before having processed the frame and * knowing that the STA has gone to sleep. * * This is actually quite unlikely even when both those events are * processed from interrupts coming in quickly after one another or * even at the same time because we queue both TX status events and * RX frames to be processed by a tasklet and process them in the * same order that they were received or TX status last. Hence, there * is no race as long as the frame RX is processed before the next TX * status, which drivers can ensure, see below. * * Note that this can only happen if the hardware or firmware can * actually add STAs to the filter list, if this is done by the * driver in response to set_tim() (which will only reduce the race * this whole filtering tries to solve, not completely solve it) * this situation cannot happen. * * To completely solve this race drivers need to make sure that they * (a) don't mix the irq-safe/not irq-safe TX status/RX processing * functions and * (b) always process RX events before TX status events if ordering * can be unknown, for example with different interrupt status * bits. * (c) if PS mode transitions are manual (i.e. the flag * %IEEE80211_HW_AP_LINK_PS is set), always process PS state * changes before calling TX status events if ordering can be * unknown. */ if (test_sta_flag(sta, WLAN_STA_PS_STA) && skb_queue_len(&sta->tx_filtered[ac]) < STA_MAX_TX_BUFFER) { skb_queue_tail(&sta->tx_filtered[ac], skb); sta_info_recalc_tim(sta); if (!timer_pending(&local->sta_cleanup)) mod_timer(&local->sta_cleanup, round_jiffies(jiffies + STA_INFO_CLEANUP_INTERVAL)); return; } if (!test_sta_flag(sta, WLAN_STA_PS_STA) && !(info->flags & IEEE80211_TX_INTFL_RETRIED)) { /* Software retry the packet once */ info->flags |= IEEE80211_TX_INTFL_RETRIED; ieee80211_add_pending_skb(local, skb); return; } ps_dbg_ratelimited(sta->sdata, "dropped TX filtered frame, queue_len=%d PS=%d @%lu\n", skb_queue_len(&sta->tx_filtered[ac]), !!test_sta_flag(sta, WLAN_STA_PS_STA), jiffies); ieee80211_free_txskb(&local->hw, skb); } static void ieee80211_check_pending_bar(struct sta_info *sta, u8 *addr, u8 tid) { struct tid_ampdu_tx *tid_tx; tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]); if (!tid_tx || !tid_tx->bar_pending) return; tid_tx->bar_pending = false; ieee80211_send_bar(&sta->sdata->vif, addr, tid, tid_tx->failed_bar_ssn); } static void ieee80211_frame_acked(struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_mgmt *mgmt = (void *) skb->data; if (ieee80211_is_data_qos(mgmt->frame_control)) { struct ieee80211_hdr *hdr = (void *) skb->data; u8 *qc = ieee80211_get_qos_ctl(hdr); u16 tid = qc[0] & 0xf; ieee80211_check_pending_bar(sta, hdr->addr1, tid); } } static void ieee80211_set_bar_pending(struct sta_info *sta, u8 tid, u16 ssn) { struct tid_ampdu_tx *tid_tx; tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]); if (!tid_tx) return; tid_tx->failed_bar_ssn = ssn; tid_tx->bar_pending = true; } static int ieee80211_tx_radiotap_len(struct ieee80211_tx_info *info, struct ieee80211_tx_status *status) { struct ieee80211_rate_status *status_rate = NULL; int len = sizeof(struct ieee80211_radiotap_header); if (status && status->n_rates) status_rate = &status->rates[status->n_rates - 1]; /* IEEE80211_RADIOTAP_RATE rate */ if (status_rate && !(status_rate->rate_idx.flags & (RATE_INFO_FLAGS_MCS | RATE_INFO_FLAGS_DMG | RATE_INFO_FLAGS_EDMG | RATE_INFO_FLAGS_VHT_MCS | RATE_INFO_FLAGS_HE_MCS))) len += 2; else if (info->status.rates[0].idx >= 0 && !(info->status.rates[0].flags & (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS))) len += 2; /* IEEE80211_RADIOTAP_TX_FLAGS */ len += 2; /* IEEE80211_RADIOTAP_DATA_RETRIES */ len += 1; /* IEEE80211_RADIOTAP_MCS * IEEE80211_RADIOTAP_VHT */ if (status_rate) { if (status_rate->rate_idx.flags & RATE_INFO_FLAGS_MCS) len += 3; else if (status_rate->rate_idx.flags & RATE_INFO_FLAGS_VHT_MCS) len = ALIGN(len, 2) + 12; else if (status_rate->rate_idx.flags & RATE_INFO_FLAGS_HE_MCS) len = ALIGN(len, 2) + 12; } else if (info->status.rates[0].idx >= 0) { if (info->status.rates[0].flags & IEEE80211_TX_RC_MCS) len += 3; else if (info->status.rates[0].flags & IEEE80211_TX_RC_VHT_MCS) len = ALIGN(len, 2) + 12; } return len; } static void ieee80211_add_tx_radiotap_header(struct ieee80211_local *local, struct sk_buff *skb, int retry_count, int rtap_len, struct ieee80211_tx_status *status) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_radiotap_header *rthdr; struct ieee80211_rate_status *status_rate = NULL; unsigned char *pos; u16 legacy_rate = 0; u16 txflags; if (status && status->n_rates) status_rate = &status->rates[status->n_rates - 1]; rthdr = skb_push(skb, rtap_len); memset(rthdr, 0, rtap_len); rthdr->it_len = cpu_to_le16(rtap_len); rthdr->it_present = cpu_to_le32(BIT(IEEE80211_RADIOTAP_TX_FLAGS) | BIT(IEEE80211_RADIOTAP_DATA_RETRIES)); pos = (unsigned char *)(rthdr + 1); /* * XXX: Once radiotap gets the bitmap reset thing the vendor * extensions proposal contains, we can actually report * the whole set of tries we did. */ /* IEEE80211_RADIOTAP_RATE */ if (status_rate) { if (!(status_rate->rate_idx.flags & (RATE_INFO_FLAGS_MCS | RATE_INFO_FLAGS_DMG | RATE_INFO_FLAGS_EDMG | RATE_INFO_FLAGS_VHT_MCS | RATE_INFO_FLAGS_HE_MCS))) legacy_rate = status_rate->rate_idx.legacy; } else if (info->status.rates[0].idx >= 0 && !(info->status.rates[0].flags & (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS))) { struct ieee80211_supported_band *sband; sband = local->hw.wiphy->bands[info->band]; legacy_rate = sband->bitrates[info->status.rates[0].idx].bitrate; } if (legacy_rate) { rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_RATE)); *pos = DIV_ROUND_UP(legacy_rate, 5); /* padding for tx flags */ pos += 2; } /* IEEE80211_RADIOTAP_TX_FLAGS */ txflags = 0; if (!(info->flags & IEEE80211_TX_STAT_ACK) && !is_multicast_ether_addr(hdr->addr1)) txflags |= IEEE80211_RADIOTAP_F_TX_FAIL; if (info->status.rates[0].flags & IEEE80211_TX_RC_USE_CTS_PROTECT) txflags |= IEEE80211_RADIOTAP_F_TX_CTS; if (info->status.rates[0].flags & IEEE80211_TX_RC_USE_RTS_CTS) txflags |= IEEE80211_RADIOTAP_F_TX_RTS; put_unaligned_le16(txflags, pos); pos += 2; /* IEEE80211_RADIOTAP_DATA_RETRIES */ /* for now report the total retry_count */ *pos = retry_count; pos++; if (status_rate && (status_rate->rate_idx.flags & RATE_INFO_FLAGS_MCS)) { rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_MCS)); pos[0] = IEEE80211_RADIOTAP_MCS_HAVE_MCS | IEEE80211_RADIOTAP_MCS_HAVE_GI | IEEE80211_RADIOTAP_MCS_HAVE_BW; if (status_rate->rate_idx.flags & RATE_INFO_FLAGS_SHORT_GI) pos[1] |= IEEE80211_RADIOTAP_MCS_SGI; if (status_rate->rate_idx.bw == RATE_INFO_BW_40) pos[1] |= IEEE80211_RADIOTAP_MCS_BW_40; pos[2] = status_rate->rate_idx.mcs; pos += 3; } else if (status_rate && (status_rate->rate_idx.flags & RATE_INFO_FLAGS_VHT_MCS)) { u16 known = local->hw.radiotap_vht_details & (IEEE80211_RADIOTAP_VHT_KNOWN_GI | IEEE80211_RADIOTAP_VHT_KNOWN_BANDWIDTH); rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_VHT)); /* required alignment from rthdr */ pos = (u8 *)rthdr + ALIGN(pos - (u8 *)rthdr, 2); /* u16 known - IEEE80211_RADIOTAP_VHT_KNOWN_* */ put_unaligned_le16(known, pos); pos += 2; /* u8 flags - IEEE80211_RADIOTAP_VHT_FLAG_* */ if (status_rate->rate_idx.flags & RATE_INFO_FLAGS_SHORT_GI) *pos |= IEEE80211_RADIOTAP_VHT_FLAG_SGI; pos++; /* u8 bandwidth */ switch (status_rate->rate_idx.bw) { case RATE_INFO_BW_160: *pos = 11; break; case RATE_INFO_BW_80: *pos = 4; break; case RATE_INFO_BW_40: *pos = 1; break; default: *pos = 0; break; } pos++; /* u8 mcs_nss[4] */ *pos = (status_rate->rate_idx.mcs << 4) | status_rate->rate_idx.nss; pos += 4; /* u8 coding */ pos++; /* u8 group_id */ pos++; /* u16 partial_aid */ pos += 2; } else if (status_rate && (status_rate->rate_idx.flags & RATE_INFO_FLAGS_HE_MCS)) { struct ieee80211_radiotap_he *he; rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_HE)); /* required alignment from rthdr */ pos = (u8 *)rthdr + ALIGN(pos - (u8 *)rthdr, 2); he = (struct ieee80211_radiotap_he *)pos; he->data1 = cpu_to_le16(IEEE80211_RADIOTAP_HE_DATA1_FORMAT_SU | IEEE80211_RADIOTAP_HE_DATA1_DATA_MCS_KNOWN | IEEE80211_RADIOTAP_HE_DATA1_DATA_DCM_KNOWN | IEEE80211_RADIOTAP_HE_DATA1_BW_RU_ALLOC_KNOWN); he->data2 = cpu_to_le16(IEEE80211_RADIOTAP_HE_DATA2_GI_KNOWN); #define HE_PREP(f, val) le16_encode_bits(val, IEEE80211_RADIOTAP_HE_##f) he->data6 |= HE_PREP(DATA6_NSTS, status_rate->rate_idx.nss); #define CHECK_GI(s) \ BUILD_BUG_ON(IEEE80211_RADIOTAP_HE_DATA5_GI_##s != \ (int)NL80211_RATE_INFO_HE_GI_##s) CHECK_GI(0_8); CHECK_GI(1_6); CHECK_GI(3_2); he->data3 |= HE_PREP(DATA3_DATA_MCS, status_rate->rate_idx.mcs); he->data3 |= HE_PREP(DATA3_DATA_DCM, status_rate->rate_idx.he_dcm); he->data5 |= HE_PREP(DATA5_GI, status_rate->rate_idx.he_gi); switch (status_rate->rate_idx.bw) { case RATE_INFO_BW_20: he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_20MHZ); break; case RATE_INFO_BW_40: he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_40MHZ); break; case RATE_INFO_BW_80: he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_80MHZ); break; case RATE_INFO_BW_160: he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_160MHZ); break; case RATE_INFO_BW_HE_RU: #define CHECK_RU_ALLOC(s) \ BUILD_BUG_ON(IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_##s##T != \ NL80211_RATE_INFO_HE_RU_ALLOC_##s + 4) CHECK_RU_ALLOC(26); CHECK_RU_ALLOC(52); CHECK_RU_ALLOC(106); CHECK_RU_ALLOC(242); CHECK_RU_ALLOC(484); CHECK_RU_ALLOC(996); CHECK_RU_ALLOC(2x996); he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, status_rate->rate_idx.he_ru_alloc + 4); break; default: WARN_ONCE(1, "Invalid SU BW %d\n", status_rate->rate_idx.bw); } pos += sizeof(struct ieee80211_radiotap_he); } if (status_rate || info->status.rates[0].idx < 0) return; /* IEEE80211_RADIOTAP_MCS * IEEE80211_RADIOTAP_VHT */ if (info->status.rates[0].flags & IEEE80211_TX_RC_MCS) { rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_MCS)); pos[0] = IEEE80211_RADIOTAP_MCS_HAVE_MCS | IEEE80211_RADIOTAP_MCS_HAVE_GI | IEEE80211_RADIOTAP_MCS_HAVE_BW; if (info->status.rates[0].flags & IEEE80211_TX_RC_SHORT_GI) pos[1] |= IEEE80211_RADIOTAP_MCS_SGI; if (info->status.rates[0].flags & IEEE80211_TX_RC_40_MHZ_WIDTH) pos[1] |= IEEE80211_RADIOTAP_MCS_BW_40; if (info->status.rates[0].flags & IEEE80211_TX_RC_GREEN_FIELD) pos[1] |= IEEE80211_RADIOTAP_MCS_FMT_GF; pos[2] = info->status.rates[0].idx; pos += 3; } else if (info->status.rates[0].flags & IEEE80211_TX_RC_VHT_MCS) { u16 known = local->hw.radiotap_vht_details & (IEEE80211_RADIOTAP_VHT_KNOWN_GI | IEEE80211_RADIOTAP_VHT_KNOWN_BANDWIDTH); rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_VHT)); /* required alignment from rthdr */ pos = (u8 *)rthdr + ALIGN(pos - (u8 *)rthdr, 2); /* u16 known - IEEE80211_RADIOTAP_VHT_KNOWN_* */ put_unaligned_le16(known, pos); pos += 2; /* u8 flags - IEEE80211_RADIOTAP_VHT_FLAG_* */ if (info->status.rates[0].flags & IEEE80211_TX_RC_SHORT_GI) *pos |= IEEE80211_RADIOTAP_VHT_FLAG_SGI; pos++; /* u8 bandwidth */ if (info->status.rates[0].flags & IEEE80211_TX_RC_40_MHZ_WIDTH) *pos = 1; else if (info->status.rates[0].flags & IEEE80211_TX_RC_80_MHZ_WIDTH) *pos = 4; else if (info->status.rates[0].flags & IEEE80211_TX_RC_160_MHZ_WIDTH) *pos = 11; else /* IEEE80211_TX_RC_{20_MHZ_WIDTH,FIXME:DUP_DATA} */ *pos = 0; pos++; /* u8 mcs_nss[4] */ *pos = (ieee80211_rate_get_vht_mcs(&info->status.rates[0]) << 4) | ieee80211_rate_get_vht_nss(&info->status.rates[0]); pos += 4; /* u8 coding */ pos++; /* u8 group_id */ pos++; /* u16 partial_aid */ pos += 2; } } /* * Handles the tx for TDLS teardown frames. * If the frame wasn't ACKed by the peer - it will be re-sent through the AP */ static void ieee80211_tdls_td_tx_handle(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u32 flags) { struct sk_buff *teardown_skb; struct sk_buff *orig_teardown_skb; bool is_teardown = false; /* Get the teardown data we need and free the lock */ spin_lock(&sdata->u.mgd.teardown_lock); teardown_skb = sdata->u.mgd.teardown_skb; orig_teardown_skb = sdata->u.mgd.orig_teardown_skb; if ((skb == orig_teardown_skb) && teardown_skb) { sdata->u.mgd.teardown_skb = NULL; sdata->u.mgd.orig_teardown_skb = NULL; is_teardown = true; } spin_unlock(&sdata->u.mgd.teardown_lock); if (is_teardown) { /* This mechanism relies on being able to get ACKs */ WARN_ON(!ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)); /* Check if peer has ACKed */ if (flags & IEEE80211_TX_STAT_ACK) { dev_kfree_skb_any(teardown_skb); } else { tdls_dbg(sdata, "TDLS Resending teardown through AP\n"); ieee80211_subif_start_xmit(teardown_skb, skb->dev); } } } static struct ieee80211_sub_if_data * ieee80211_sdata_from_skb(struct ieee80211_local *local, struct sk_buff *skb) { struct ieee80211_sub_if_data *sdata; if (skb->dev) { list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (!sdata->dev) continue; if (skb->dev == sdata->dev) return sdata; } return NULL; } return rcu_dereference(local->p2p_sdata); } static void ieee80211_report_ack_skb(struct ieee80211_local *local, struct sk_buff *orig_skb, bool acked, bool dropped, ktime_t ack_hwtstamp) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(orig_skb); struct sk_buff *skb; unsigned long flags; spin_lock_irqsave(&local->ack_status_lock, flags); skb = idr_remove(&local->ack_status_frames, info->status_data); spin_unlock_irqrestore(&local->ack_status_lock, flags); if (!skb) return; if (info->flags & IEEE80211_TX_INTFL_NL80211_FRAME_TX) { u64 cookie = IEEE80211_SKB_CB(skb)->ack.cookie; struct ieee80211_sub_if_data *sdata; struct ieee80211_hdr *hdr = (void *)skb->data; bool is_valid_ack_signal = !!(info->status.flags & IEEE80211_TX_STATUS_ACK_SIGNAL_VALID); struct cfg80211_tx_status status = { .cookie = cookie, .buf = skb->data, .len = skb->len, .ack = acked, }; if (ieee80211_is_timing_measurement(orig_skb) || ieee80211_is_ftm(orig_skb)) { status.tx_tstamp = ktime_to_ns(skb_hwtstamps(orig_skb)->hwtstamp); status.ack_tstamp = ktime_to_ns(ack_hwtstamp); } rcu_read_lock(); sdata = ieee80211_sdata_from_skb(local, skb); if (sdata) { if (skb->protocol == sdata->control_port_protocol || skb->protocol == cpu_to_be16(ETH_P_PREAUTH)) cfg80211_control_port_tx_status(&sdata->wdev, cookie, skb->data, skb->len, acked, GFP_ATOMIC); else if (ieee80211_is_any_nullfunc(hdr->frame_control)) cfg80211_probe_status(sdata->dev, hdr->addr1, cookie, acked, info->status.ack_signal, is_valid_ack_signal, GFP_ATOMIC); else if (ieee80211_is_mgmt(hdr->frame_control)) cfg80211_mgmt_tx_status_ext(&sdata->wdev, &status, GFP_ATOMIC); else pr_warn("Unknown status report in ack skb\n"); } rcu_read_unlock(); dev_kfree_skb_any(skb); } else if (dropped) { dev_kfree_skb_any(skb); } else { /* consumes skb */ skb_complete_wifi_ack(skb, acked); } } static void ieee80211_handle_smps_status(struct ieee80211_sub_if_data *sdata, bool acked, u16 status_data) { u16 sub_data = u16_get_bits(status_data, IEEE80211_STATUS_SUBDATA_MASK); enum ieee80211_smps_mode smps_mode = sub_data & 3; int link_id = (sub_data >> 2); struct ieee80211_link_data *link; if (!sdata || !ieee80211_sdata_running(sdata)) return; if (!acked) return; if (sdata->vif.type != NL80211_IFTYPE_STATION) return; if (WARN(link_id >= ARRAY_SIZE(sdata->link), "bad SMPS status link: %d\n", link_id)) return; link = rcu_dereference(sdata->link[link_id]); if (!link) return; /* * This update looks racy, but isn't, the only other place * updating this variable is in managed mode before assoc, * and we have to be associated to have a status from the * action frame TX, since we cannot send it while we're not * associated yet. */ link->smps_mode = smps_mode; wiphy_work_queue(sdata->local->hw.wiphy, &link->u.mgd.recalc_smps); } static void ieee80211_report_used_skb(struct ieee80211_local *local, struct sk_buff *skb, bool dropped, ktime_t ack_hwtstamp) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); u16 tx_time_est = ieee80211_info_get_tx_time_est(info); struct ieee80211_hdr *hdr = (void *)skb->data; bool acked = info->flags & IEEE80211_TX_STAT_ACK; if (dropped) acked = false; if (tx_time_est) { struct sta_info *sta; rcu_read_lock(); sta = sta_info_get_by_addrs(local, hdr->addr1, hdr->addr2); ieee80211_sta_update_pending_airtime(local, sta, skb_get_queue_mapping(skb), tx_time_est, true); rcu_read_unlock(); } if (info->flags & IEEE80211_TX_INTFL_MLME_CONN_TX) { struct ieee80211_sub_if_data *sdata; rcu_read_lock(); sdata = ieee80211_sdata_from_skb(local, skb); if (!sdata) { skb->dev = NULL; } else if (!dropped) { /* Check to see if packet is a TDLS teardown packet */ if (ieee80211_is_data(hdr->frame_control) && (ieee80211_get_tdls_action(skb) == WLAN_TDLS_TEARDOWN)) { ieee80211_tdls_td_tx_handle(local, sdata, skb, info->flags); } else if (ieee80211_s1g_is_twt_setup(skb)) { if (!acked) { struct sk_buff *qskb; qskb = skb_clone(skb, GFP_ATOMIC); if (qskb) { skb_queue_tail(&sdata->status_queue, qskb); wiphy_work_queue(local->hw.wiphy, &sdata->work); } } } else { ieee80211_mgd_conn_tx_status(sdata, hdr->frame_control, acked); } } rcu_read_unlock(); } else if (info->status_data_idr) { ieee80211_report_ack_skb(local, skb, acked, dropped, ack_hwtstamp); } else if (info->status_data) { struct ieee80211_sub_if_data *sdata; rcu_read_lock(); sdata = ieee80211_sdata_from_skb(local, skb); switch (u16_get_bits(info->status_data, IEEE80211_STATUS_TYPE_MASK)) { case IEEE80211_STATUS_TYPE_SMPS: ieee80211_handle_smps_status(sdata, acked, info->status_data); break; } rcu_read_unlock(); } if (!dropped && skb->destructor) { skb->wifi_acked_valid = 1; skb->wifi_acked = acked; } ieee80211_led_tx(local); if (skb_has_frag_list(skb)) { kfree_skb_list(skb_shinfo(skb)->frag_list); skb_shinfo(skb)->frag_list = NULL; } } /* * Use a static threshold for now, best value to be determined * by testing ... * Should it depend on: * - on # of retransmissions * - current throughput (higher value for higher tpt)? */ #define STA_LOST_PKT_THRESHOLD 50 #define STA_LOST_PKT_TIME HZ /* 1 sec since last ACK */ #define STA_LOST_TDLS_PKT_TIME (10*HZ) /* 10secs since last ACK */ static void ieee80211_lost_packet(struct sta_info *sta, struct ieee80211_tx_info *info) { unsigned long pkt_time = STA_LOST_PKT_TIME; unsigned int pkt_thr = STA_LOST_PKT_THRESHOLD; /* If driver relies on its own algorithm for station kickout, skip * mac80211 packet loss mechanism. */ if (ieee80211_hw_check(&sta->local->hw, REPORTS_LOW_ACK)) return; /* This packet was aggregated but doesn't carry status info */ if ((info->flags & IEEE80211_TX_CTL_AMPDU) && !(info->flags & IEEE80211_TX_STAT_AMPDU)) return; sta->deflink.status_stats.lost_packets++; if (sta->sta.tdls) { pkt_time = STA_LOST_TDLS_PKT_TIME; pkt_thr = STA_LOST_PKT_THRESHOLD; } /* * If we're in TDLS mode, make sure that all STA_LOST_PKT_THRESHOLD * of the last packets were lost, and that no ACK was received in the * last STA_LOST_TDLS_PKT_TIME ms, before triggering the CQM packet-loss * mechanism. * For non-TDLS, use STA_LOST_PKT_THRESHOLD and STA_LOST_PKT_TIME */ if (sta->deflink.status_stats.lost_packets < pkt_thr || !time_after(jiffies, sta->deflink.status_stats.last_pkt_time + pkt_time)) return; cfg80211_cqm_pktloss_notify(sta->sdata->dev, sta->sta.addr, sta->deflink.status_stats.lost_packets, GFP_ATOMIC); sta->deflink.status_stats.lost_packets = 0; } static int ieee80211_tx_get_rates(struct ieee80211_hw *hw, struct ieee80211_tx_info *info, int *retry_count) { int count = -1; int i; for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) { if ((info->flags & IEEE80211_TX_CTL_AMPDU) && !(info->flags & IEEE80211_TX_STAT_AMPDU)) { /* just the first aggr frame carry status info */ info->status.rates[i].idx = -1; info->status.rates[i].count = 0; break; } else if (info->status.rates[i].idx < 0) { break; } else if (i >= hw->max_report_rates) { /* the HW cannot have attempted that rate */ info->status.rates[i].idx = -1; info->status.rates[i].count = 0; break; } count += info->status.rates[i].count; } if (count < 0) count = 0; *retry_count = count; return i - 1; } void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb, int retry_count, bool send_to_cooked, struct ieee80211_tx_status *status) { struct sk_buff *skb2; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_sub_if_data *sdata; struct net_device *prev_dev = NULL; int rtap_len; /* send frame to monitor interfaces now */ rtap_len = ieee80211_tx_radiotap_len(info, status); if (WARN_ON_ONCE(skb_headroom(skb) < rtap_len)) { pr_err("ieee80211_tx_status: headroom too small\n"); dev_kfree_skb(skb); return; } ieee80211_add_tx_radiotap_header(local, skb, retry_count, rtap_len, status); /* XXX: is this sufficient for BPF? */ skb_reset_mac_header(skb); skb->ip_summed = CHECKSUM_UNNECESSARY; skb->pkt_type = PACKET_OTHERHOST; skb->protocol = htons(ETH_P_802_2); memset(skb->cb, 0, sizeof(skb->cb)); rcu_read_lock(); list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (sdata->vif.type == NL80211_IFTYPE_MONITOR) { if (!ieee80211_sdata_running(sdata)) continue; if ((sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) && !send_to_cooked) continue; if (prev_dev) { skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) { skb2->dev = prev_dev; netif_rx(skb2); } } prev_dev = sdata->dev; } } if (prev_dev) { skb->dev = prev_dev; netif_rx(skb); skb = NULL; } rcu_read_unlock(); dev_kfree_skb(skb); } static void __ieee80211_tx_status(struct ieee80211_hw *hw, struct ieee80211_tx_status *status, int rates_idx, int retry_count) { struct sk_buff *skb = status->skb; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_tx_info *info = status->info; struct sta_info *sta; __le16 fc; bool send_to_cooked; bool acked; bool noack_success; struct ieee80211_bar *bar; int tid = IEEE80211_NUM_TIDS; fc = hdr->frame_control; if (status->sta) { sta = container_of(status->sta, struct sta_info, sta); if (info->flags & IEEE80211_TX_STATUS_EOSP) clear_sta_flag(sta, WLAN_STA_SP); acked = !!(info->flags & IEEE80211_TX_STAT_ACK); noack_success = !!(info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED); /* mesh Peer Service Period support */ if (ieee80211_vif_is_mesh(&sta->sdata->vif) && ieee80211_is_data_qos(fc)) ieee80211_mpsp_trigger_process( ieee80211_get_qos_ctl(hdr), sta, true, acked); if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL) && (ieee80211_is_data(hdr->frame_control)) && (rates_idx != -1)) sta->deflink.tx_stats.last_rate = info->status.rates[rates_idx]; if ((info->flags & IEEE80211_TX_STAT_AMPDU_NO_BACK) && (ieee80211_is_data_qos(fc))) { u16 ssn; u8 *qc; qc = ieee80211_get_qos_ctl(hdr); tid = qc[0] & 0xf; ssn = ((le16_to_cpu(hdr->seq_ctrl) + 0x10) & IEEE80211_SCTL_SEQ); ieee80211_send_bar(&sta->sdata->vif, hdr->addr1, tid, ssn); } else if (ieee80211_is_data_qos(fc)) { u8 *qc = ieee80211_get_qos_ctl(hdr); tid = qc[0] & 0xf; } if (!acked && ieee80211_is_back_req(fc)) { u16 control; /* * BAR failed, store the last SSN and retry sending * the BAR when the next unicast transmission on the * same TID succeeds. */ bar = (struct ieee80211_bar *) skb->data; control = le16_to_cpu(bar->control); if (!(control & IEEE80211_BAR_CTRL_MULTI_TID)) { u16 ssn = le16_to_cpu(bar->start_seq_num); tid = (control & IEEE80211_BAR_CTRL_TID_INFO_MASK) >> IEEE80211_BAR_CTRL_TID_INFO_SHIFT; ieee80211_set_bar_pending(sta, tid, ssn); } } if (info->flags & IEEE80211_TX_STAT_TX_FILTERED) { ieee80211_handle_filtered_frame(local, sta, skb); return; } else if (ieee80211_is_data_present(fc)) { if (!acked && !noack_success) sta->deflink.status_stats.msdu_failed[tid]++; sta->deflink.status_stats.msdu_retries[tid] += retry_count; } if (!(info->flags & IEEE80211_TX_CTL_INJECTED) && acked) ieee80211_frame_acked(sta, skb); } /* SNMP counters * Fragments are passed to low-level drivers as separate skbs, so these * are actually fragments, not frames. Update frame counters only for * the first fragment of the frame. */ if ((info->flags & IEEE80211_TX_STAT_ACK) || (info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED)) { if (ieee80211_is_first_frag(hdr->seq_ctrl)) { I802_DEBUG_INC(local->dot11TransmittedFrameCount); if (is_multicast_ether_addr(ieee80211_get_DA(hdr))) I802_DEBUG_INC(local->dot11MulticastTransmittedFrameCount); if (retry_count > 0) I802_DEBUG_INC(local->dot11RetryCount); if (retry_count > 1) I802_DEBUG_INC(local->dot11MultipleRetryCount); } /* This counter shall be incremented for an acknowledged MPDU * with an individual address in the address 1 field or an MPDU * with a multicast address in the address 1 field of type Data * or Management. */ if (!is_multicast_ether_addr(hdr->addr1) || ieee80211_is_data(fc) || ieee80211_is_mgmt(fc)) I802_DEBUG_INC(local->dot11TransmittedFragmentCount); } else { if (ieee80211_is_first_frag(hdr->seq_ctrl)) I802_DEBUG_INC(local->dot11FailedCount); } if (ieee80211_is_any_nullfunc(fc) && ieee80211_has_pm(fc) && ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS) && !(info->flags & IEEE80211_TX_CTL_INJECTED) && local->ps_sdata && !(local->scanning)) { if (info->flags & IEEE80211_TX_STAT_ACK) local->ps_sdata->u.mgd.flags |= IEEE80211_STA_NULLFUNC_ACKED; mod_timer(&local->dynamic_ps_timer, jiffies + msecs_to_jiffies(10)); } ieee80211_report_used_skb(local, skb, false, status->ack_hwtstamp); /* this was a transmitted frame, but now we want to reuse it */ skb_orphan(skb); /* Need to make a copy before skb->cb gets cleared */ send_to_cooked = !!(info->flags & IEEE80211_TX_CTL_INJECTED) || !(ieee80211_is_data(fc)); /* * This is a bit racy but we can avoid a lot of work * with this test... */ if (!local->monitors && (!send_to_cooked || !local->cooked_mntrs)) { if (status->free_list) list_add_tail(&skb->list, status->free_list); else dev_kfree_skb(skb); return; } /* send to monitor interfaces */ ieee80211_tx_monitor(local, skb, retry_count, send_to_cooked, status); } void ieee80211_tx_status_skb(struct ieee80211_hw *hw, struct sk_buff *skb) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_tx_status status = { .skb = skb, .info = IEEE80211_SKB_CB(skb), }; struct sta_info *sta; rcu_read_lock(); sta = sta_info_get_by_addrs(local, hdr->addr1, hdr->addr2); if (sta) status.sta = &sta->sta; ieee80211_tx_status_ext(hw, &status); rcu_read_unlock(); } EXPORT_SYMBOL(ieee80211_tx_status_skb); void ieee80211_tx_status_ext(struct ieee80211_hw *hw, struct ieee80211_tx_status *status) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_tx_info *info = status->info; struct ieee80211_sta *pubsta = status->sta; struct sk_buff *skb = status->skb; struct sta_info *sta = NULL; int rates_idx, retry_count; bool acked, noack_success, ack_signal_valid; u16 tx_time_est; if (pubsta) { sta = container_of(pubsta, struct sta_info, sta); if (status->n_rates) sta->deflink.tx_stats.last_rate_info = status->rates[status->n_rates - 1].rate_idx; } if (skb && (tx_time_est = ieee80211_info_get_tx_time_est(IEEE80211_SKB_CB(skb))) > 0) { /* Do this here to avoid the expensive lookup of the sta * in ieee80211_report_used_skb(). */ ieee80211_sta_update_pending_airtime(local, sta, skb_get_queue_mapping(skb), tx_time_est, true); ieee80211_info_set_tx_time_est(IEEE80211_SKB_CB(skb), 0); } if (!status->info) goto free; rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count); acked = !!(info->flags & IEEE80211_TX_STAT_ACK); noack_success = !!(info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED); ack_signal_valid = !!(info->status.flags & IEEE80211_TX_STATUS_ACK_SIGNAL_VALID); if (pubsta) { struct ieee80211_sub_if_data *sdata = sta->sdata; if (!acked && !noack_success) sta->deflink.status_stats.retry_failed++; sta->deflink.status_stats.retry_count += retry_count; if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { if (sdata->vif.type == NL80211_IFTYPE_STATION && skb && !(info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP)) ieee80211_sta_tx_notify(sdata, (void *) skb->data, acked, info->status.tx_time); if (acked) { sta->deflink.status_stats.last_ack = jiffies; if (sta->deflink.status_stats.lost_packets) sta->deflink.status_stats.lost_packets = 0; /* Track when last packet was ACKed */ sta->deflink.status_stats.last_pkt_time = jiffies; /* Reset connection monitor */ if (sdata->vif.type == NL80211_IFTYPE_STATION && unlikely(sdata->u.mgd.probe_send_count > 0)) sdata->u.mgd.probe_send_count = 0; if (ack_signal_valid) { sta->deflink.status_stats.last_ack_signal = (s8)info->status.ack_signal; sta->deflink.status_stats.ack_signal_filled = true; ewma_avg_signal_add(&sta->deflink.status_stats.avg_ack_signal, -info->status.ack_signal); } } else if (test_sta_flag(sta, WLAN_STA_PS_STA)) { /* * The STA is in power save mode, so assume * that this TX packet failed because of that. */ if (skb) ieee80211_handle_filtered_frame(local, sta, skb); return; } else if (noack_success) { /* nothing to do here, do not account as lost */ } else { ieee80211_lost_packet(sta, info); } } rate_control_tx_status(local, status); if (ieee80211_vif_is_mesh(&sta->sdata->vif)) ieee80211s_update_metric(local, sta, status); } if (skb && !(info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP)) return __ieee80211_tx_status(hw, status, rates_idx, retry_count); if (acked || noack_success) { I802_DEBUG_INC(local->dot11TransmittedFrameCount); if (!pubsta) I802_DEBUG_INC(local->dot11MulticastTransmittedFrameCount); if (retry_count > 0) I802_DEBUG_INC(local->dot11RetryCount); if (retry_count > 1) I802_DEBUG_INC(local->dot11MultipleRetryCount); } else { I802_DEBUG_INC(local->dot11FailedCount); } free: if (!skb) return; ieee80211_report_used_skb(local, skb, false, status->ack_hwtstamp); if (status->free_list) list_add_tail(&skb->list, status->free_list); else dev_kfree_skb(skb); } EXPORT_SYMBOL(ieee80211_tx_status_ext); void ieee80211_tx_rate_update(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, struct ieee80211_tx_info *info) { struct ieee80211_local *local = hw_to_local(hw); struct sta_info *sta = container_of(pubsta, struct sta_info, sta); struct ieee80211_tx_status status = { .info = info, .sta = pubsta, }; rate_control_tx_status(local, &status); if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) sta->deflink.tx_stats.last_rate = info->status.rates[0]; } EXPORT_SYMBOL(ieee80211_tx_rate_update); void ieee80211_report_low_ack(struct ieee80211_sta *pubsta, u32 num_packets) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); cfg80211_cqm_pktloss_notify(sta->sdata->dev, sta->sta.addr, num_packets, GFP_ATOMIC); } EXPORT_SYMBOL(ieee80211_report_low_ack); void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb) { struct ieee80211_local *local = hw_to_local(hw); ktime_t kt = ktime_set(0, 0); ieee80211_report_used_skb(local, skb, true, kt); dev_kfree_skb_any(skb); } EXPORT_SYMBOL(ieee80211_free_txskb); void ieee80211_purge_tx_queue(struct ieee80211_hw *hw, struct sk_buff_head *skbs) { struct sk_buff *skb; while ((skb = __skb_dequeue(skbs))) ieee80211_free_txskb(hw, skb); }
11 11 8 3 6 6 6 6 5 6 2 3 3 3 1 1 1 1 1 3 3 3 3 3 3 3 1 1 1 1 1 7 4 4 9 2 10 1 11 11 11 11 11 16 5 11 11 11 11 4 4 8 4 7 2 9 10 1 11 11 67 67 66 1 51 11 60 60 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 // SPDX-License-Identifier: LGPL-2.1 /* * Copyright IBM Corporation, 2007 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> * */ #include <linux/slab.h> #include "ext4_jbd2.h" #include "ext4_extents.h" /* * The contiguous blocks details which can be * represented by a single extent */ struct migrate_struct { ext4_lblk_t first_block, last_block, curr_block; ext4_fsblk_t first_pblock, last_pblock; }; static int finish_range(handle_t *handle, struct inode *inode, struct migrate_struct *lb) { int retval = 0, needed; struct ext4_extent newext; struct ext4_ext_path *path; if (lb->first_pblock == 0) return 0; /* Add the extent to temp inode*/ newext.ee_block = cpu_to_le32(lb->first_block); newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1); ext4_ext_store_pblock(&newext, lb->first_pblock); /* Locking only for convenience since we are operating on temp inode */ down_write(&EXT4_I(inode)->i_data_sem); path = ext4_find_extent(inode, lb->first_block, NULL, 0); if (IS_ERR(path)) { retval = PTR_ERR(path); path = NULL; goto err_out; } /* * Calculate the credit needed to inserting this extent * Since we are doing this in loop we may accumulate extra * credit. But below we try to not accumulate too much * of them by restarting the journal. */ needed = ext4_ext_calc_credits_for_single_extent(inode, lb->last_block - lb->first_block + 1, path); retval = ext4_datasem_ensure_credits(handle, inode, needed, needed, 0); if (retval < 0) goto err_out; retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0); err_out: up_write((&EXT4_I(inode)->i_data_sem)); ext4_free_ext_path(path); lb->first_pblock = 0; return retval; } static int update_extent_range(handle_t *handle, struct inode *inode, ext4_fsblk_t pblock, struct migrate_struct *lb) { int retval; /* * See if we can add on to the existing range (if it exists) */ if (lb->first_pblock && (lb->last_pblock+1 == pblock) && (lb->last_block+1 == lb->curr_block)) { lb->last_pblock = pblock; lb->last_block = lb->curr_block; lb->curr_block++; return 0; } /* * Start a new range. */ retval = finish_range(handle, inode, lb); lb->first_pblock = lb->last_pblock = pblock; lb->first_block = lb->last_block = lb->curr_block; lb->curr_block++; return retval; } static int update_ind_extent_range(handle_t *handle, struct inode *inode, ext4_fsblk_t pblock, struct migrate_struct *lb) { struct buffer_head *bh; __le32 *i_data; int i, retval = 0; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; bh = ext4_sb_bread(inode->i_sb, pblock, 0); if (IS_ERR(bh)) return PTR_ERR(bh); i_data = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { if (i_data[i]) { retval = update_extent_range(handle, inode, le32_to_cpu(i_data[i]), lb); if (retval) break; } else { lb->curr_block++; } } put_bh(bh); return retval; } static int update_dind_extent_range(handle_t *handle, struct inode *inode, ext4_fsblk_t pblock, struct migrate_struct *lb) { struct buffer_head *bh; __le32 *i_data; int i, retval = 0; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; bh = ext4_sb_bread(inode->i_sb, pblock, 0); if (IS_ERR(bh)) return PTR_ERR(bh); i_data = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { if (i_data[i]) { retval = update_ind_extent_range(handle, inode, le32_to_cpu(i_data[i]), lb); if (retval) break; } else { /* Only update the file block number */ lb->curr_block += max_entries; } } put_bh(bh); return retval; } static int update_tind_extent_range(handle_t *handle, struct inode *inode, ext4_fsblk_t pblock, struct migrate_struct *lb) { struct buffer_head *bh; __le32 *i_data; int i, retval = 0; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; bh = ext4_sb_bread(inode->i_sb, pblock, 0); if (IS_ERR(bh)) return PTR_ERR(bh); i_data = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { if (i_data[i]) { retval = update_dind_extent_range(handle, inode, le32_to_cpu(i_data[i]), lb); if (retval) break; } else { /* Only update the file block number */ lb->curr_block += max_entries * max_entries; } } put_bh(bh); return retval; } static int free_dind_blocks(handle_t *handle, struct inode *inode, __le32 i_data) { int i; __le32 *tmp_idata; struct buffer_head *bh; struct super_block *sb = inode->i_sb; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; int err; bh = ext4_sb_bread(sb, le32_to_cpu(i_data), 0); if (IS_ERR(bh)) return PTR_ERR(bh); tmp_idata = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { if (tmp_idata[i]) { err = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS, ext4_free_metadata_revoke_credits(sb, 1)); if (err < 0) { put_bh(bh); return err; } ext4_free_blocks(handle, inode, NULL, le32_to_cpu(tmp_idata[i]), 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); } } put_bh(bh); err = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS, ext4_free_metadata_revoke_credits(sb, 1)); if (err < 0) return err; ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); return 0; } static int free_tind_blocks(handle_t *handle, struct inode *inode, __le32 i_data) { int i, retval = 0; __le32 *tmp_idata; struct buffer_head *bh; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; bh = ext4_sb_bread(inode->i_sb, le32_to_cpu(i_data), 0); if (IS_ERR(bh)) return PTR_ERR(bh); tmp_idata = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { if (tmp_idata[i]) { retval = free_dind_blocks(handle, inode, tmp_idata[i]); if (retval) { put_bh(bh); return retval; } } } put_bh(bh); retval = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS, ext4_free_metadata_revoke_credits(inode->i_sb, 1)); if (retval < 0) return retval; ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); return 0; } static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data) { int retval; /* ei->i_data[EXT4_IND_BLOCK] */ if (i_data[0]) { retval = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS, ext4_free_metadata_revoke_credits(inode->i_sb, 1)); if (retval < 0) return retval; ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data[0]), 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); } /* ei->i_data[EXT4_DIND_BLOCK] */ if (i_data[1]) { retval = free_dind_blocks(handle, inode, i_data[1]); if (retval) return retval; } /* ei->i_data[EXT4_TIND_BLOCK] */ if (i_data[2]) { retval = free_tind_blocks(handle, inode, i_data[2]); if (retval) return retval; } return 0; } static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, struct inode *tmp_inode) { int retval, retval2 = 0; __le32 i_data[3]; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_inode_info *tmp_ei = EXT4_I(tmp_inode); /* * One credit accounted for writing the * i_data field of the original inode */ retval = ext4_journal_ensure_credits(handle, 1, 0); if (retval < 0) goto err_out; i_data[0] = ei->i_data[EXT4_IND_BLOCK]; i_data[1] = ei->i_data[EXT4_DIND_BLOCK]; i_data[2] = ei->i_data[EXT4_TIND_BLOCK]; down_write(&EXT4_I(inode)->i_data_sem); /* * if EXT4_STATE_EXT_MIGRATE is cleared a block allocation * happened after we started the migrate. We need to * fail the migrate */ if (!ext4_test_inode_state(inode, EXT4_STATE_EXT_MIGRATE)) { retval = -EAGAIN; up_write(&EXT4_I(inode)->i_data_sem); goto err_out; } else ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); /* * We have the extent map build with the tmp inode. * Now copy the i_data across */ ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data)); /* * Update i_blocks with the new blocks that got * allocated while adding extents for extent index * blocks. * * While converting to extents we need not * update the original inode i_blocks for extent blocks * via quota APIs. The quota update happened via tmp_inode already. */ spin_lock(&inode->i_lock); inode->i_blocks += tmp_inode->i_blocks; spin_unlock(&inode->i_lock); up_write(&EXT4_I(inode)->i_data_sem); /* * We mark the inode dirty after, because we decrement the * i_blocks when freeing the indirect meta-data blocks */ retval = free_ind_block(handle, inode, i_data); retval2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(retval2 && !retval)) retval = retval2; err_out: return retval; } static int free_ext_idx(handle_t *handle, struct inode *inode, struct ext4_extent_idx *ix) { int i, retval = 0; ext4_fsblk_t block; struct buffer_head *bh; struct ext4_extent_header *eh; block = ext4_idx_pblock(ix); bh = ext4_sb_bread(inode->i_sb, block, 0); if (IS_ERR(bh)) return PTR_ERR(bh); eh = (struct ext4_extent_header *)bh->b_data; if (eh->eh_depth != 0) { ix = EXT_FIRST_INDEX(eh); for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) { retval = free_ext_idx(handle, inode, ix); if (retval) { put_bh(bh); return retval; } } } put_bh(bh); retval = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS, ext4_free_metadata_revoke_credits(inode->i_sb, 1)); if (retval < 0) return retval; ext4_free_blocks(handle, inode, NULL, block, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); return 0; } /* * Free the extent meta data blocks only */ static int free_ext_block(handle_t *handle, struct inode *inode) { int i, retval = 0; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_extent_header *eh = (struct ext4_extent_header *)ei->i_data; struct ext4_extent_idx *ix; if (eh->eh_depth == 0) /* * No extra blocks allocated for extent meta data */ return 0; ix = EXT_FIRST_INDEX(eh); for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) { retval = free_ext_idx(handle, inode, ix); if (retval) return retval; } return retval; } int ext4_ext_migrate(struct inode *inode) { handle_t *handle; int retval = 0, i; __le32 *i_data; struct ext4_inode_info *ei; struct inode *tmp_inode = NULL; struct migrate_struct lb; unsigned long max_entries; __u32 goal, tmp_csum_seed; uid_t owner[2]; int alloc_ctx; /* * If the filesystem does not support extents, or the inode * already is extent-based, error out. */ if (!ext4_has_feature_extents(inode->i_sb) || ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || ext4_has_inline_data(inode)) return -EINVAL; if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) /* * don't migrate fast symlink */ return retval; alloc_ctx = ext4_writepages_down_write(inode->i_sb); /* * Worst case we can touch the allocation bitmaps and a block * group descriptor block. We do need to worry about * credits for modifying the quota inode. */ handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb)); if (IS_ERR(handle)) { retval = PTR_ERR(handle); goto out_unlock; } goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; owner[0] = i_uid_read(inode); owner[1] = i_gid_read(inode); tmp_inode = ext4_new_inode(handle, d_inode(inode->i_sb->s_root), S_IFREG, NULL, goal, owner, 0); if (IS_ERR(tmp_inode)) { retval = PTR_ERR(tmp_inode); ext4_journal_stop(handle); goto out_unlock; } /* * Use the correct seed for checksum (i.e. the seed from 'inode'). This * is so that the metadata blocks will have the correct checksum after * the migration. */ ei = EXT4_I(inode); tmp_csum_seed = EXT4_I(tmp_inode)->i_csum_seed; EXT4_I(tmp_inode)->i_csum_seed = ei->i_csum_seed; i_size_write(tmp_inode, i_size_read(inode)); /* * Set the i_nlink to zero so it will be deleted later * when we drop inode reference. */ clear_nlink(tmp_inode); ext4_ext_tree_init(handle, tmp_inode); ext4_journal_stop(handle); /* * start with one credit accounted for * superblock modification. * * For the tmp_inode we already have committed the * transaction that created the inode. Later as and * when we add extents we extent the journal */ /* * Even though we take i_rwsem we can still cause block * allocation via mmap write to holes. If we have allocated * new blocks we fail migrate. New block allocation will * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated * with i_data_sem held to prevent racing with block * allocation. */ down_read(&EXT4_I(inode)->i_data_sem); ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE); up_read((&EXT4_I(inode)->i_data_sem)); handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); if (IS_ERR(handle)) { retval = PTR_ERR(handle); goto out_tmp_inode; } i_data = ei->i_data; memset(&lb, 0, sizeof(lb)); /* 32 bit block address 4 bytes */ max_entries = inode->i_sb->s_blocksize >> 2; for (i = 0; i < EXT4_NDIR_BLOCKS; i++) { if (i_data[i]) { retval = update_extent_range(handle, tmp_inode, le32_to_cpu(i_data[i]), &lb); if (retval) goto err_out; } else lb.curr_block++; } if (i_data[EXT4_IND_BLOCK]) { retval = update_ind_extent_range(handle, tmp_inode, le32_to_cpu(i_data[EXT4_IND_BLOCK]), &lb); if (retval) goto err_out; } else lb.curr_block += max_entries; if (i_data[EXT4_DIND_BLOCK]) { retval = update_dind_extent_range(handle, tmp_inode, le32_to_cpu(i_data[EXT4_DIND_BLOCK]), &lb); if (retval) goto err_out; } else lb.curr_block += max_entries * max_entries; if (i_data[EXT4_TIND_BLOCK]) { retval = update_tind_extent_range(handle, tmp_inode, le32_to_cpu(i_data[EXT4_TIND_BLOCK]), &lb); if (retval) goto err_out; } /* * Build the last extent */ retval = finish_range(handle, tmp_inode, &lb); err_out: if (retval) /* * Failure case delete the extent information with the * tmp_inode */ free_ext_block(handle, tmp_inode); else { retval = ext4_ext_swap_inode_data(handle, inode, tmp_inode); if (retval) /* * if we fail to swap inode data free the extent * details of the tmp inode */ free_ext_block(handle, tmp_inode); } /* We mark the tmp_inode dirty via ext4_ext_tree_init. */ retval = ext4_journal_ensure_credits(handle, 1, 0); if (retval < 0) goto out_stop; /* * Mark the tmp_inode as of size zero */ i_size_write(tmp_inode, 0); /* * set the i_blocks count to zero * so that the ext4_evict_inode() does the * right job * * We don't need to take the i_lock because * the inode is not visible to user space. */ tmp_inode->i_blocks = 0; EXT4_I(tmp_inode)->i_csum_seed = tmp_csum_seed; /* Reset the extent details */ ext4_ext_tree_init(handle, tmp_inode); out_stop: ext4_journal_stop(handle); out_tmp_inode: unlock_new_inode(tmp_inode); iput(tmp_inode); out_unlock: ext4_writepages_up_write(inode->i_sb, alloc_ctx); return retval; } /* * Migrate a simple extent-based inode to use the i_blocks[] array */ int ext4_ind_migrate(struct inode *inode) { struct ext4_extent_header *eh; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_super_block *es = sbi->s_es; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_extent *ex; unsigned int i, len; ext4_lblk_t start, end; ext4_fsblk_t blk; handle_t *handle; int ret, ret2 = 0; int alloc_ctx; if (!ext4_has_feature_extents(inode->i_sb) || (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EINVAL; if (ext4_has_feature_bigalloc(inode->i_sb)) return -EOPNOTSUPP; /* * In order to get correct extent info, force all delayed allocation * blocks to be allocated, otherwise delayed allocation blocks may not * be reflected and bypass the checks on extent header. */ if (test_opt(inode->i_sb, DELALLOC)) ext4_alloc_da_blocks(inode); alloc_ctx = ext4_writepages_down_write(inode->i_sb); handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out_unlock; } down_write(&EXT4_I(inode)->i_data_sem); ret = ext4_ext_check_inode(inode); if (ret) goto errout; eh = ext_inode_hdr(inode); ex = EXT_FIRST_EXTENT(eh); if (ext4_blocks_count(es) > EXT4_MAX_BLOCK_FILE_PHYS || eh->eh_depth != 0 || le16_to_cpu(eh->eh_entries) > 1) { ret = -EOPNOTSUPP; goto errout; } if (eh->eh_entries == 0) blk = len = start = end = 0; else { len = le16_to_cpu(ex->ee_len); blk = ext4_ext_pblock(ex); start = le32_to_cpu(ex->ee_block); end = start + len - 1; if (end >= EXT4_NDIR_BLOCKS) { ret = -EOPNOTSUPP; goto errout; } } ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); memset(ei->i_data, 0, sizeof(ei->i_data)); for (i = start; i <= end; i++) ei->i_data[i] = cpu_to_le32(blk++); ret2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret2 && !ret)) ret = ret2; errout: ext4_journal_stop(handle); up_write(&EXT4_I(inode)->i_data_sem); out_unlock: ext4_writepages_up_write(inode->i_sb, alloc_ctx); return ret; }
1 1 1 1 1 1 1 1 1 1 136 43 1 1 1 22 21 1 87 87 87 87 87 102 103 1 103 45 21 21 21 20 104 97 51 104 1 104 103 21 1 21 21 21 2 21 1 102 21 21 104 104 1 58 58 58 36 35 36 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 // SPDX-License-Identifier: GPL-2.0 #include <linux/minmax.h> #include "misc.h" #include "ctree.h" #include "space-info.h" #include "sysfs.h" #include "volumes.h" #include "free-space-cache.h" #include "ordered-data.h" #include "transaction.h" #include "block-group.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" /* * HOW DOES SPACE RESERVATION WORK * * If you want to know about delalloc specifically, there is a separate comment * for that with the delalloc code. This comment is about how the whole system * works generally. * * BASIC CONCEPTS * * 1) space_info. This is the ultimate arbiter of how much space we can use. * There's a description of the bytes_ fields with the struct declaration, * refer to that for specifics on each field. Suffice it to say that for * reservations we care about total_bytes - SUM(space_info->bytes_) when * determining if there is space to make an allocation. There is a space_info * for METADATA, SYSTEM, and DATA areas. * * 2) block_rsv's. These are basically buckets for every different type of * metadata reservation we have. You can see the comment in the block_rsv * code on the rules for each type, but generally block_rsv->reserved is how * much space is accounted for in space_info->bytes_may_use. * * 3) btrfs_calc*_size. These are the worst case calculations we used based * on the number of items we will want to modify. We have one for changing * items, and one for inserting new items. Generally we use these helpers to * determine the size of the block reserves, and then use the actual bytes * values to adjust the space_info counters. * * MAKING RESERVATIONS, THE NORMAL CASE * * We call into either btrfs_reserve_data_bytes() or * btrfs_reserve_metadata_bytes(), depending on which we're looking for, with * num_bytes we want to reserve. * * ->reserve * space_info->bytes_may_reserve += num_bytes * * ->extent allocation * Call btrfs_add_reserved_bytes() which does * space_info->bytes_may_reserve -= num_bytes * space_info->bytes_reserved += extent_bytes * * ->insert reference * Call btrfs_update_block_group() which does * space_info->bytes_reserved -= extent_bytes * space_info->bytes_used += extent_bytes * * MAKING RESERVATIONS, FLUSHING NORMALLY (non-priority) * * Assume we are unable to simply make the reservation because we do not have * enough space * * -> __reserve_bytes * create a reserve_ticket with ->bytes set to our reservation, add it to * the tail of space_info->tickets, kick async flush thread * * ->handle_reserve_ticket * wait on ticket->wait for ->bytes to be reduced to 0, or ->error to be set * on the ticket. * * -> btrfs_async_reclaim_metadata_space/btrfs_async_reclaim_data_space * Flushes various things attempting to free up space. * * -> btrfs_try_granting_tickets() * This is called by anything that either subtracts space from * space_info->bytes_may_use, ->bytes_pinned, etc, or adds to the * space_info->total_bytes. This loops through the ->priority_tickets and * then the ->tickets list checking to see if the reservation can be * completed. If it can the space is added to space_info->bytes_may_use and * the ticket is woken up. * * -> ticket wakeup * Check if ->bytes == 0, if it does we got our reservation and we can carry * on, if not return the appropriate error (ENOSPC, but can be EINTR if we * were interrupted.) * * MAKING RESERVATIONS, FLUSHING HIGH PRIORITY * * Same as the above, except we add ourselves to the * space_info->priority_tickets, and we do not use ticket->wait, we simply * call flush_space() ourselves for the states that are safe for us to call * without deadlocking and hope for the best. * * THE FLUSHING STATES * * Generally speaking we will have two cases for each state, a "nice" state * and a "ALL THE THINGS" state. In btrfs we delay a lot of work in order to * reduce the locking over head on the various trees, and even to keep from * doing any work at all in the case of delayed refs. Each of these delayed * things however hold reservations, and so letting them run allows us to * reclaim space so we can make new reservations. * * FLUSH_DELAYED_ITEMS * Every inode has a delayed item to update the inode. Take a simple write * for example, we would update the inode item at write time to update the * mtime, and then again at finish_ordered_io() time in order to update the * isize or bytes. We keep these delayed items to coalesce these operations * into a single operation done on demand. These are an easy way to reclaim * metadata space. * * FLUSH_DELALLOC * Look at the delalloc comment to get an idea of how much space is reserved * for delayed allocation. We can reclaim some of this space simply by * running delalloc, but usually we need to wait for ordered extents to * reclaim the bulk of this space. * * FLUSH_DELAYED_REFS * We have a block reserve for the outstanding delayed refs space, and every * delayed ref operation holds a reservation. Running these is a quick way * to reclaim space, but we want to hold this until the end because COW can * churn a lot and we can avoid making some extent tree modifications if we * are able to delay for as long as possible. * * ALLOC_CHUNK * We will skip this the first time through space reservation, because of * overcommit and we don't want to have a lot of useless metadata space when * our worst case reservations will likely never come true. * * RUN_DELAYED_IPUTS * If we're freeing inodes we're likely freeing checksums, file extent * items, and extent tree items. Loads of space could be freed up by these * operations, however they won't be usable until the transaction commits. * * COMMIT_TRANS * This will commit the transaction. Historically we had a lot of logic * surrounding whether or not we'd commit the transaction, but this waits born * out of a pre-tickets era where we could end up committing the transaction * thousands of times in a row without making progress. Now thanks to our * ticketing system we know if we're not making progress and can error * everybody out after a few commits rather than burning the disk hoping for * a different answer. * * OVERCOMMIT * * Because we hold so many reservations for metadata we will allow you to * reserve more space than is currently free in the currently allocate * metadata space. This only happens with metadata, data does not allow * overcommitting. * * You can see the current logic for when we allow overcommit in * btrfs_can_overcommit(), but it only applies to unallocated space. If there * is no unallocated space to be had, all reservations are kept within the * free space in the allocated metadata chunks. * * Because of overcommitting, you generally want to use the * btrfs_can_overcommit() logic for metadata allocations, as it does the right * thing with or without extra unallocated space. */ u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info, bool may_use_included) { ASSERT(s_info); return s_info->bytes_used + s_info->bytes_reserved + s_info->bytes_pinned + s_info->bytes_readonly + s_info->bytes_zone_unusable + (may_use_included ? s_info->bytes_may_use : 0); } /* * after adding space to the filesystem, we need to clear the full flags * on all the space infos. */ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) { struct list_head *head = &info->space_info; struct btrfs_space_info *found; list_for_each_entry(found, head, list) found->full = 0; } /* * Block groups with more than this value (percents) of unusable space will be * scheduled for background reclaim. */ #define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75) #define BTRFS_DYNAMIC_RECLAIM_THRESH_MAX (90) /* * Calculate chunk size depending on volume type (regular or zoned). */ static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags) { if (btrfs_is_zoned(fs_info)) return fs_info->zone_size; ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK); if (flags & BTRFS_BLOCK_GROUP_DATA) return BTRFS_MAX_DATA_CHUNK_SIZE; else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) return SZ_32M; /* Handle BTRFS_BLOCK_GROUP_METADATA */ if (fs_info->fs_devices->total_rw_bytes > 50ULL * SZ_1G) return SZ_1G; return SZ_256M; } /* * Update default chunk size. */ void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, u64 chunk_size) { WRITE_ONCE(space_info->chunk_size, chunk_size); } static int create_space_info(struct btrfs_fs_info *info, u64 flags) { struct btrfs_space_info *space_info; int i; int ret; space_info = kzalloc(sizeof(*space_info), GFP_NOFS); if (!space_info) return -ENOMEM; space_info->fs_info = info; for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) INIT_LIST_HEAD(&space_info->block_groups[i]); init_rwsem(&space_info->groups_sem); spin_lock_init(&space_info->lock); space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; INIT_LIST_HEAD(&space_info->ro_bgs); INIT_LIST_HEAD(&space_info->tickets); INIT_LIST_HEAD(&space_info->priority_tickets); space_info->clamp = 1; btrfs_update_space_info_chunk_size(space_info, calc_chunk_size(info, flags)); if (btrfs_is_zoned(info)) space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH; ret = btrfs_sysfs_add_space_info_type(info, space_info); if (ret) return ret; list_add(&space_info->list, &info->space_info); if (flags & BTRFS_BLOCK_GROUP_DATA) info->data_sinfo = space_info; return ret; } int btrfs_init_space_info(struct btrfs_fs_info *fs_info) { struct btrfs_super_block *disk_super; u64 features; u64 flags; int mixed = 0; int ret; disk_super = fs_info->super_copy; if (!btrfs_super_root(disk_super)) return -EINVAL; features = btrfs_super_incompat_flags(disk_super); if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) mixed = 1; flags = BTRFS_BLOCK_GROUP_SYSTEM; ret = create_space_info(fs_info, flags); if (ret) goto out; if (mixed) { flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; ret = create_space_info(fs_info, flags); } else { flags = BTRFS_BLOCK_GROUP_METADATA; ret = create_space_info(fs_info, flags); if (ret) goto out; flags = BTRFS_BLOCK_GROUP_DATA; ret = create_space_info(fs_info, flags); } out: return ret; } void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, struct btrfs_block_group *block_group) { struct btrfs_space_info *found; int factor, index; factor = btrfs_bg_type_to_factor(block_group->flags); found = btrfs_find_space_info(info, block_group->flags); ASSERT(found); spin_lock(&found->lock); found->total_bytes += block_group->length; found->disk_total += block_group->length * factor; found->bytes_used += block_group->used; found->disk_used += block_group->used * factor; found->bytes_readonly += block_group->bytes_super; found->bytes_zone_unusable += block_group->zone_unusable; if (block_group->length > 0) found->full = 0; btrfs_try_granting_tickets(info, found); spin_unlock(&found->lock); block_group->space_info = found; index = btrfs_bg_flags_to_raid_index(block_group->flags); down_write(&found->groups_sem); list_add_tail(&block_group->list, &found->block_groups[index]); up_write(&found->groups_sem); } struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, u64 flags) { struct list_head *head = &info->space_info; struct btrfs_space_info *found; flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; list_for_each_entry(found, head, list) { if (found->flags & flags) return found; } return NULL; } static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, enum btrfs_reserve_flush_enum flush) { struct btrfs_space_info *data_sinfo; u64 profile; u64 avail; u64 data_chunk_size; int factor; if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM) profile = btrfs_system_alloc_profile(fs_info); else profile = btrfs_metadata_alloc_profile(fs_info); avail = atomic64_read(&fs_info->free_chunk_space); /* * If we have dup, raid1 or raid10 then only half of the free * space is actually usable. For raid56, the space info used * doesn't include the parity drive, so we don't have to * change the math */ factor = btrfs_bg_type_to_factor(profile); avail = div_u64(avail, factor); if (avail == 0) return 0; /* * Calculate the data_chunk_size, space_info->chunk_size is the * "optimal" chunk size based on the fs size. However when we actually * allocate the chunk we will strip this down further, making it no more * than 10% of the disk or 1G, whichever is smaller. */ data_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); data_chunk_size = min(data_sinfo->chunk_size, mult_perc(fs_info->fs_devices->total_rw_bytes, 10)); data_chunk_size = min_t(u64, data_chunk_size, SZ_1G); /* * Since data allocations immediately use block groups as part of the * reservation, because we assume that data reservations will == actual * usage, we could potentially overcommit and then immediately have that * available space used by a data allocation, which could put us in a * bind when we get close to filling the file system. * * To handle this simply remove the data_chunk_size from the available * space. If we are relatively empty this won't affect our ability to * overcommit much, and if we're very close to full it'll keep us from * getting into a position where we've given ourselves very little * metadata wiggle room. */ if (avail <= data_chunk_size) return 0; avail -= data_chunk_size; /* * If we aren't flushing all things, let us overcommit up to * 1/2th of the space. If we can flush, don't let us overcommit * too much, let it overcommit up to 1/8 of the space. */ if (flush == BTRFS_RESERVE_FLUSH_ALL) avail >>= 3; else avail >>= 1; return avail; } int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush) { u64 avail; u64 used; /* Don't overcommit when in mixed mode */ if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) return 0; used = btrfs_space_info_used(space_info, true); avail = calc_available_free_space(fs_info, space_info, flush); if (used + bytes < space_info->total_bytes + avail) return 1; return 0; } static void remove_ticket(struct btrfs_space_info *space_info, struct reserve_ticket *ticket) { if (!list_empty(&ticket->list)) { list_del_init(&ticket->list); ASSERT(space_info->reclaim_size >= ticket->bytes); space_info->reclaim_size -= ticket->bytes; } } /* * This is for space we already have accounted in space_info->bytes_may_use, so * basically when we're returning space from block_rsv's. */ void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info) { struct list_head *head; enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; lockdep_assert_held(&space_info->lock); head = &space_info->priority_tickets; again: while (!list_empty(head)) { struct reserve_ticket *ticket; u64 used = btrfs_space_info_used(space_info, true); ticket = list_first_entry(head, struct reserve_ticket, list); /* Check and see if our ticket can be satisfied now. */ if ((used + ticket->bytes <= space_info->total_bytes) || btrfs_can_overcommit(fs_info, space_info, ticket->bytes, flush)) { btrfs_space_info_update_bytes_may_use(fs_info, space_info, ticket->bytes); remove_ticket(space_info, ticket); ticket->bytes = 0; space_info->tickets_id++; wake_up(&ticket->wait); } else { break; } } if (head == &space_info->priority_tickets) { head = &space_info->tickets; flush = BTRFS_RESERVE_FLUSH_ALL; goto again; } } #define DUMP_BLOCK_RSV(fs_info, rsv_name) \ do { \ struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \ spin_lock(&__rsv->lock); \ btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \ __rsv->size, __rsv->reserved); \ spin_unlock(&__rsv->lock); \ } while (0) static const char *space_info_flag_to_str(const struct btrfs_space_info *space_info) { switch (space_info->flags) { case BTRFS_BLOCK_GROUP_SYSTEM: return "SYSTEM"; case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA: return "DATA+METADATA"; case BTRFS_BLOCK_GROUP_DATA: return "DATA"; case BTRFS_BLOCK_GROUP_METADATA: return "METADATA"; default: return "UNKNOWN"; } } static void dump_global_block_rsv(struct btrfs_fs_info *fs_info) { DUMP_BLOCK_RSV(fs_info, global_block_rsv); DUMP_BLOCK_RSV(fs_info, trans_block_rsv); DUMP_BLOCK_RSV(fs_info, chunk_block_rsv); DUMP_BLOCK_RSV(fs_info, delayed_block_rsv); DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); } static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *info) { const char *flag_str = space_info_flag_to_str(info); lockdep_assert_held(&info->lock); /* The free space could be negative in case of overcommit */ btrfs_info(fs_info, "space_info %s has %lld free, is %sfull", flag_str, (s64)(info->total_bytes - btrfs_space_info_used(info, true)), info->full ? "" : "not "); btrfs_info(fs_info, "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu", info->total_bytes, info->bytes_used, info->bytes_pinned, info->bytes_reserved, info->bytes_may_use, info->bytes_readonly, info->bytes_zone_unusable); } void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *info, u64 bytes, int dump_block_groups) { struct btrfs_block_group *cache; u64 total_avail = 0; int index = 0; spin_lock(&info->lock); __btrfs_dump_space_info(fs_info, info); dump_global_block_rsv(fs_info); spin_unlock(&info->lock); if (!dump_block_groups) return; down_read(&info->groups_sem); again: list_for_each_entry(cache, &info->block_groups[index], list) { u64 avail; spin_lock(&cache->lock); avail = cache->length - cache->used - cache->pinned - cache->reserved - cache->delalloc_bytes - cache->bytes_super - cache->zone_unusable; btrfs_info(fs_info, "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu delalloc %llu super %llu zone_unusable (%llu bytes available) %s", cache->start, cache->length, cache->used, cache->pinned, cache->reserved, cache->delalloc_bytes, cache->bytes_super, cache->zone_unusable, avail, cache->ro ? "[readonly]" : ""); spin_unlock(&cache->lock); btrfs_dump_free_space(cache, bytes); total_avail += avail; } if (++index < BTRFS_NR_RAID_TYPES) goto again; up_read(&info->groups_sem); btrfs_info(fs_info, "%llu bytes available across all block groups", total_avail); } static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info, u64 to_reclaim) { u64 bytes; u64 nr; bytes = btrfs_calc_insert_metadata_size(fs_info, 1); nr = div64_u64(to_reclaim, bytes); if (!nr) nr = 1; return nr; } #define EXTENT_SIZE_PER_ITEM SZ_256K /* * shrink metadata reservation for delalloc */ static void shrink_delalloc(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 to_reclaim, bool wait_ordered, bool for_preempt) { struct btrfs_trans_handle *trans; u64 delalloc_bytes; u64 ordered_bytes; u64 items; long time_left; int loops; delalloc_bytes = percpu_counter_sum_positive(&fs_info->delalloc_bytes); ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes); if (delalloc_bytes == 0 && ordered_bytes == 0) return; /* Calc the number of the pages we need flush for space reservation */ if (to_reclaim == U64_MAX) { items = U64_MAX; } else { /* * to_reclaim is set to however much metadata we need to * reclaim, but reclaiming that much data doesn't really track * exactly. What we really want to do is reclaim full inode's * worth of reservations, however that's not available to us * here. We will take a fraction of the delalloc bytes for our * flushing loops and hope for the best. Delalloc will expand * the amount we write to cover an entire dirty extent, which * will reclaim the metadata reservation for that range. If * it's not enough subsequent flush stages will be more * aggressive. */ to_reclaim = max(to_reclaim, delalloc_bytes >> 3); items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2; } trans = current->journal_info; /* * If we are doing more ordered than delalloc we need to just wait on * ordered extents, otherwise we'll waste time trying to flush delalloc * that likely won't give us the space back we need. */ if (ordered_bytes > delalloc_bytes && !for_preempt) wait_ordered = true; loops = 0; while ((delalloc_bytes || ordered_bytes) && loops < 3) { u64 temp = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; long nr_pages = min_t(u64, temp, LONG_MAX); int async_pages; btrfs_start_delalloc_roots(fs_info, nr_pages, true); /* * We need to make sure any outstanding async pages are now * processed before we continue. This is because things like * sync_inode() try to be smart and skip writing if the inode is * marked clean. We don't use filemap_fwrite for flushing * because we want to control how many pages we write out at a * time, thus this is the only safe way to make sure we've * waited for outstanding compressed workers to have started * their jobs and thus have ordered extents set up properly. * * This exists because we do not want to wait for each * individual inode to finish its async work, we simply want to * start the IO on everybody, and then come back here and wait * for all of the async work to catch up. Once we're done with * that we know we'll have ordered extents for everything and we * can decide if we wait for that or not. * * If we choose to replace this in the future, make absolutely * sure that the proper waiting is being done in the async case, * as there have been bugs in that area before. */ async_pages = atomic_read(&fs_info->async_delalloc_pages); if (!async_pages) goto skip_async; /* * We don't want to wait forever, if we wrote less pages in this * loop than we have outstanding, only wait for that number of * pages, otherwise we can wait for all async pages to finish * before continuing. */ if (async_pages > nr_pages) async_pages -= nr_pages; else async_pages = 0; wait_event(fs_info->async_submit_wait, atomic_read(&fs_info->async_delalloc_pages) <= async_pages); skip_async: loops++; if (wait_ordered && !trans) { btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); } else { time_left = schedule_timeout_killable(1); if (time_left) break; } /* * If we are for preemption we just want a one-shot of delalloc * flushing so we can stop flushing if we decide we don't need * to anymore. */ if (for_preempt) break; spin_lock(&space_info->lock); if (list_empty(&space_info->tickets) && list_empty(&space_info->priority_tickets)) { spin_unlock(&space_info->lock); break; } spin_unlock(&space_info->lock); delalloc_bytes = percpu_counter_sum_positive( &fs_info->delalloc_bytes); ordered_bytes = percpu_counter_sum_positive( &fs_info->ordered_bytes); } } /* * Try to flush some data based on policy set by @state. This is only advisory * and may fail for various reasons. The caller is supposed to examine the * state of @space_info to detect the outcome. */ static void flush_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 num_bytes, enum btrfs_flush_state state, bool for_preempt) { struct btrfs_root *root = fs_info->tree_root; struct btrfs_trans_handle *trans; int nr; int ret = 0; switch (state) { case FLUSH_DELAYED_ITEMS_NR: case FLUSH_DELAYED_ITEMS: if (state == FLUSH_DELAYED_ITEMS_NR) nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2; else nr = -1; trans = btrfs_join_transaction_nostart(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); if (ret == -ENOENT) ret = 0; break; } ret = btrfs_run_delayed_items_nr(trans, nr); btrfs_end_transaction(trans); break; case FLUSH_DELALLOC: case FLUSH_DELALLOC_WAIT: case FLUSH_DELALLOC_FULL: if (state == FLUSH_DELALLOC_FULL) num_bytes = U64_MAX; shrink_delalloc(fs_info, space_info, num_bytes, state != FLUSH_DELALLOC, for_preempt); break; case FLUSH_DELAYED_REFS_NR: case FLUSH_DELAYED_REFS: trans = btrfs_join_transaction_nostart(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); if (ret == -ENOENT) ret = 0; break; } if (state == FLUSH_DELAYED_REFS_NR) btrfs_run_delayed_refs(trans, num_bytes); else btrfs_run_delayed_refs(trans, 0); btrfs_end_transaction(trans); break; case ALLOC_CHUNK: case ALLOC_CHUNK_FORCE: trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); break; } ret = btrfs_chunk_alloc(trans, btrfs_get_alloc_profile(fs_info, space_info->flags), (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE); btrfs_end_transaction(trans); if (ret > 0 || ret == -ENOSPC) ret = 0; break; case RUN_DELAYED_IPUTS: /* * If we have pending delayed iputs then we could free up a * bunch of pinned space, so make sure we run the iputs before * we do our pinned bytes check below. */ btrfs_run_delayed_iputs(fs_info); btrfs_wait_on_delayed_iputs(fs_info); break; case COMMIT_TRANS: ASSERT(current->journal_info == NULL); /* * We don't want to start a new transaction, just attach to the * current one or wait it fully commits in case its commit is * happening at the moment. Note: we don't use a nostart join * because that does not wait for a transaction to fully commit * (only for it to be unblocked, state TRANS_STATE_UNBLOCKED). */ trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); if (ret == -ENOENT) ret = 0; break; } ret = btrfs_commit_transaction(trans); break; default: ret = -ENOSPC; break; } trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state, ret, for_preempt); return; } static inline u64 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info) { u64 used; u64 avail; u64 to_reclaim = space_info->reclaim_size; lockdep_assert_held(&space_info->lock); avail = calc_available_free_space(fs_info, space_info, BTRFS_RESERVE_FLUSH_ALL); used = btrfs_space_info_used(space_info, true); /* * We may be flushing because suddenly we have less space than we had * before, and now we're well over-committed based on our current free * space. If that's the case add in our overage so we make sure to put * appropriate pressure on the flushing state machine. */ if (space_info->total_bytes + avail < used) to_reclaim += used - (space_info->total_bytes + avail); return to_reclaim; } static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info) { u64 global_rsv_size = fs_info->global_block_rsv.reserved; u64 ordered, delalloc; u64 thresh; u64 used; thresh = mult_perc(space_info->total_bytes, 90); lockdep_assert_held(&space_info->lock); /* If we're just plain full then async reclaim just slows us down. */ if ((space_info->bytes_used + space_info->bytes_reserved + global_rsv_size) >= thresh) return false; used = space_info->bytes_may_use + space_info->bytes_pinned; /* The total flushable belongs to the global rsv, don't flush. */ if (global_rsv_size >= used) return false; /* * 128MiB is 1/4 of the maximum global rsv size. If we have less than * that devoted to other reservations then there's no sense in flushing, * we don't have a lot of things that need flushing. */ if (used - global_rsv_size <= SZ_128M) return false; /* * We have tickets queued, bail so we don't compete with the async * flushers. */ if (space_info->reclaim_size) return false; /* * If we have over half of the free space occupied by reservations or * pinned then we want to start flushing. * * We do not do the traditional thing here, which is to say * * if (used >= ((total_bytes + avail) / 2)) * return 1; * * because this doesn't quite work how we want. If we had more than 50% * of the space_info used by bytes_used and we had 0 available we'd just * constantly run the background flusher. Instead we want it to kick in * if our reclaimable space exceeds our clamped free space. * * Our clamping range is 2^1 -> 2^8. Practically speaking that means * the following: * * Amount of RAM Minimum threshold Maximum threshold * * 256GiB 1GiB 128GiB * 128GiB 512MiB 64GiB * 64GiB 256MiB 32GiB * 32GiB 128MiB 16GiB * 16GiB 64MiB 8GiB * * These are the range our thresholds will fall in, corresponding to how * much delalloc we need for the background flusher to kick in. */ thresh = calc_available_free_space(fs_info, space_info, BTRFS_RESERVE_FLUSH_ALL); used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_readonly + global_rsv_size; if (used < space_info->total_bytes) thresh += space_info->total_bytes - used; thresh >>= space_info->clamp; used = space_info->bytes_pinned; /* * If we have more ordered bytes than delalloc bytes then we're either * doing a lot of DIO, or we simply don't have a lot of delalloc waiting * around. Preemptive flushing is only useful in that it can free up * space before tickets need to wait for things to finish. In the case * of ordered extents, preemptively waiting on ordered extents gets us * nothing, if our reservations are tied up in ordered extents we'll * simply have to slow down writers by forcing them to wait on ordered * extents. * * In the case that ordered is larger than delalloc, only include the * block reserves that we would actually be able to directly reclaim * from. In this case if we're heavy on metadata operations this will * clearly be heavy enough to warrant preemptive flushing. In the case * of heavy DIO or ordered reservations, preemptive flushing will just * waste time and cause us to slow down. * * We want to make sure we truly are maxed out on ordered however, so * cut ordered in half, and if it's still higher than delalloc then we * can keep flushing. This is to avoid the case where we start * flushing, and now delalloc == ordered and we stop preemptively * flushing when we could still have several gigs of delalloc to flush. */ ordered = percpu_counter_read_positive(&fs_info->ordered_bytes) >> 1; delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes); if (ordered >= delalloc) used += fs_info->delayed_refs_rsv.reserved + fs_info->delayed_block_rsv.reserved; else used += space_info->bytes_may_use - global_rsv_size; return (used >= thresh && !btrfs_fs_closing(fs_info) && !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); } static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, struct reserve_ticket *ticket) { struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; u64 min_bytes; if (!ticket->steal) return false; if (global_rsv->space_info != space_info) return false; spin_lock(&global_rsv->lock); min_bytes = mult_perc(global_rsv->size, 10); if (global_rsv->reserved < min_bytes + ticket->bytes) { spin_unlock(&global_rsv->lock); return false; } global_rsv->reserved -= ticket->bytes; remove_ticket(space_info, ticket); ticket->bytes = 0; wake_up(&ticket->wait); space_info->tickets_id++; if (global_rsv->reserved < global_rsv->size) global_rsv->full = 0; spin_unlock(&global_rsv->lock); return true; } /* * We've exhausted our flushing, start failing tickets. * * @fs_info - fs_info for this fs * @space_info - the space info we were flushing * * We call this when we've exhausted our flushing ability and haven't made * progress in satisfying tickets. The reservation code handles tickets in * order, so if there is a large ticket first and then smaller ones we could * very well satisfy the smaller tickets. This will attempt to wake up any * tickets in the list to catch this case. * * This function returns true if it was able to make progress by clearing out * other tickets, or if it stumbles across a ticket that was smaller than the * first ticket. */ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info) { struct reserve_ticket *ticket; u64 tickets_id = space_info->tickets_id; const bool aborted = BTRFS_FS_ERROR(fs_info); trace_btrfs_fail_all_tickets(fs_info, space_info); if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { btrfs_info(fs_info, "cannot satisfy tickets, dumping space info"); __btrfs_dump_space_info(fs_info, space_info); } while (!list_empty(&space_info->tickets) && tickets_id == space_info->tickets_id) { ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); if (!aborted && steal_from_global_rsv(fs_info, space_info, ticket)) return true; if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) btrfs_info(fs_info, "failing ticket with %llu bytes", ticket->bytes); remove_ticket(space_info, ticket); if (aborted) ticket->error = -EIO; else ticket->error = -ENOSPC; wake_up(&ticket->wait); /* * We're just throwing tickets away, so more flushing may not * trip over btrfs_try_granting_tickets, so we need to call it * here to see if we can make progress with the next ticket in * the list. */ if (!aborted) btrfs_try_granting_tickets(fs_info, space_info); } return (tickets_id != space_info->tickets_id); } /* * This is for normal flushers, we can wait all goddamned day if we want to. We * will loop and continuously try to flush as long as we are making progress. * We count progress as clearing off tickets each time we have to loop. */ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) { struct btrfs_fs_info *fs_info; struct btrfs_space_info *space_info; u64 to_reclaim; enum btrfs_flush_state flush_state; int commit_cycles = 0; u64 last_tickets_id; fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); spin_lock(&space_info->lock); to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); if (!to_reclaim) { space_info->flush = 0; spin_unlock(&space_info->lock); return; } last_tickets_id = space_info->tickets_id; spin_unlock(&space_info->lock); flush_state = FLUSH_DELAYED_ITEMS_NR; do { flush_space(fs_info, space_info, to_reclaim, flush_state, false); spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { space_info->flush = 0; spin_unlock(&space_info->lock); return; } to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); if (last_tickets_id == space_info->tickets_id) { flush_state++; } else { last_tickets_id = space_info->tickets_id; flush_state = FLUSH_DELAYED_ITEMS_NR; if (commit_cycles) commit_cycles--; } /* * We do not want to empty the system of delalloc unless we're * under heavy pressure, so allow one trip through the flushing * logic before we start doing a FLUSH_DELALLOC_FULL. */ if (flush_state == FLUSH_DELALLOC_FULL && !commit_cycles) flush_state++; /* * We don't want to force a chunk allocation until we've tried * pretty hard to reclaim space. Think of the case where we * freed up a bunch of space and so have a lot of pinned space * to reclaim. We would rather use that than possibly create a * underutilized metadata chunk. So if this is our first run * through the flushing state machine skip ALLOC_CHUNK_FORCE and * commit the transaction. If nothing has changed the next go * around then we can force a chunk allocation. */ if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles) flush_state++; if (flush_state > COMMIT_TRANS) { commit_cycles++; if (commit_cycles > 2) { if (maybe_fail_all_tickets(fs_info, space_info)) { flush_state = FLUSH_DELAYED_ITEMS_NR; commit_cycles--; } else { space_info->flush = 0; } } else { flush_state = FLUSH_DELAYED_ITEMS_NR; } } spin_unlock(&space_info->lock); } while (flush_state <= COMMIT_TRANS); } /* * This handles pre-flushing of metadata space before we get to the point that * we need to start blocking threads on tickets. The logic here is different * from the other flush paths because it doesn't rely on tickets to tell us how * much we need to flush, instead it attempts to keep us below the 80% full * watermark of space by flushing whichever reservation pool is currently the * largest. */ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) { struct btrfs_fs_info *fs_info; struct btrfs_space_info *space_info; struct btrfs_block_rsv *delayed_block_rsv; struct btrfs_block_rsv *delayed_refs_rsv; struct btrfs_block_rsv *global_rsv; struct btrfs_block_rsv *trans_rsv; int loops = 0; fs_info = container_of(work, struct btrfs_fs_info, preempt_reclaim_work); space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); delayed_block_rsv = &fs_info->delayed_block_rsv; delayed_refs_rsv = &fs_info->delayed_refs_rsv; global_rsv = &fs_info->global_block_rsv; trans_rsv = &fs_info->trans_block_rsv; spin_lock(&space_info->lock); while (need_preemptive_reclaim(fs_info, space_info)) { enum btrfs_flush_state flush; u64 delalloc_size = 0; u64 to_reclaim, block_rsv_size; u64 global_rsv_size = global_rsv->reserved; loops++; /* * We don't have a precise counter for the metadata being * reserved for delalloc, so we'll approximate it by subtracting * out the block rsv's space from the bytes_may_use. If that * amount is higher than the individual reserves, then we can * assume it's tied up in delalloc reservations. */ block_rsv_size = global_rsv_size + delayed_block_rsv->reserved + delayed_refs_rsv->reserved + trans_rsv->reserved; if (block_rsv_size < space_info->bytes_may_use) delalloc_size = space_info->bytes_may_use - block_rsv_size; /* * We don't want to include the global_rsv in our calculation, * because that's space we can't touch. Subtract it from the * block_rsv_size for the next checks. */ block_rsv_size -= global_rsv_size; /* * We really want to avoid flushing delalloc too much, as it * could result in poor allocation patterns, so only flush it if * it's larger than the rest of the pools combined. */ if (delalloc_size > block_rsv_size) { to_reclaim = delalloc_size; flush = FLUSH_DELALLOC; } else if (space_info->bytes_pinned > (delayed_block_rsv->reserved + delayed_refs_rsv->reserved)) { to_reclaim = space_info->bytes_pinned; flush = COMMIT_TRANS; } else if (delayed_block_rsv->reserved > delayed_refs_rsv->reserved) { to_reclaim = delayed_block_rsv->reserved; flush = FLUSH_DELAYED_ITEMS_NR; } else { to_reclaim = delayed_refs_rsv->reserved; flush = FLUSH_DELAYED_REFS_NR; } spin_unlock(&space_info->lock); /* * We don't want to reclaim everything, just a portion, so scale * down the to_reclaim by 1/4. If it takes us down to 0, * reclaim 1 items worth. */ to_reclaim >>= 2; if (!to_reclaim) to_reclaim = btrfs_calc_insert_metadata_size(fs_info, 1); flush_space(fs_info, space_info, to_reclaim, flush, true); cond_resched(); spin_lock(&space_info->lock); } /* We only went through once, back off our clamping. */ if (loops == 1 && !space_info->reclaim_size) space_info->clamp = max(1, space_info->clamp - 1); trace_btrfs_done_preemptive_reclaim(fs_info, space_info); spin_unlock(&space_info->lock); } /* * FLUSH_DELALLOC_WAIT: * Space is freed from flushing delalloc in one of two ways. * * 1) compression is on and we allocate less space than we reserved * 2) we are overwriting existing space * * For #1 that extra space is reclaimed as soon as the delalloc pages are * COWed, by way of btrfs_add_reserved_bytes() which adds the actual extent * length to ->bytes_reserved, and subtracts the reserved space from * ->bytes_may_use. * * For #2 this is trickier. Once the ordered extent runs we will drop the * extent in the range we are overwriting, which creates a delayed ref for * that freed extent. This however is not reclaimed until the transaction * commits, thus the next stages. * * RUN_DELAYED_IPUTS * If we are freeing inodes, we want to make sure all delayed iputs have * completed, because they could have been on an inode with i_nlink == 0, and * thus have been truncated and freed up space. But again this space is not * immediately re-usable, it comes in the form of a delayed ref, which must be * run and then the transaction must be committed. * * COMMIT_TRANS * This is where we reclaim all of the pinned space generated by running the * iputs * * ALLOC_CHUNK_FORCE * For data we start with alloc chunk force, however we could have been full * before, and then the transaction commit could have freed new block groups, * so if we now have space to allocate do the force chunk allocation. */ static const enum btrfs_flush_state data_flush_states[] = { FLUSH_DELALLOC_FULL, RUN_DELAYED_IPUTS, COMMIT_TRANS, ALLOC_CHUNK_FORCE, }; static void btrfs_async_reclaim_data_space(struct work_struct *work) { struct btrfs_fs_info *fs_info; struct btrfs_space_info *space_info; u64 last_tickets_id; enum btrfs_flush_state flush_state = 0; fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work); space_info = fs_info->data_sinfo; spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { space_info->flush = 0; spin_unlock(&space_info->lock); return; } last_tickets_id = space_info->tickets_id; spin_unlock(&space_info->lock); while (!space_info->full) { flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false); spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { space_info->flush = 0; spin_unlock(&space_info->lock); return; } /* Something happened, fail everything and bail. */ if (BTRFS_FS_ERROR(fs_info)) goto aborted_fs; last_tickets_id = space_info->tickets_id; spin_unlock(&space_info->lock); } while (flush_state < ARRAY_SIZE(data_flush_states)) { flush_space(fs_info, space_info, U64_MAX, data_flush_states[flush_state], false); spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { space_info->flush = 0; spin_unlock(&space_info->lock); return; } if (last_tickets_id == space_info->tickets_id) { flush_state++; } else { last_tickets_id = space_info->tickets_id; flush_state = 0; } if (flush_state >= ARRAY_SIZE(data_flush_states)) { if (space_info->full) { if (maybe_fail_all_tickets(fs_info, space_info)) flush_state = 0; else space_info->flush = 0; } else { flush_state = 0; } /* Something happened, fail everything and bail. */ if (BTRFS_FS_ERROR(fs_info)) goto aborted_fs; } spin_unlock(&space_info->lock); } return; aborted_fs: maybe_fail_all_tickets(fs_info, space_info); space_info->flush = 0; spin_unlock(&space_info->lock); } void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info) { INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space); INIT_WORK(&fs_info->async_data_reclaim_work, btrfs_async_reclaim_data_space); INIT_WORK(&fs_info->preempt_reclaim_work, btrfs_preempt_reclaim_metadata_space); } static const enum btrfs_flush_state priority_flush_states[] = { FLUSH_DELAYED_ITEMS_NR, FLUSH_DELAYED_ITEMS, ALLOC_CHUNK, }; static const enum btrfs_flush_state evict_flush_states[] = { FLUSH_DELAYED_ITEMS_NR, FLUSH_DELAYED_ITEMS, FLUSH_DELAYED_REFS_NR, FLUSH_DELAYED_REFS, FLUSH_DELALLOC, FLUSH_DELALLOC_WAIT, FLUSH_DELALLOC_FULL, ALLOC_CHUNK, COMMIT_TRANS, }; static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, struct reserve_ticket *ticket, const enum btrfs_flush_state *states, int states_nr) { u64 to_reclaim; int flush_state = 0; spin_lock(&space_info->lock); to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); /* * This is the priority reclaim path, so to_reclaim could be >0 still * because we may have only satisfied the priority tickets and still * left non priority tickets on the list. We would then have * to_reclaim but ->bytes == 0. */ if (ticket->bytes == 0) { spin_unlock(&space_info->lock); return; } while (flush_state < states_nr) { spin_unlock(&space_info->lock); flush_space(fs_info, space_info, to_reclaim, states[flush_state], false); flush_state++; spin_lock(&space_info->lock); if (ticket->bytes == 0) { spin_unlock(&space_info->lock); return; } } /* * Attempt to steal from the global rsv if we can, except if the fs was * turned into error mode due to a transaction abort when flushing space * above, in that case fail with the abort error instead of returning * success to the caller if we can steal from the global rsv - this is * just to have caller fail immeditelly instead of later when trying to * modify the fs, making it easier to debug -ENOSPC problems. */ if (BTRFS_FS_ERROR(fs_info)) { ticket->error = BTRFS_FS_ERROR(fs_info); remove_ticket(space_info, ticket); } else if (!steal_from_global_rsv(fs_info, space_info, ticket)) { ticket->error = -ENOSPC; remove_ticket(space_info, ticket); } /* * We must run try_granting_tickets here because we could be a large * ticket in front of a smaller ticket that can now be satisfied with * the available space. */ btrfs_try_granting_tickets(fs_info, space_info); spin_unlock(&space_info->lock); } static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, struct reserve_ticket *ticket) { spin_lock(&space_info->lock); /* We could have been granted before we got here. */ if (ticket->bytes == 0) { spin_unlock(&space_info->lock); return; } while (!space_info->full) { spin_unlock(&space_info->lock); flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false); spin_lock(&space_info->lock); if (ticket->bytes == 0) { spin_unlock(&space_info->lock); return; } } ticket->error = -ENOSPC; remove_ticket(space_info, ticket); btrfs_try_granting_tickets(fs_info, space_info); spin_unlock(&space_info->lock); } static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, struct reserve_ticket *ticket) { DEFINE_WAIT(wait); int ret = 0; spin_lock(&space_info->lock); while (ticket->bytes > 0 && ticket->error == 0) { ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); if (ret) { /* * Delete us from the list. After we unlock the space * info, we don't want the async reclaim job to reserve * space for this ticket. If that would happen, then the * ticket's task would not known that space was reserved * despite getting an error, resulting in a space leak * (bytes_may_use counter of our space_info). */ remove_ticket(space_info, ticket); ticket->error = -EINTR; break; } spin_unlock(&space_info->lock); schedule(); finish_wait(&ticket->wait, &wait); spin_lock(&space_info->lock); } spin_unlock(&space_info->lock); } /* * Do the appropriate flushing and waiting for a ticket. * * @fs_info: the filesystem * @space_info: space info for the reservation * @ticket: ticket for the reservation * @start_ns: timestamp when the reservation started * @orig_bytes: amount of bytes originally reserved * @flush: how much we can flush * * This does the work of figuring out how to flush for the ticket, waiting for * the reservation, and returning the appropriate error if there is one. */ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, struct reserve_ticket *ticket, u64 start_ns, u64 orig_bytes, enum btrfs_reserve_flush_enum flush) { int ret; switch (flush) { case BTRFS_RESERVE_FLUSH_DATA: case BTRFS_RESERVE_FLUSH_ALL: case BTRFS_RESERVE_FLUSH_ALL_STEAL: wait_reserve_ticket(fs_info, space_info, ticket); break; case BTRFS_RESERVE_FLUSH_LIMIT: priority_reclaim_metadata_space(fs_info, space_info, ticket, priority_flush_states, ARRAY_SIZE(priority_flush_states)); break; case BTRFS_RESERVE_FLUSH_EVICT: priority_reclaim_metadata_space(fs_info, space_info, ticket, evict_flush_states, ARRAY_SIZE(evict_flush_states)); break; case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE: priority_reclaim_data_space(fs_info, space_info, ticket); break; default: ASSERT(0); break; } ret = ticket->error; ASSERT(list_empty(&ticket->list)); /* * Check that we can't have an error set if the reservation succeeded, * as that would confuse tasks and lead them to error out without * releasing reserved space (if an error happens the expectation is that * space wasn't reserved at all). */ ASSERT(!(ticket->bytes == 0 && ticket->error)); trace_btrfs_reserve_ticket(fs_info, space_info->flags, orig_bytes, start_ns, flush, ticket->error); return ret; } /* * This returns true if this flush state will go through the ordinary flushing * code. */ static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush) { return (flush == BTRFS_RESERVE_FLUSH_ALL) || (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL); } static inline void maybe_clamp_preempt(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info) { u64 ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes); u64 delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes); /* * If we're heavy on ordered operations then clamping won't help us. We * need to clamp specifically to keep up with dirty'ing buffered * writers, because there's not a 1:1 correlation of writing delalloc * and freeing space, like there is with flushing delayed refs or * delayed nodes. If we're already more ordered than delalloc then * we're keeping up, otherwise we aren't and should probably clamp. */ if (ordered < delalloc) space_info->clamp = min(space_info->clamp + 1, 8); } static inline bool can_steal(enum btrfs_reserve_flush_enum flush) { return (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL || flush == BTRFS_RESERVE_FLUSH_EVICT); } /* * NO_FLUSH and FLUSH_EMERGENCY don't want to create a ticket, they just want to * fail as quickly as possible. */ static inline bool can_ticket(enum btrfs_reserve_flush_enum flush) { return (flush != BTRFS_RESERVE_NO_FLUSH && flush != BTRFS_RESERVE_FLUSH_EMERGENCY); } /* * Try to reserve bytes from the block_rsv's space. * * @fs_info: the filesystem * @space_info: space info we want to allocate from * @orig_bytes: number of bytes we want * @flush: whether or not we can flush to make our reservation * * This will reserve orig_bytes number of bytes from the space info associated * with the block_rsv. If there is not enough space it will make an attempt to * flush out space to make room. It will do this by flushing delalloc if * possible or committing the transaction. If flush is 0 then no attempts to * regain reservations will be made and this will fail if there is not enough * space already. */ static int __reserve_bytes(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 orig_bytes, enum btrfs_reserve_flush_enum flush) { struct work_struct *async_work; struct reserve_ticket ticket; u64 start_ns = 0; u64 used; int ret = -ENOSPC; bool pending_tickets; ASSERT(orig_bytes); /* * If have a transaction handle (current->journal_info != NULL), then * the flush method can not be neither BTRFS_RESERVE_FLUSH_ALL* nor * BTRFS_RESERVE_FLUSH_EVICT, as we could deadlock because those * flushing methods can trigger transaction commits. */ if (current->journal_info) { /* One assert per line for easier debugging. */ ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL); ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL_STEAL); ASSERT(flush != BTRFS_RESERVE_FLUSH_EVICT); } if (flush == BTRFS_RESERVE_FLUSH_DATA) async_work = &fs_info->async_data_reclaim_work; else async_work = &fs_info->async_reclaim_work; spin_lock(&space_info->lock); used = btrfs_space_info_used(space_info, true); /* * We don't want NO_FLUSH allocations to jump everybody, they can * generally handle ENOSPC in a different way, so treat them the same as * normal flushers when it comes to skipping pending tickets. */ if (is_normal_flushing(flush) || (flush == BTRFS_RESERVE_NO_FLUSH)) pending_tickets = !list_empty(&space_info->tickets) || !list_empty(&space_info->priority_tickets); else pending_tickets = !list_empty(&space_info->priority_tickets); /* * Carry on if we have enough space (short-circuit) OR call * can_overcommit() to ensure we can overcommit to continue. */ if (!pending_tickets && ((used + orig_bytes <= space_info->total_bytes) || btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) { btrfs_space_info_update_bytes_may_use(fs_info, space_info, orig_bytes); ret = 0; } /* * Things are dire, we need to make a reservation so we don't abort. We * will let this reservation go through as long as we have actual space * left to allocate for the block. */ if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) { used = btrfs_space_info_used(space_info, false); if (used + orig_bytes <= space_info->total_bytes) { btrfs_space_info_update_bytes_may_use(fs_info, space_info, orig_bytes); ret = 0; } } /* * If we couldn't make a reservation then setup our reservation ticket * and kick the async worker if it's not already running. * * If we are a priority flusher then we just need to add our ticket to * the list and we will do our own flushing further down. */ if (ret && can_ticket(flush)) { ticket.bytes = orig_bytes; ticket.error = 0; space_info->reclaim_size += ticket.bytes; init_waitqueue_head(&ticket.wait); ticket.steal = can_steal(flush); if (trace_btrfs_reserve_ticket_enabled()) start_ns = ktime_get_ns(); if (flush == BTRFS_RESERVE_FLUSH_ALL || flush == BTRFS_RESERVE_FLUSH_ALL_STEAL || flush == BTRFS_RESERVE_FLUSH_DATA) { list_add_tail(&ticket.list, &space_info->tickets); if (!space_info->flush) { /* * We were forced to add a reserve ticket, so * our preemptive flushing is unable to keep * up. Clamp down on the threshold for the * preemptive flushing in order to keep up with * the workload. */ maybe_clamp_preempt(fs_info, space_info); space_info->flush = 1; trace_btrfs_trigger_flush(fs_info, space_info->flags, orig_bytes, flush, "enospc"); queue_work(system_unbound_wq, async_work); } } else { list_add_tail(&ticket.list, &space_info->priority_tickets); } } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { /* * We will do the space reservation dance during log replay, * which means we won't have fs_info->fs_root set, so don't do * the async reclaim as we will panic. */ if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && !work_busy(&fs_info->preempt_reclaim_work) && need_preemptive_reclaim(fs_info, space_info)) { trace_btrfs_trigger_flush(fs_info, space_info->flags, orig_bytes, flush, "preempt"); queue_work(system_unbound_wq, &fs_info->preempt_reclaim_work); } } spin_unlock(&space_info->lock); if (!ret || !can_ticket(flush)) return ret; return handle_reserve_ticket(fs_info, space_info, &ticket, start_ns, orig_bytes, flush); } /* * Try to reserve metadata bytes from the block_rsv's space. * * @fs_info: the filesystem * @space_info: the space_info we're allocating for * @orig_bytes: number of bytes we want * @flush: whether or not we can flush to make our reservation * * This will reserve orig_bytes number of bytes from the space info associated * with the block_rsv. If there is not enough space it will make an attempt to * flush out space to make room. It will do this by flushing delalloc if * possible or committing the transaction. If flush is 0 then no attempts to * regain reservations will be made and this will fail if there is not enough * space already. */ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 orig_bytes, enum btrfs_reserve_flush_enum flush) { int ret; ret = __reserve_bytes(fs_info, space_info, orig_bytes, flush); if (ret == -ENOSPC) { trace_btrfs_space_reservation(fs_info, "space_info:enospc", space_info->flags, orig_bytes, 1); if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) btrfs_dump_space_info(fs_info, space_info, orig_bytes, 0); } return ret; } /* * Try to reserve data bytes for an allocation. * * @fs_info: the filesystem * @bytes: number of bytes we need * @flush: how we are allowed to flush * * This will reserve bytes from the data space info. If there is not enough * space then we will attempt to flush space as specified by flush. */ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, enum btrfs_reserve_flush_enum flush) { struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; int ret; ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA || flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE || flush == BTRFS_RESERVE_NO_FLUSH); ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA); ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush); if (ret == -ENOSPC) { trace_btrfs_space_reservation(fs_info, "space_info:enospc", data_sinfo->flags, bytes, 1); if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0); } return ret; } /* Dump all the space infos when we abort a transaction due to ENOSPC. */ __cold void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info) { struct btrfs_space_info *space_info; btrfs_info(fs_info, "dumping space info:"); list_for_each_entry(space_info, &fs_info->space_info, list) { spin_lock(&space_info->lock); __btrfs_dump_space_info(fs_info, space_info); spin_unlock(&space_info->lock); } dump_global_block_rsv(fs_info); } /* * Account the unused space of all the readonly block group in the space_info. * takes mirrors into account. */ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) { struct btrfs_block_group *block_group; u64 free_bytes = 0; int factor; /* It's df, we don't care if it's racy */ if (list_empty(&sinfo->ro_bgs)) return 0; spin_lock(&sinfo->lock); list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) { spin_lock(&block_group->lock); if (!block_group->ro) { spin_unlock(&block_group->lock); continue; } factor = btrfs_bg_type_to_factor(block_group->flags); free_bytes += (block_group->length - block_group->used) * factor; spin_unlock(&block_group->lock); } spin_unlock(&sinfo->lock); return free_bytes; } static u64 calc_pct_ratio(u64 x, u64 y) { int err; if (!y) return 0; again: err = check_mul_overflow(100, x, &x); if (err) goto lose_precision; return div64_u64(x, y); lose_precision: x >>= 10; y >>= 10; if (!y) y = 1; goto again; } /* * The dynamic threshold formula is: * (unused / allocated) * (unused / unallocated) or equivalently * unused^2 / (allocated * unallocated) * * The fundamental goal of automatic reclaim is to protect the filesystem's * unallocated space and thus minimize the probability of the filesystem going * read only when a metadata allocation failure causes a transaction abort. * * However, relocations happen into the space_info's unused space, therefore * automatic reclaim must also back off as that space runs low. There is no * value in doing trivial "relocations" of re-writing the same block group * into a fresh one. * * unused / allocated sets a baseline, very conservative threshold which * properly goes to 0 as unused goes to a small portion of the allocated space. * * On its own, this would likely do very little reclaim, so include * unused / unallocated (which can be greatly in excess of 100%) to bias heavily * towards reclaim when unallocated goes low or unused goes high. */ static int calc_dynamic_reclaim_threshold(struct btrfs_space_info *space_info) { struct btrfs_fs_info *fs_info = space_info->fs_info; u64 unalloc = atomic64_read(&fs_info->free_chunk_space); u64 alloc = space_info->total_bytes; u64 used = btrfs_space_info_used(space_info, false); u64 unused = alloc - used; /* unused <= alloc; clamped to 100 */ int unused_pct = calc_pct_ratio(unused, alloc); u64 unused_unalloc_ratio = calc_pct_ratio(unused, unalloc); int err; u64 thresh; err = check_mul_overflow(unused_pct, unused_unalloc_ratio, &thresh); if (err) return BTRFS_DYNAMIC_RECLAIM_THRESH_MAX; /* Both quantities are percentages; remove the squared factor of 100. */ thresh = div64_u64(thresh, 100); return clamp_val(thresh, 0, BTRFS_DYNAMIC_RECLAIM_THRESH_MAX); } int btrfs_calc_reclaim_threshold(struct btrfs_space_info *space_info) { lockdep_assert_held(&space_info->lock); if (READ_ONCE(space_info->dynamic_reclaim)) return calc_dynamic_reclaim_threshold(space_info); return READ_ONCE(space_info->bg_reclaim_threshold); } /* * Under "urgent" reclaim, we will reclaim even fresh block groups that have * recently seen successful allocations, as we are desperate to reclaim * whatever we can to avoid ENOSPC in a transaction leading to a readonly fs. */ static bool is_reclaim_urgent(struct btrfs_space_info *space_info) { struct btrfs_fs_info *fs_info = space_info->fs_info; u64 unalloc = atomic64_read(&fs_info->free_chunk_space); u64 chunk_size = min(READ_ONCE(space_info->chunk_size), SZ_1G); return unalloc < chunk_size; } static int do_reclaim_sweep(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, int raid) { struct btrfs_block_group *bg; int thresh_pct; bool try_again = true; bool urgent; spin_lock(&space_info->lock); if (space_info->periodic_reclaim_ready) { space_info->periodic_reclaim_ready = false; } else { spin_unlock(&space_info->lock); return 0; } urgent = is_reclaim_urgent(space_info); thresh_pct = btrfs_calc_reclaim_threshold(space_info); spin_unlock(&space_info->lock); down_read(&space_info->groups_sem); again: list_for_each_entry(bg, &space_info->block_groups[raid], list) { u64 thresh; bool reclaim = false; btrfs_get_block_group(bg); spin_lock(&bg->lock); thresh = mult_perc(bg->length, thresh_pct); if (bg->used < thresh && bg->reclaim_mark) { try_again = false; reclaim = true; } bg->reclaim_mark++; spin_unlock(&bg->lock); if (reclaim) btrfs_mark_bg_to_reclaim(bg); btrfs_put_block_group(bg); } /* * In situations where we are very motivated to reclaim (low unalloc) * use two passes to make the reclaim mark check best effort. * * If we have any staler groups, we don't touch the fresher ones, but if we * really need a block group, do take a fresh one. */ if (try_again && urgent) { try_again = false; goto again; } up_read(&space_info->groups_sem); return 0; } int btrfs_reclaim_sweep(struct btrfs_fs_info *fs_info) { int ret; int raid; struct btrfs_space_info *space_info; list_for_each_entry(space_info, &fs_info->space_info, list) { if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM) continue; if (!READ_ONCE(space_info->periodic_reclaim)) continue; for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) { ret = do_reclaim_sweep(fs_info, space_info, raid); if (ret) return ret; } } return ret; }
527 527 527 25 527 3 3 3 3 25 293 191 312 313 312 313 312 191 25 312 310 237 18 219 191 191 191 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 // SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) "%s: " fmt, __func__ #include <linux/kernel.h> #include <linux/sched.h> #include <linux/wait.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/percpu-refcount.h> /* * Initially, a percpu refcount is just a set of percpu counters. Initially, we * don't try to detect the ref hitting 0 - which means that get/put can just * increment or decrement the local counter. Note that the counter on a * particular cpu can (and will) wrap - this is fine, when we go to shutdown the * percpu counters will all sum to the correct value * * (More precisely: because modular arithmetic is commutative the sum of all the * percpu_count vars will be equal to what it would have been if all the gets * and puts were done to a single integer, even if some of the percpu integers * overflow or underflow). * * The real trick to implementing percpu refcounts is shutdown. We can't detect * the ref hitting 0 on every put - this would require global synchronization * and defeat the whole purpose of using percpu refs. * * What we do is require the user to keep track of the initial refcount; we know * the ref can't hit 0 before the user drops the initial ref, so as long as we * convert to non percpu mode before the initial ref is dropped everything * works. * * Converting to non percpu mode is done with some RCUish stuff in * percpu_ref_kill. Additionally, we need a bias value so that the * atomic_long_t can't hit 0 before we've added up all the percpu refs. */ #define PERCPU_COUNT_BIAS (1LU << (BITS_PER_LONG - 1)) static DEFINE_SPINLOCK(percpu_ref_switch_lock); static DECLARE_WAIT_QUEUE_HEAD(percpu_ref_switch_waitq); static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref) { return (unsigned long __percpu *) (ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC_DEAD); } /** * percpu_ref_init - initialize a percpu refcount * @ref: percpu_ref to initialize * @release: function which will be called when refcount hits 0 * @flags: PERCPU_REF_INIT_* flags * @gfp: allocation mask to use * * Initializes @ref. @ref starts out in percpu mode with a refcount of 1 unless * @flags contains PERCPU_REF_INIT_ATOMIC or PERCPU_REF_INIT_DEAD. These flags * change the start state to atomic with the latter setting the initial refcount * to 0. See the definitions of PERCPU_REF_INIT_* flags for flag behaviors. * * Note that @release must not sleep - it may potentially be called from RCU * callback context by percpu_ref_kill(). */ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, unsigned int flags, gfp_t gfp) { size_t align = max_t(size_t, 1 << __PERCPU_REF_FLAG_BITS, __alignof__(unsigned long)); unsigned long start_count = 0; struct percpu_ref_data *data; ref->percpu_count_ptr = (unsigned long) __alloc_percpu_gfp(sizeof(unsigned long), align, gfp); if (!ref->percpu_count_ptr) return -ENOMEM; data = kzalloc(sizeof(*ref->data), gfp); if (!data) { free_percpu((void __percpu *)ref->percpu_count_ptr); ref->percpu_count_ptr = 0; return -ENOMEM; } data->force_atomic = flags & PERCPU_REF_INIT_ATOMIC; data->allow_reinit = flags & PERCPU_REF_ALLOW_REINIT; if (flags & (PERCPU_REF_INIT_ATOMIC | PERCPU_REF_INIT_DEAD)) { ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; data->allow_reinit = true; } else { start_count += PERCPU_COUNT_BIAS; } if (flags & PERCPU_REF_INIT_DEAD) ref->percpu_count_ptr |= __PERCPU_REF_DEAD; else start_count++; atomic_long_set(&data->count, start_count); data->release = release; data->confirm_switch = NULL; data->ref = ref; ref->data = data; return 0; } EXPORT_SYMBOL_GPL(percpu_ref_init); static void __percpu_ref_exit(struct percpu_ref *ref) { unsigned long __percpu *percpu_count = percpu_count_ptr(ref); if (percpu_count) { /* non-NULL confirm_switch indicates switching in progress */ WARN_ON_ONCE(ref->data && ref->data->confirm_switch); free_percpu(percpu_count); ref->percpu_count_ptr = __PERCPU_REF_ATOMIC_DEAD; } } /** * percpu_ref_exit - undo percpu_ref_init() * @ref: percpu_ref to exit * * This function exits @ref. The caller is responsible for ensuring that * @ref is no longer in active use. The usual places to invoke this * function from are the @ref->release() callback or in init failure path * where percpu_ref_init() succeeded but other parts of the initialization * of the embedding object failed. */ void percpu_ref_exit(struct percpu_ref *ref) { struct percpu_ref_data *data = ref->data; unsigned long flags; __percpu_ref_exit(ref); if (!data) return; spin_lock_irqsave(&percpu_ref_switch_lock, flags); ref->percpu_count_ptr |= atomic_long_read(&ref->data->count) << __PERCPU_REF_FLAG_BITS; ref->data = NULL; spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); kfree(data); } EXPORT_SYMBOL_GPL(percpu_ref_exit); static void percpu_ref_call_confirm_rcu(struct rcu_head *rcu) { struct percpu_ref_data *data = container_of(rcu, struct percpu_ref_data, rcu); struct percpu_ref *ref = data->ref; data->confirm_switch(ref); data->confirm_switch = NULL; wake_up_all(&percpu_ref_switch_waitq); if (!data->allow_reinit) __percpu_ref_exit(ref); /* drop ref from percpu_ref_switch_to_atomic() */ percpu_ref_put(ref); } static void percpu_ref_switch_to_atomic_rcu(struct rcu_head *rcu) { struct percpu_ref_data *data = container_of(rcu, struct percpu_ref_data, rcu); struct percpu_ref *ref = data->ref; unsigned long __percpu *percpu_count = percpu_count_ptr(ref); static atomic_t underflows; unsigned long count = 0; int cpu; for_each_possible_cpu(cpu) count += *per_cpu_ptr(percpu_count, cpu); pr_debug("global %lu percpu %lu\n", atomic_long_read(&data->count), count); /* * It's crucial that we sum the percpu counters _before_ adding the sum * to &ref->count; since gets could be happening on one cpu while puts * happen on another, adding a single cpu's count could cause * @ref->count to hit 0 before we've got a consistent value - but the * sum of all the counts will be consistent and correct. * * Subtracting the bias value then has to happen _after_ adding count to * &ref->count; we need the bias value to prevent &ref->count from * reaching 0 before we add the percpu counts. But doing it at the same * time is equivalent and saves us atomic operations: */ atomic_long_add((long)count - PERCPU_COUNT_BIAS, &data->count); if (WARN_ONCE(atomic_long_read(&data->count) <= 0, "percpu ref (%ps) <= 0 (%ld) after switching to atomic", data->release, atomic_long_read(&data->count)) && atomic_inc_return(&underflows) < 4) { pr_err("%s(): percpu_ref underflow", __func__); mem_dump_obj(data); } /* @ref is viewed as dead on all CPUs, send out switch confirmation */ percpu_ref_call_confirm_rcu(rcu); } static void percpu_ref_noop_confirm_switch(struct percpu_ref *ref) { } static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch) { if (ref->percpu_count_ptr & __PERCPU_REF_ATOMIC) { if (confirm_switch) confirm_switch(ref); return; } /* switching from percpu to atomic */ ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; /* * Non-NULL ->confirm_switch is used to indicate that switching is * in progress. Use noop one if unspecified. */ ref->data->confirm_switch = confirm_switch ?: percpu_ref_noop_confirm_switch; percpu_ref_get(ref); /* put after confirmation */ call_rcu_hurry(&ref->data->rcu, percpu_ref_switch_to_atomic_rcu); } static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) { unsigned long __percpu *percpu_count = percpu_count_ptr(ref); int cpu; BUG_ON(!percpu_count); if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) return; if (WARN_ON_ONCE(!ref->data->allow_reinit)) return; atomic_long_add(PERCPU_COUNT_BIAS, &ref->data->count); /* * Restore per-cpu operation. smp_store_release() is paired * with READ_ONCE() in __ref_is_percpu() and guarantees that the * zeroing is visible to all percpu accesses which can see the * following __PERCPU_REF_ATOMIC clearing. */ for_each_possible_cpu(cpu) *per_cpu_ptr(percpu_count, cpu) = 0; smp_store_release(&ref->percpu_count_ptr, ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC); } static void __percpu_ref_switch_mode(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch) { struct percpu_ref_data *data = ref->data; lockdep_assert_held(&percpu_ref_switch_lock); /* * If the previous ATOMIC switching hasn't finished yet, wait for * its completion. If the caller ensures that ATOMIC switching * isn't in progress, this function can be called from any context. */ wait_event_lock_irq(percpu_ref_switch_waitq, !data->confirm_switch, percpu_ref_switch_lock); if (data->force_atomic || percpu_ref_is_dying(ref)) __percpu_ref_switch_to_atomic(ref, confirm_switch); else __percpu_ref_switch_to_percpu(ref); } /** * percpu_ref_switch_to_atomic - switch a percpu_ref to atomic mode * @ref: percpu_ref to switch to atomic mode * @confirm_switch: optional confirmation callback * * There's no reason to use this function for the usual reference counting. * Use percpu_ref_kill[_and_confirm](). * * Schedule switching of @ref to atomic mode. All its percpu counts will * be collected to the main atomic counter. On completion, when all CPUs * are guaraneed to be in atomic mode, @confirm_switch, which may not * block, is invoked. This function may be invoked concurrently with all * the get/put operations and can safely be mixed with kill and reinit * operations. Note that @ref will stay in atomic mode across kill/reinit * cycles until percpu_ref_switch_to_percpu() is called. * * This function may block if @ref is in the process of switching to atomic * mode. If the caller ensures that @ref is not in the process of * switching to atomic mode, this function can be called from any context. */ void percpu_ref_switch_to_atomic(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch) { unsigned long flags; spin_lock_irqsave(&percpu_ref_switch_lock, flags); ref->data->force_atomic = true; __percpu_ref_switch_mode(ref, confirm_switch); spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic); /** * percpu_ref_switch_to_atomic_sync - switch a percpu_ref to atomic mode * @ref: percpu_ref to switch to atomic mode * * Schedule switching the ref to atomic mode, and wait for the * switch to complete. Caller must ensure that no other thread * will switch back to percpu mode. */ void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref) { percpu_ref_switch_to_atomic(ref, NULL); wait_event(percpu_ref_switch_waitq, !ref->data->confirm_switch); } EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic_sync); /** * percpu_ref_switch_to_percpu - switch a percpu_ref to percpu mode * @ref: percpu_ref to switch to percpu mode * * There's no reason to use this function for the usual reference counting. * To re-use an expired ref, use percpu_ref_reinit(). * * Switch @ref to percpu mode. This function may be invoked concurrently * with all the get/put operations and can safely be mixed with kill and * reinit operations. This function reverses the sticky atomic state set * by PERCPU_REF_INIT_ATOMIC or percpu_ref_switch_to_atomic(). If @ref is * dying or dead, the actual switching takes place on the following * percpu_ref_reinit(). * * This function may block if @ref is in the process of switching to atomic * mode. If the caller ensures that @ref is not in the process of * switching to atomic mode, this function can be called from any context. */ void percpu_ref_switch_to_percpu(struct percpu_ref *ref) { unsigned long flags; spin_lock_irqsave(&percpu_ref_switch_lock, flags); ref->data->force_atomic = false; __percpu_ref_switch_mode(ref, NULL); spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } EXPORT_SYMBOL_GPL(percpu_ref_switch_to_percpu); /** * percpu_ref_kill_and_confirm - drop the initial ref and schedule confirmation * @ref: percpu_ref to kill * @confirm_kill: optional confirmation callback * * Equivalent to percpu_ref_kill() but also schedules kill confirmation if * @confirm_kill is not NULL. @confirm_kill, which may not block, will be * called after @ref is seen as dead from all CPUs at which point all * further invocations of percpu_ref_tryget_live() will fail. See * percpu_ref_tryget_live() for details. * * This function normally doesn't block and can be called from any context * but it may block if @confirm_kill is specified and @ref is in the * process of switching to atomic mode by percpu_ref_switch_to_atomic(). * * There are no implied RCU grace periods between kill and release. */ void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill) { unsigned long flags; spin_lock_irqsave(&percpu_ref_switch_lock, flags); WARN_ONCE(percpu_ref_is_dying(ref), "%s called more than once on %ps!", __func__, ref->data->release); ref->percpu_count_ptr |= __PERCPU_REF_DEAD; __percpu_ref_switch_mode(ref, confirm_kill); percpu_ref_put(ref); spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); /** * percpu_ref_is_zero - test whether a percpu refcount reached zero * @ref: percpu_ref to test * * Returns %true if @ref reached zero. * * This function is safe to call as long as @ref is between init and exit. */ bool percpu_ref_is_zero(struct percpu_ref *ref) { unsigned long __percpu *percpu_count; unsigned long count, flags; if (__ref_is_percpu(ref, &percpu_count)) return false; /* protect us from being destroyed */ spin_lock_irqsave(&percpu_ref_switch_lock, flags); if (ref->data) count = atomic_long_read(&ref->data->count); else count = ref->percpu_count_ptr >> __PERCPU_REF_FLAG_BITS; spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); return count == 0; } EXPORT_SYMBOL_GPL(percpu_ref_is_zero); /** * percpu_ref_reinit - re-initialize a percpu refcount * @ref: perpcu_ref to re-initialize * * Re-initialize @ref so that it's in the same state as when it finished * percpu_ref_init() ignoring %PERCPU_REF_INIT_DEAD. @ref must have been * initialized successfully and reached 0 but not exited. * * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while * this function is in progress. */ void percpu_ref_reinit(struct percpu_ref *ref) { WARN_ON_ONCE(!percpu_ref_is_zero(ref)); percpu_ref_resurrect(ref); } EXPORT_SYMBOL_GPL(percpu_ref_reinit); /** * percpu_ref_resurrect - modify a percpu refcount from dead to live * @ref: perpcu_ref to resurrect * * Modify @ref so that it's in the same state as before percpu_ref_kill() was * called. @ref must be dead but must not yet have exited. * * If @ref->release() frees @ref then the caller is responsible for * guaranteeing that @ref->release() does not get called while this * function is in progress. * * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while * this function is in progress. */ void percpu_ref_resurrect(struct percpu_ref *ref) { unsigned long __percpu *percpu_count; unsigned long flags; spin_lock_irqsave(&percpu_ref_switch_lock, flags); WARN_ON_ONCE(!percpu_ref_is_dying(ref)); WARN_ON_ONCE(__ref_is_percpu(ref, &percpu_count)); ref->percpu_count_ptr &= ~__PERCPU_REF_DEAD; percpu_ref_get(ref); __percpu_ref_switch_mode(ref, NULL); spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } EXPORT_SYMBOL_GPL(percpu_ref_resurrect);
54 66 79 89 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM migrate #if !defined(_TRACE_MIGRATE_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_MIGRATE_H #include <linux/tracepoint.h> #define MIGRATE_MODE \ EM( MIGRATE_ASYNC, "MIGRATE_ASYNC") \ EM( MIGRATE_SYNC_LIGHT, "MIGRATE_SYNC_LIGHT") \ EMe(MIGRATE_SYNC, "MIGRATE_SYNC") #define MIGRATE_REASON \ EM( MR_COMPACTION, "compaction") \ EM( MR_MEMORY_FAILURE, "memory_failure") \ EM( MR_MEMORY_HOTPLUG, "memory_hotplug") \ EM( MR_SYSCALL, "syscall_or_cpuset") \ EM( MR_MEMPOLICY_MBIND, "mempolicy_mbind") \ EM( MR_NUMA_MISPLACED, "numa_misplaced") \ EM( MR_CONTIG_RANGE, "contig_range") \ EM( MR_LONGTERM_PIN, "longterm_pin") \ EMe(MR_DEMOTION, "demotion") /* * First define the enums in the above macros to be exported to userspace * via TRACE_DEFINE_ENUM(). */ #undef EM #undef EMe #define EM(a, b) TRACE_DEFINE_ENUM(a); #define EMe(a, b) TRACE_DEFINE_ENUM(a); MIGRATE_MODE MIGRATE_REASON /* * Now redefine the EM() and EMe() macros to map the enums to the strings * that will be printed in the output. */ #undef EM #undef EMe #define EM(a, b) {a, b}, #define EMe(a, b) {a, b} TRACE_EVENT(mm_migrate_pages, TP_PROTO(unsigned long succeeded, unsigned long failed, unsigned long thp_succeeded, unsigned long thp_failed, unsigned long thp_split, unsigned long large_folio_split, enum migrate_mode mode, int reason), TP_ARGS(succeeded, failed, thp_succeeded, thp_failed, thp_split, large_folio_split, mode, reason), TP_STRUCT__entry( __field( unsigned long, succeeded) __field( unsigned long, failed) __field( unsigned long, thp_succeeded) __field( unsigned long, thp_failed) __field( unsigned long, thp_split) __field( unsigned long, large_folio_split) __field( enum migrate_mode, mode) __field( int, reason) ), TP_fast_assign( __entry->succeeded = succeeded; __entry->failed = failed; __entry->thp_succeeded = thp_succeeded; __entry->thp_failed = thp_failed; __entry->thp_split = thp_split; __entry->large_folio_split = large_folio_split; __entry->mode = mode; __entry->reason = reason; ), TP_printk("nr_succeeded=%lu nr_failed=%lu nr_thp_succeeded=%lu nr_thp_failed=%lu nr_thp_split=%lu nr_split=%lu mode=%s reason=%s", __entry->succeeded, __entry->failed, __entry->thp_succeeded, __entry->thp_failed, __entry->thp_split, __entry->large_folio_split, __print_symbolic(__entry->mode, MIGRATE_MODE), __print_symbolic(__entry->reason, MIGRATE_REASON)) ); TRACE_EVENT(mm_migrate_pages_start, TP_PROTO(enum migrate_mode mode, int reason), TP_ARGS(mode, reason), TP_STRUCT__entry( __field(enum migrate_mode, mode) __field(int, reason) ), TP_fast_assign( __entry->mode = mode; __entry->reason = reason; ), TP_printk("mode=%s reason=%s", __print_symbolic(__entry->mode, MIGRATE_MODE), __print_symbolic(__entry->reason, MIGRATE_REASON)) ); DECLARE_EVENT_CLASS(migration_pte, TP_PROTO(unsigned long addr, unsigned long pte, int order), TP_ARGS(addr, pte, order), TP_STRUCT__entry( __field(unsigned long, addr) __field(unsigned long, pte) __field(int, order) ), TP_fast_assign( __entry->addr = addr; __entry->pte = pte; __entry->order = order; ), TP_printk("addr=%lx, pte=%lx order=%d", __entry->addr, __entry->pte, __entry->order) ); DEFINE_EVENT(migration_pte, set_migration_pte, TP_PROTO(unsigned long addr, unsigned long pte, int order), TP_ARGS(addr, pte, order) ); DEFINE_EVENT(migration_pte, remove_migration_pte, TP_PROTO(unsigned long addr, unsigned long pte, int order), TP_ARGS(addr, pte, order) ); #endif /* _TRACE_MIGRATE_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
27 3 9 18 1 13 13 3 3 8 1 1 2 2 2 7 5 1 1 15 15 25 26 26 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 /* * Resizable simple ram filesystem for Linux. * * Copyright (C) 2000 Linus Torvalds. * 2000 Transmeta Corp. * * Usage limits added by David Gibson, Linuxcare Australia. * This file is released under the GPL. */ /* * NOTE! This filesystem is probably most useful * not as a real filesystem, but as an example of * how virtual filesystems can be written. * * It doesn't get much simpler than this. Consider * that this file implements the full semantics of * a POSIX-compliant read-write filesystem. * * Note in particular how the filesystem does not * need to implement any data structures of its own * to keep track of the virtual data: using the VFS * caches is sufficient. */ #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/time.h> #include <linux/init.h> #include <linux/string.h> #include <linux/backing-dev.h> #include <linux/ramfs.h> #include <linux/sched.h> #include <linux/parser.h> #include <linux/magic.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/seq_file.h> #include "internal.h" struct ramfs_mount_opts { umode_t mode; }; struct ramfs_fs_info { struct ramfs_mount_opts mount_opts; }; #define RAMFS_DEFAULT_MODE 0755 static const struct super_operations ramfs_ops; static const struct inode_operations ramfs_dir_inode_operations; struct inode *ramfs_get_inode(struct super_block *sb, const struct inode *dir, umode_t mode, dev_t dev) { struct inode * inode = new_inode(sb); if (inode) { inode->i_ino = get_next_ino(); inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_mapping->a_ops = &ram_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); mapping_set_unevictable(inode->i_mapping); simple_inode_init_ts(inode); switch (mode & S_IFMT) { default: init_special_inode(inode, mode, dev); break; case S_IFREG: inode->i_op = &ramfs_file_inode_operations; inode->i_fop = &ramfs_file_operations; break; case S_IFDIR: inode->i_op = &ramfs_dir_inode_operations; inode->i_fop = &simple_dir_operations; /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); break; case S_IFLNK: inode->i_op = &page_symlink_inode_operations; inode_nohighmem(inode); break; } } return inode; } /* * File creation. Allocate an inode, and we're done.. */ /* SMP-safe */ static int ramfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { struct inode * inode = ramfs_get_inode(dir->i_sb, dir, mode, dev); int error = -ENOSPC; if (inode) { error = security_inode_init_security(inode, dir, &dentry->d_name, NULL, NULL); if (error) { iput(inode); goto out; } d_instantiate(dentry, inode); dget(dentry); /* Extra count - pin the dentry in core */ error = 0; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); } out: return error; } static int ramfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { int retval = ramfs_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0); if (!retval) inc_nlink(dir); return retval; } static int ramfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return ramfs_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFREG, 0); } static int ramfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct inode *inode; int error = -ENOSPC; inode = ramfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0); if (inode) { int l = strlen(symname)+1; error = security_inode_init_security(inode, dir, &dentry->d_name, NULL, NULL); if (error) { iput(inode); goto out; } error = page_symlink(inode, symname, l); if (!error) { d_instantiate(dentry, inode); dget(dentry); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); } else iput(inode); } out: return error; } static int ramfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct inode *inode; int error; inode = ramfs_get_inode(dir->i_sb, dir, mode, 0); if (!inode) return -ENOSPC; error = security_inode_init_security(inode, dir, &file_dentry(file)->d_name, NULL, NULL); if (error) { iput(inode); goto out; } d_tmpfile(file, inode); out: return finish_open_simple(file, error); } static const struct inode_operations ramfs_dir_inode_operations = { .create = ramfs_create, .lookup = simple_lookup, .link = simple_link, .unlink = simple_unlink, .symlink = ramfs_symlink, .mkdir = ramfs_mkdir, .rmdir = simple_rmdir, .mknod = ramfs_mknod, .rename = simple_rename, .tmpfile = ramfs_tmpfile, }; /* * Display the mount options in /proc/mounts. */ static int ramfs_show_options(struct seq_file *m, struct dentry *root) { struct ramfs_fs_info *fsi = root->d_sb->s_fs_info; if (fsi->mount_opts.mode != RAMFS_DEFAULT_MODE) seq_printf(m, ",mode=%o", fsi->mount_opts.mode); return 0; } static const struct super_operations ramfs_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, .show_options = ramfs_show_options, }; enum ramfs_param { Opt_mode, }; const struct fs_parameter_spec ramfs_fs_parameters[] = { fsparam_u32oct("mode", Opt_mode), {} }; static int ramfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct fs_parse_result result; struct ramfs_fs_info *fsi = fc->s_fs_info; int opt; opt = fs_parse(fc, ramfs_fs_parameters, param, &result); if (opt == -ENOPARAM) { opt = vfs_parse_fs_param_source(fc, param); if (opt != -ENOPARAM) return opt; /* * We might like to report bad mount options here; * but traditionally ramfs has ignored all mount options, * and as it is used as a !CONFIG_SHMEM simple substitute * for tmpfs, better continue to ignore other mount options. */ return 0; } if (opt < 0) return opt; switch (opt) { case Opt_mode: fsi->mount_opts.mode = result.uint_32 & S_IALLUGO; break; } return 0; } static int ramfs_fill_super(struct super_block *sb, struct fs_context *fc) { struct ramfs_fs_info *fsi = sb->s_fs_info; struct inode *inode; sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = RAMFS_MAGIC; sb->s_op = &ramfs_ops; sb->s_time_gran = 1; inode = ramfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0); sb->s_root = d_make_root(inode); if (!sb->s_root) return -ENOMEM; return 0; } static int ramfs_get_tree(struct fs_context *fc) { return get_tree_nodev(fc, ramfs_fill_super); } static void ramfs_free_fc(struct fs_context *fc) { kfree(fc->s_fs_info); } static const struct fs_context_operations ramfs_context_ops = { .free = ramfs_free_fc, .parse_param = ramfs_parse_param, .get_tree = ramfs_get_tree, }; int ramfs_init_fs_context(struct fs_context *fc) { struct ramfs_fs_info *fsi; fsi = kzalloc(sizeof(*fsi), GFP_KERNEL); if (!fsi) return -ENOMEM; fsi->mount_opts.mode = RAMFS_DEFAULT_MODE; fc->s_fs_info = fsi; fc->ops = &ramfs_context_ops; return 0; } void ramfs_kill_sb(struct super_block *sb) { kfree(sb->s_fs_info); kill_litter_super(sb); } static struct file_system_type ramfs_fs_type = { .name = "ramfs", .init_fs_context = ramfs_init_fs_context, .parameters = ramfs_fs_parameters, .kill_sb = ramfs_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; static int __init init_ramfs_fs(void) { return register_filesystem(&ramfs_fs_type); } fs_initcall(init_ramfs_fs);
26 26 16 25 1 28 2 25 1 6 19 19 19 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 // SPDX-License-Identifier: GPL-2.0-or-later /* * lib/textsearch.c Generic text search interface * * Authors: Thomas Graf <tgraf@suug.ch> * Pablo Neira Ayuso <pablo@netfilter.org> * * ========================================================================== */ /** * DOC: ts_intro * INTRODUCTION * * The textsearch infrastructure provides text searching facilities for * both linear and non-linear data. Individual search algorithms are * implemented in modules and chosen by the user. * * ARCHITECTURE * * .. code-block:: none * * User * +----------------+ * | finish()|<--------------(6)-----------------+ * |get_next_block()|<--------------(5)---------------+ | * | | Algorithm | | * | | +------------------------------+ * | | | init() find() destroy() | * | | +------------------------------+ * | | Core API ^ ^ ^ * | | +---------------+ (2) (4) (8) * | (1)|----->| prepare() |---+ | | * | (3)|----->| find()/next() |-----------+ | * | (7)|----->| destroy() |----------------------+ * +----------------+ +---------------+ * * (1) User configures a search by calling textsearch_prepare() specifying * the search parameters such as the pattern and algorithm name. * (2) Core requests the algorithm to allocate and initialize a search * configuration according to the specified parameters. * (3) User starts the search(es) by calling textsearch_find() or * textsearch_next() to fetch subsequent occurrences. A state variable * is provided to the algorithm to store persistent variables. * (4) Core eventually resets the search offset and forwards the find() * request to the algorithm. * (5) Algorithm calls get_next_block() provided by the user continuously * to fetch the data to be searched in block by block. * (6) Algorithm invokes finish() after the last call to get_next_block * to clean up any leftovers from get_next_block. (Optional) * (7) User destroys the configuration by calling textsearch_destroy(). * (8) Core notifies the algorithm to destroy algorithm specific * allocations. (Optional) * * USAGE * * Before a search can be performed, a configuration must be created * by calling textsearch_prepare() specifying the searching algorithm, * the pattern to look for and flags. As a flag, you can set TS_IGNORECASE * to perform case insensitive matching. But it might slow down * performance of algorithm, so you should use it at own your risk. * The returned configuration may then be used for an arbitrary * amount of times and even in parallel as long as a separate struct * ts_state variable is provided to every instance. * * The actual search is performed by either calling * textsearch_find_continuous() for linear data or by providing * an own get_next_block() implementation and * calling textsearch_find(). Both functions return * the position of the first occurrence of the pattern or UINT_MAX if * no match was found. Subsequent occurrences can be found by calling * textsearch_next() regardless of the linearity of the data. * * Once you're done using a configuration it must be given back via * textsearch_destroy. * * EXAMPLE:: * * int pos; * struct ts_config *conf; * struct ts_state state; * const char *pattern = "chicken"; * const char *example = "We dance the funky chicken"; * * conf = textsearch_prepare("kmp", pattern, strlen(pattern), * GFP_KERNEL, TS_AUTOLOAD); * if (IS_ERR(conf)) { * err = PTR_ERR(conf); * goto errout; * } * * pos = textsearch_find_continuous(conf, &state, example, strlen(example)); * if (pos != UINT_MAX) * panic("Oh my god, dancing chickens at %d\n", pos); * * textsearch_destroy(conf); */ /* ========================================================================== */ #include <linux/module.h> #include <linux/types.h> #include <linux/string.h> #include <linux/init.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/err.h> #include <linux/textsearch.h> #include <linux/slab.h> static LIST_HEAD(ts_ops); static DEFINE_SPINLOCK(ts_mod_lock); static inline struct ts_ops *lookup_ts_algo(const char *name) { struct ts_ops *o; rcu_read_lock(); list_for_each_entry_rcu(o, &ts_ops, list) { if (!strcmp(name, o->name)) { if (!try_module_get(o->owner)) o = NULL; rcu_read_unlock(); return o; } } rcu_read_unlock(); return NULL; } /** * textsearch_register - register a textsearch module * @ops: operations lookup table * * This function must be called by textsearch modules to announce * their presence. The specified &@ops must have %name set to a * unique identifier and the callbacks find(), init(), get_pattern(), * and get_pattern_len() must be implemented. * * Returns 0 or -EEXISTS if another module has already registered * with same name. */ int textsearch_register(struct ts_ops *ops) { int err = -EEXIST; struct ts_ops *o; if (ops->name == NULL || ops->find == NULL || ops->init == NULL || ops->get_pattern == NULL || ops->get_pattern_len == NULL) return -EINVAL; spin_lock(&ts_mod_lock); list_for_each_entry(o, &ts_ops, list) { if (!strcmp(ops->name, o->name)) goto errout; } list_add_tail_rcu(&ops->list, &ts_ops); err = 0; errout: spin_unlock(&ts_mod_lock); return err; } EXPORT_SYMBOL(textsearch_register); /** * textsearch_unregister - unregister a textsearch module * @ops: operations lookup table * * This function must be called by textsearch modules to announce * their disappearance for examples when the module gets unloaded. * The &ops parameter must be the same as the one during the * registration. * * Returns 0 on success or -ENOENT if no matching textsearch * registration was found. */ int textsearch_unregister(struct ts_ops *ops) { int err = 0; struct ts_ops *o; spin_lock(&ts_mod_lock); list_for_each_entry(o, &ts_ops, list) { if (o == ops) { list_del_rcu(&o->list); goto out; } } err = -ENOENT; out: spin_unlock(&ts_mod_lock); return err; } EXPORT_SYMBOL(textsearch_unregister); struct ts_linear_state { unsigned int len; const void *data; }; static unsigned int get_linear_data(unsigned int consumed, const u8 **dst, struct ts_config *conf, struct ts_state *state) { struct ts_linear_state *st = (struct ts_linear_state *) state->cb; if (likely(consumed < st->len)) { *dst = st->data + consumed; return st->len - consumed; } return 0; } /** * textsearch_find_continuous - search a pattern in continuous/linear data * @conf: search configuration * @state: search state * @data: data to search in * @len: length of data * * A simplified version of textsearch_find() for continuous/linear data. * Call textsearch_next() to retrieve subsequent matches. * * Returns the position of first occurrence of the pattern or * %UINT_MAX if no occurrence was found. */ unsigned int textsearch_find_continuous(struct ts_config *conf, struct ts_state *state, const void *data, unsigned int len) { struct ts_linear_state *st = (struct ts_linear_state *) state->cb; conf->get_next_block = get_linear_data; st->data = data; st->len = len; return textsearch_find(conf, state); } EXPORT_SYMBOL(textsearch_find_continuous); /** * textsearch_prepare - Prepare a search * @algo: name of search algorithm * @pattern: pattern data * @len: length of pattern * @gfp_mask: allocation mask * @flags: search flags * * Looks up the search algorithm module and creates a new textsearch * configuration for the specified pattern. * * Note: The format of the pattern may not be compatible between * the various search algorithms. * * Returns a new textsearch configuration according to the specified * parameters or a ERR_PTR(). If a zero length pattern is passed, this * function returns EINVAL. */ struct ts_config *textsearch_prepare(const char *algo, const void *pattern, unsigned int len, gfp_t gfp_mask, int flags) { int err = -ENOENT; struct ts_config *conf; struct ts_ops *ops; if (len == 0) return ERR_PTR(-EINVAL); ops = lookup_ts_algo(algo); #ifdef CONFIG_MODULES /* * Why not always autoload you may ask. Some users are * in a situation where requesting a module may deadlock, * especially when the module is located on a NFS mount. */ if (ops == NULL && flags & TS_AUTOLOAD) { request_module("ts_%s", algo); ops = lookup_ts_algo(algo); } #endif if (ops == NULL) goto errout; conf = ops->init(pattern, len, gfp_mask, flags); if (IS_ERR(conf)) { err = PTR_ERR(conf); goto errout; } conf->ops = ops; return conf; errout: if (ops) module_put(ops->owner); return ERR_PTR(err); } EXPORT_SYMBOL(textsearch_prepare); /** * textsearch_destroy - destroy a search configuration * @conf: search configuration * * Releases all references of the configuration and frees * up the memory. */ void textsearch_destroy(struct ts_config *conf) { if (conf->ops) { if (conf->ops->destroy) conf->ops->destroy(conf); module_put(conf->ops->owner); } kfree(conf); } EXPORT_SYMBOL(textsearch_destroy);
29 3 57 2 2 2 2 2 77 51 335 286 165 124 201 202 1011 1190 1191 16 550 664 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the IP router. * * Version: @(#)route.h 1.0.4 05/27/93 * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Fixes: * Alan Cox : Reformatted. Added ip_rt_local() * Alan Cox : Support for TCP parameters. * Alexey Kuznetsov: Major changes for new routing code. * Mike McLagan : Routing by source * Robert Olsson : Added rt_cache statistics */ #ifndef _ROUTE_H #define _ROUTE_H #include <net/dst.h> #include <net/inetpeer.h> #include <net/flow.h> #include <net/inet_sock.h> #include <net/ip_fib.h> #include <net/arp.h> #include <net/ndisc.h> #include <linux/in_route.h> #include <linux/rtnetlink.h> #include <linux/rcupdate.h> #include <linux/route.h> #include <linux/ip.h> #include <linux/cache.h> #include <linux/security.h> #define RTO_ONLINK 0x01 static inline __u8 ip_sock_rt_scope(const struct sock *sk) { if (sock_flag(sk, SOCK_LOCALROUTE)) return RT_SCOPE_LINK; return RT_SCOPE_UNIVERSE; } static inline __u8 ip_sock_rt_tos(const struct sock *sk) { return RT_TOS(READ_ONCE(inet_sk(sk)->tos)); } struct ip_tunnel_info; struct fib_nh; struct fib_info; struct uncached_list; struct rtable { struct dst_entry dst; int rt_genid; unsigned int rt_flags; __u16 rt_type; __u8 rt_is_input; __u8 rt_uses_gateway; int rt_iif; u8 rt_gw_family; /* Info on neighbour */ union { __be32 rt_gw4; struct in6_addr rt_gw6; }; /* Miscellaneous cached information */ u32 rt_mtu_locked:1, rt_pmtu:31; }; static inline bool rt_is_input_route(const struct rtable *rt) { return rt->rt_is_input != 0; } static inline bool rt_is_output_route(const struct rtable *rt) { return rt->rt_is_input == 0; } static inline __be32 rt_nexthop(const struct rtable *rt, __be32 daddr) { if (rt->rt_gw_family == AF_INET) return rt->rt_gw4; return daddr; } struct ip_rt_acct { __u32 o_bytes; __u32 o_packets; __u32 i_bytes; __u32 i_packets; }; struct rt_cache_stat { unsigned int in_slow_tot; unsigned int in_slow_mc; unsigned int in_no_route; unsigned int in_brd; unsigned int in_martian_dst; unsigned int in_martian_src; unsigned int out_slow_tot; unsigned int out_slow_mc; }; extern struct ip_rt_acct __percpu *ip_rt_acct; struct in_device; int ip_rt_init(void); void rt_cache_flush(struct net *net); void rt_flush_dev(struct net_device *dev); struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *flp, const struct sk_buff *skb); struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *flp, struct fib_result *res, const struct sk_buff *skb); static inline struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp) { return ip_route_output_key_hash(net, flp, NULL); } struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp, const struct sock *sk); struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig); static inline struct rtable *ip_route_output_key(struct net *net, struct flowi4 *flp) { return ip_route_output_flow(net, flp, NULL); } static inline struct rtable *ip_route_output(struct net *net, __be32 daddr, __be32 saddr, u8 tos, int oif) { struct flowi4 fl4 = { .flowi4_oif = oif, .flowi4_tos = tos, .daddr = daddr, .saddr = saddr, }; return ip_route_output_key(net, &fl4); } static inline struct rtable *ip_route_output_ports(struct net *net, struct flowi4 *fl4, const struct sock *sk, __be32 daddr, __be32 saddr, __be16 dport, __be16 sport, __u8 proto, __u8 tos, int oif) { flowi4_init_output(fl4, oif, sk ? READ_ONCE(sk->sk_mark) : 0, tos, sk ? ip_sock_rt_scope(sk) : RT_SCOPE_UNIVERSE, proto, sk ? inet_sk_flowi_flags(sk) : 0, daddr, saddr, dport, sport, sock_net_uid(net, sk)); if (sk) security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); return ip_route_output_flow(net, fl4, sk); } static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4 *fl4, __be32 daddr, __be32 saddr, __be32 gre_key, __u8 tos, int oif) { memset(fl4, 0, sizeof(*fl4)); fl4->flowi4_oif = oif; fl4->daddr = daddr; fl4->saddr = saddr; fl4->flowi4_tos = tos; fl4->flowi4_proto = IPPROTO_GRE; fl4->fl4_gre_key = gre_key; return ip_route_output_key(net, fl4); } int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev, struct in_device *in_dev, u32 *itag); int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src, u8 tos, struct net_device *devin); int ip_route_use_hint(struct sk_buff *skb, __be32 dst, __be32 src, u8 tos, struct net_device *devin, const struct sk_buff *hint); static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, u8 tos, struct net_device *devin) { int err; rcu_read_lock(); err = ip_route_input_noref(skb, dst, src, tos, devin); if (!err) { skb_dst_force(skb); if (!skb_dst(skb)) err = -EINVAL; } rcu_read_unlock(); return err; } void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, int oif, u8 protocol); void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu); void ipv4_redirect(struct sk_buff *skb, struct net *net, int oif, u8 protocol); void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk); void ip_rt_send_redirect(struct sk_buff *skb); unsigned int inet_addr_type(struct net *net, __be32 addr); unsigned int inet_addr_type_table(struct net *net, __be32 addr, u32 tb_id); unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr); unsigned int inet_addr_type_dev_table(struct net *net, const struct net_device *dev, __be32 addr); void ip_rt_multicast_event(struct in_device *); int ip_rt_ioctl(struct net *, unsigned int cmd, struct rtentry *rt); void ip_rt_get_source(u8 *src, struct sk_buff *skb, struct rtable *rt); struct rtable *rt_dst_alloc(struct net_device *dev, unsigned int flags, u16 type, bool noxfrm); struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt); struct in_ifaddr; void fib_add_ifaddr(struct in_ifaddr *); void fib_del_ifaddr(struct in_ifaddr *, struct in_ifaddr *); void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric); void rt_add_uncached_list(struct rtable *rt); void rt_del_uncached_list(struct rtable *rt); int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb, u32 table_id, struct fib_info *fi, int *fa_index, int fa_start, unsigned int flags); static inline void ip_rt_put(struct rtable *rt) { /* dst_release() accepts a NULL parameter. * We rely on dst being first structure in struct rtable */ BUILD_BUG_ON(offsetof(struct rtable, dst) != 0); dst_release(&rt->dst); } #define IPTOS_RT_MASK (IPTOS_TOS_MASK & ~3) extern const __u8 ip_tos2prio[16]; static inline char rt_tos2priority(u8 tos) { return ip_tos2prio[IPTOS_TOS(tos)>>1]; } /* ip_route_connect() and ip_route_newports() work in tandem whilst * binding a socket for a new outgoing connection. * * In order to use IPSEC properly, we must, in the end, have a * route that was looked up using all available keys including source * and destination ports. * * However, if a source port needs to be allocated (the user specified * a wildcard source port) we need to obtain addressing information * in order to perform that allocation. * * So ip_route_connect() looks up a route using wildcarded source and * destination ports in the key, simply so that we can get a pair of * addresses to use for port allocation. * * Later, once the ports are allocated, ip_route_newports() will make * another route lookup if needed to make sure we catch any IPSEC * rules keyed on the port information. * * The callers allocate the flow key on their stack, and must pass in * the same flowi4 object to both the ip_route_connect() and the * ip_route_newports() calls. */ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32 src, int oif, u8 protocol, __be16 sport, __be16 dport, const struct sock *sk) { __u8 flow_flags = 0; if (inet_test_bit(TRANSPARENT, sk)) flow_flags |= FLOWI_FLAG_ANYSRC; flowi4_init_output(fl4, oif, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), protocol, flow_flags, dst, src, dport, sport, sk->sk_uid); } static inline struct rtable *ip_route_connect(struct flowi4 *fl4, __be32 dst, __be32 src, int oif, u8 protocol, __be16 sport, __be16 dport, const struct sock *sk) { struct net *net = sock_net(sk); struct rtable *rt; ip_route_connect_init(fl4, dst, src, oif, protocol, sport, dport, sk); if (!dst || !src) { rt = __ip_route_output_key(net, fl4); if (IS_ERR(rt)) return rt; ip_rt_put(rt); flowi4_update_output(fl4, oif, fl4->daddr, fl4->saddr); } security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); return ip_route_output_flow(net, fl4, sk); } static inline struct rtable *ip_route_newports(struct flowi4 *fl4, struct rtable *rt, __be16 orig_sport, __be16 orig_dport, __be16 sport, __be16 dport, const struct sock *sk) { if (sport != orig_sport || dport != orig_dport) { fl4->fl4_dport = dport; fl4->fl4_sport = sport; ip_rt_put(rt); flowi4_update_output(fl4, sk->sk_bound_dev_if, fl4->daddr, fl4->saddr); security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); return ip_route_output_flow(sock_net(sk), fl4, sk); } return rt; } static inline int inet_iif(const struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); if (rt && rt->rt_iif) return rt->rt_iif; return skb->skb_iif; } static inline int ip4_dst_hoplimit(const struct dst_entry *dst) { int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT); struct net *net = dev_net(dst->dev); if (hoplimit == 0) hoplimit = READ_ONCE(net->ipv4.sysctl_ip_default_ttl); return hoplimit; } static inline struct neighbour *ip_neigh_gw4(struct net_device *dev, __be32 daddr) { struct neighbour *neigh; neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)daddr); if (unlikely(!neigh)) neigh = __neigh_create(&arp_tbl, &daddr, dev, false); return neigh; } static inline struct neighbour *ip_neigh_for_gw(struct rtable *rt, struct sk_buff *skb, bool *is_v6gw) { struct net_device *dev = rt->dst.dev; struct neighbour *neigh; if (likely(rt->rt_gw_family == AF_INET)) { neigh = ip_neigh_gw4(dev, rt->rt_gw4); } else if (rt->rt_gw_family == AF_INET6) { neigh = ip_neigh_gw6(dev, &rt->rt_gw6); *is_v6gw = true; } else { neigh = ip_neigh_gw4(dev, ip_hdr(skb)->daddr); } return neigh; } #endif /* _ROUTE_H */
11 11 11 11 11 11 11 11 11 11 11 573 1187 546 1186 1187 546 1337 1338 181 1338 1335 1338 1337 1335 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 // SPDX-License-Identifier: GPL-2.0-or-later /* * Virtio SCSI HBA driver * * Copyright IBM Corp. 2010 * Copyright Red Hat, Inc. 2011 * * Authors: * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> * Paolo Bonzini <pbonzini@redhat.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/slab.h> #include <linux/mempool.h> #include <linux/interrupt.h> #include <linux/virtio.h> #include <linux/virtio_ids.h> #include <linux/virtio_config.h> #include <linux/virtio_scsi.h> #include <linux/cpu.h> #include <linux/blkdev.h> #include <linux/blk-integrity.h> #include <scsi/scsi_host.h> #include <scsi/scsi_device.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_tcq.h> #include <scsi/scsi_devinfo.h> #include <linux/seqlock.h> #include <linux/blk-mq-virtio.h> #include "sd.h" #define VIRTIO_SCSI_MEMPOOL_SZ 64 #define VIRTIO_SCSI_EVENT_LEN 8 #define VIRTIO_SCSI_VQ_BASE 2 static unsigned int virtscsi_poll_queues; module_param(virtscsi_poll_queues, uint, 0644); MODULE_PARM_DESC(virtscsi_poll_queues, "The number of dedicated virtqueues for polling I/O"); /* Command queue element */ struct virtio_scsi_cmd { struct scsi_cmnd *sc; struct completion *comp; union { struct virtio_scsi_cmd_req cmd; struct virtio_scsi_cmd_req_pi cmd_pi; struct virtio_scsi_ctrl_tmf_req tmf; struct virtio_scsi_ctrl_an_req an; } req; union { struct virtio_scsi_cmd_resp cmd; struct virtio_scsi_ctrl_tmf_resp tmf; struct virtio_scsi_ctrl_an_resp an; struct virtio_scsi_event evt; } resp; } ____cacheline_aligned_in_smp; struct virtio_scsi_event_node { struct virtio_scsi *vscsi; struct virtio_scsi_event event; struct work_struct work; }; struct virtio_scsi_vq { /* Protects vq */ spinlock_t vq_lock; struct virtqueue *vq; }; /* Driver instance state */ struct virtio_scsi { struct virtio_device *vdev; /* Get some buffers ready for event vq */ struct virtio_scsi_event_node event_list[VIRTIO_SCSI_EVENT_LEN]; u32 num_queues; int io_queues[HCTX_MAX_TYPES]; struct hlist_node node; /* Protected by event_vq lock */ bool stop_events; struct virtio_scsi_vq ctrl_vq; struct virtio_scsi_vq event_vq; struct virtio_scsi_vq req_vqs[]; }; static struct kmem_cache *virtscsi_cmd_cache; static mempool_t *virtscsi_cmd_pool; static inline struct Scsi_Host *virtio_scsi_host(struct virtio_device *vdev) { return vdev->priv; } static void virtscsi_compute_resid(struct scsi_cmnd *sc, u32 resid) { if (resid) scsi_set_resid(sc, min(resid, scsi_bufflen(sc))); } /* * virtscsi_complete_cmd - finish a scsi_cmd and invoke scsi_done * * Called with vq_lock held. */ static void virtscsi_complete_cmd(struct virtio_scsi *vscsi, void *buf) { struct virtio_scsi_cmd *cmd = buf; struct scsi_cmnd *sc = cmd->sc; struct virtio_scsi_cmd_resp *resp = &cmd->resp.cmd; dev_dbg(&sc->device->sdev_gendev, "cmd %p response %u status %#02x sense_len %u\n", sc, resp->response, resp->status, resp->sense_len); sc->result = resp->status; virtscsi_compute_resid(sc, virtio32_to_cpu(vscsi->vdev, resp->resid)); switch (resp->response) { case VIRTIO_SCSI_S_OK: set_host_byte(sc, DID_OK); break; case VIRTIO_SCSI_S_OVERRUN: set_host_byte(sc, DID_ERROR); break; case VIRTIO_SCSI_S_ABORTED: set_host_byte(sc, DID_ABORT); break; case VIRTIO_SCSI_S_BAD_TARGET: set_host_byte(sc, DID_BAD_TARGET); break; case VIRTIO_SCSI_S_RESET: set_host_byte(sc, DID_RESET); break; case VIRTIO_SCSI_S_BUSY: set_host_byte(sc, DID_BUS_BUSY); break; case VIRTIO_SCSI_S_TRANSPORT_FAILURE: set_host_byte(sc, DID_TRANSPORT_DISRUPTED); break; case VIRTIO_SCSI_S_TARGET_FAILURE: set_host_byte(sc, DID_BAD_TARGET); break; case VIRTIO_SCSI_S_NEXUS_FAILURE: set_status_byte(sc, SAM_STAT_RESERVATION_CONFLICT); break; default: scmd_printk(KERN_WARNING, sc, "Unknown response %d", resp->response); fallthrough; case VIRTIO_SCSI_S_FAILURE: set_host_byte(sc, DID_ERROR); break; } WARN_ON(virtio32_to_cpu(vscsi->vdev, resp->sense_len) > VIRTIO_SCSI_SENSE_SIZE); if (resp->sense_len) { memcpy(sc->sense_buffer, resp->sense, min_t(u32, virtio32_to_cpu(vscsi->vdev, resp->sense_len), VIRTIO_SCSI_SENSE_SIZE)); } scsi_done(sc); } static void virtscsi_vq_done(struct virtio_scsi *vscsi, struct virtio_scsi_vq *virtscsi_vq, void (*fn)(struct virtio_scsi *vscsi, void *buf)) { void *buf; unsigned int len; unsigned long flags; struct virtqueue *vq = virtscsi_vq->vq; spin_lock_irqsave(&virtscsi_vq->vq_lock, flags); do { virtqueue_disable_cb(vq); while ((buf = virtqueue_get_buf(vq, &len)) != NULL) fn(vscsi, buf); } while (!virtqueue_enable_cb(vq)); spin_unlock_irqrestore(&virtscsi_vq->vq_lock, flags); } static void virtscsi_req_done(struct virtqueue *vq) { struct Scsi_Host *sh = virtio_scsi_host(vq->vdev); struct virtio_scsi *vscsi = shost_priv(sh); int index = vq->index - VIRTIO_SCSI_VQ_BASE; struct virtio_scsi_vq *req_vq = &vscsi->req_vqs[index]; virtscsi_vq_done(vscsi, req_vq, virtscsi_complete_cmd); }; static void virtscsi_poll_requests(struct virtio_scsi *vscsi) { int i, num_vqs; num_vqs = vscsi->num_queues; for (i = 0; i < num_vqs; i++) virtscsi_vq_done(vscsi, &vscsi->req_vqs[i], virtscsi_complete_cmd); } static void virtscsi_complete_free(struct virtio_scsi *vscsi, void *buf) { struct virtio_scsi_cmd *cmd = buf; if (cmd->comp) complete(cmd->comp); } static void virtscsi_ctrl_done(struct virtqueue *vq) { struct Scsi_Host *sh = virtio_scsi_host(vq->vdev); struct virtio_scsi *vscsi = shost_priv(sh); virtscsi_vq_done(vscsi, &vscsi->ctrl_vq, virtscsi_complete_free); }; static void virtscsi_handle_event(struct work_struct *work); static int virtscsi_kick_event(struct virtio_scsi *vscsi, struct virtio_scsi_event_node *event_node) { int err; struct scatterlist sg; unsigned long flags; INIT_WORK(&event_node->work, virtscsi_handle_event); sg_init_one(&sg, &event_node->event, sizeof(struct virtio_scsi_event)); spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags); err = virtqueue_add_inbuf(vscsi->event_vq.vq, &sg, 1, event_node, GFP_ATOMIC); if (!err) virtqueue_kick(vscsi->event_vq.vq); spin_unlock_irqrestore(&vscsi->event_vq.vq_lock, flags); return err; } static int virtscsi_kick_event_all(struct virtio_scsi *vscsi) { int i; for (i = 0; i < VIRTIO_SCSI_EVENT_LEN; i++) { vscsi->event_list[i].vscsi = vscsi; virtscsi_kick_event(vscsi, &vscsi->event_list[i]); } return 0; } static void virtscsi_cancel_event_work(struct virtio_scsi *vscsi) { int i; /* Stop scheduling work before calling cancel_work_sync. */ spin_lock_irq(&vscsi->event_vq.vq_lock); vscsi->stop_events = true; spin_unlock_irq(&vscsi->event_vq.vq_lock); for (i = 0; i < VIRTIO_SCSI_EVENT_LEN; i++) cancel_work_sync(&vscsi->event_list[i].work); } static void virtscsi_handle_transport_reset(struct virtio_scsi *vscsi, struct virtio_scsi_event *event) { struct scsi_device *sdev; struct Scsi_Host *shost = virtio_scsi_host(vscsi->vdev); unsigned int target = event->lun[1]; unsigned int lun = (event->lun[2] << 8) | event->lun[3]; switch (virtio32_to_cpu(vscsi->vdev, event->reason)) { case VIRTIO_SCSI_EVT_RESET_RESCAN: if (lun == 0) { scsi_scan_target(&shost->shost_gendev, 0, target, SCAN_WILD_CARD, SCSI_SCAN_INITIAL); } else { scsi_add_device(shost, 0, target, lun); } break; case VIRTIO_SCSI_EVT_RESET_REMOVED: sdev = scsi_device_lookup(shost, 0, target, lun); if (sdev) { scsi_remove_device(sdev); scsi_device_put(sdev); } else { pr_err("SCSI device %d 0 %d %d not found\n", shost->host_no, target, lun); } break; default: pr_info("Unsupported virtio scsi event reason %x\n", event->reason); } } static void virtscsi_handle_param_change(struct virtio_scsi *vscsi, struct virtio_scsi_event *event) { struct scsi_device *sdev; struct Scsi_Host *shost = virtio_scsi_host(vscsi->vdev); unsigned int target = event->lun[1]; unsigned int lun = (event->lun[2] << 8) | event->lun[3]; u8 asc = virtio32_to_cpu(vscsi->vdev, event->reason) & 255; u8 ascq = virtio32_to_cpu(vscsi->vdev, event->reason) >> 8; sdev = scsi_device_lookup(shost, 0, target, lun); if (!sdev) { pr_err("SCSI device %d 0 %d %d not found\n", shost->host_no, target, lun); return; } /* Handle "Parameters changed", "Mode parameters changed", and "Capacity data has changed". */ if (asc == 0x2a && (ascq == 0x00 || ascq == 0x01 || ascq == 0x09)) scsi_rescan_device(sdev); scsi_device_put(sdev); } static int virtscsi_rescan_hotunplug(struct virtio_scsi *vscsi) { struct scsi_device *sdev; struct Scsi_Host *shost = virtio_scsi_host(vscsi->vdev); unsigned char scsi_cmd[MAX_COMMAND_SIZE]; int result, inquiry_len, inq_result_len = 256; char *inq_result = kmalloc(inq_result_len, GFP_KERNEL); if (!inq_result) return -ENOMEM; shost_for_each_device(sdev, shost) { inquiry_len = sdev->inquiry_len ? sdev->inquiry_len : 36; memset(scsi_cmd, 0, sizeof(scsi_cmd)); scsi_cmd[0] = INQUIRY; scsi_cmd[4] = (unsigned char) inquiry_len; memset(inq_result, 0, inq_result_len); result = scsi_execute_cmd(sdev, scsi_cmd, REQ_OP_DRV_IN, inq_result, inquiry_len, SD_TIMEOUT, SD_MAX_RETRIES, NULL); if (result == 0 && inq_result[0] >> 5) { /* PQ indicates the LUN is not attached */ scsi_remove_device(sdev); } else if (result > 0 && host_byte(result) == DID_BAD_TARGET) { /* * If all LUNs of a virtio-scsi device are unplugged * it will respond with BAD TARGET on any INQUIRY * command. * Remove the device in this case as well. */ scsi_remove_device(sdev); } } kfree(inq_result); return 0; } static void virtscsi_handle_event(struct work_struct *work) { struct virtio_scsi_event_node *event_node = container_of(work, struct virtio_scsi_event_node, work); struct virtio_scsi *vscsi = event_node->vscsi; struct virtio_scsi_event *event = &event_node->event; if (event->event & cpu_to_virtio32(vscsi->vdev, VIRTIO_SCSI_T_EVENTS_MISSED)) { int ret; event->event &= ~cpu_to_virtio32(vscsi->vdev, VIRTIO_SCSI_T_EVENTS_MISSED); ret = virtscsi_rescan_hotunplug(vscsi); if (ret) return; scsi_scan_host(virtio_scsi_host(vscsi->vdev)); } switch (virtio32_to_cpu(vscsi->vdev, event->event)) { case VIRTIO_SCSI_T_NO_EVENT: break; case VIRTIO_SCSI_T_TRANSPORT_RESET: virtscsi_handle_transport_reset(vscsi, event); break; case VIRTIO_SCSI_T_PARAM_CHANGE: virtscsi_handle_param_change(vscsi, event); break; default: pr_err("Unsupported virtio scsi event %x\n", event->event); } virtscsi_kick_event(vscsi, event_node); } static void virtscsi_complete_event(struct virtio_scsi *vscsi, void *buf) { struct virtio_scsi_event_node *event_node = buf; if (!vscsi->stop_events) queue_work(system_freezable_wq, &event_node->work); } static void virtscsi_event_done(struct virtqueue *vq) { struct Scsi_Host *sh = virtio_scsi_host(vq->vdev); struct virtio_scsi *vscsi = shost_priv(sh); virtscsi_vq_done(vscsi, &vscsi->event_vq, virtscsi_complete_event); }; static int __virtscsi_add_cmd(struct virtqueue *vq, struct virtio_scsi_cmd *cmd, size_t req_size, size_t resp_size) { struct scsi_cmnd *sc = cmd->sc; struct scatterlist *sgs[6], req, resp; struct sg_table *out, *in; unsigned out_num = 0, in_num = 0; out = in = NULL; if (sc && sc->sc_data_direction != DMA_NONE) { if (sc->sc_data_direction != DMA_FROM_DEVICE) out = &sc->sdb.table; if (sc->sc_data_direction != DMA_TO_DEVICE) in = &sc->sdb.table; } /* Request header. */ sg_init_one(&req, &cmd->req, req_size); sgs[out_num++] = &req; /* Data-out buffer. */ if (out) { /* Place WRITE protection SGLs before Data OUT payload */ if (scsi_prot_sg_count(sc)) sgs[out_num++] = scsi_prot_sglist(sc); sgs[out_num++] = out->sgl; } /* Response header. */ sg_init_one(&resp, &cmd->resp, resp_size); sgs[out_num + in_num++] = &resp; /* Data-in buffer */ if (in) { /* Place READ protection SGLs before Data IN payload */ if (scsi_prot_sg_count(sc)) sgs[out_num + in_num++] = scsi_prot_sglist(sc); sgs[out_num + in_num++] = in->sgl; } return virtqueue_add_sgs(vq, sgs, out_num, in_num, cmd, GFP_ATOMIC); } static void virtscsi_kick_vq(struct virtio_scsi_vq *vq) { bool needs_kick; unsigned long flags; spin_lock_irqsave(&vq->vq_lock, flags); needs_kick = virtqueue_kick_prepare(vq->vq); spin_unlock_irqrestore(&vq->vq_lock, flags); if (needs_kick) virtqueue_notify(vq->vq); } /** * virtscsi_add_cmd - add a virtio_scsi_cmd to a virtqueue, optionally kick it * @vq : the struct virtqueue we're talking about * @cmd : command structure * @req_size : size of the request buffer * @resp_size : size of the response buffer * @kick : whether to kick the virtqueue immediately */ static int virtscsi_add_cmd(struct virtio_scsi_vq *vq, struct virtio_scsi_cmd *cmd, size_t req_size, size_t resp_size, bool kick) { unsigned long flags; int err; bool needs_kick = false; spin_lock_irqsave(&vq->vq_lock, flags); err = __virtscsi_add_cmd(vq->vq, cmd, req_size, resp_size); if (!err && kick) needs_kick = virtqueue_kick_prepare(vq->vq); spin_unlock_irqrestore(&vq->vq_lock, flags); if (needs_kick) virtqueue_notify(vq->vq); return err; } static void virtio_scsi_init_hdr(struct virtio_device *vdev, struct virtio_scsi_cmd_req *cmd, struct scsi_cmnd *sc) { cmd->lun[0] = 1; cmd->lun[1] = sc->device->id; cmd->lun[2] = (sc->device->lun >> 8) | 0x40; cmd->lun[3] = sc->device->lun & 0xff; cmd->tag = cpu_to_virtio64(vdev, (unsigned long)sc); cmd->task_attr = VIRTIO_SCSI_S_SIMPLE; cmd->prio = 0; cmd->crn = 0; } #ifdef CONFIG_BLK_DEV_INTEGRITY static void virtio_scsi_init_hdr_pi(struct virtio_device *vdev, struct virtio_scsi_cmd_req_pi *cmd_pi, struct scsi_cmnd *sc) { struct request *rq = scsi_cmd_to_rq(sc); struct blk_integrity *bi; virtio_scsi_init_hdr(vdev, (struct virtio_scsi_cmd_req *)cmd_pi, sc); if (!rq || !scsi_prot_sg_count(sc)) return; bi = blk_get_integrity(rq->q->disk); if (sc->sc_data_direction == DMA_TO_DEVICE) cmd_pi->pi_bytesout = cpu_to_virtio32(vdev, bio_integrity_bytes(bi, blk_rq_sectors(rq))); else if (sc->sc_data_direction == DMA_FROM_DEVICE) cmd_pi->pi_bytesin = cpu_to_virtio32(vdev, bio_integrity_bytes(bi, blk_rq_sectors(rq))); } #endif static struct virtio_scsi_vq *virtscsi_pick_vq_mq(struct virtio_scsi *vscsi, struct scsi_cmnd *sc) { u32 tag = blk_mq_unique_tag(scsi_cmd_to_rq(sc)); u16 hwq = blk_mq_unique_tag_to_hwq(tag); return &vscsi->req_vqs[hwq]; } static int virtscsi_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *sc) { struct virtio_scsi *vscsi = shost_priv(shost); struct virtio_scsi_vq *req_vq = virtscsi_pick_vq_mq(vscsi, sc); struct virtio_scsi_cmd *cmd = scsi_cmd_priv(sc); bool kick; unsigned long flags; int req_size; int ret; BUG_ON(scsi_sg_count(sc) > shost->sg_tablesize); /* TODO: check feature bit and fail if unsupported? */ BUG_ON(sc->sc_data_direction == DMA_BIDIRECTIONAL); dev_dbg(&sc->device->sdev_gendev, "cmd %p CDB: %#02x\n", sc, sc->cmnd[0]); cmd->sc = sc; BUG_ON(sc->cmd_len > VIRTIO_SCSI_CDB_SIZE); #ifdef CONFIG_BLK_DEV_INTEGRITY if (virtio_has_feature(vscsi->vdev, VIRTIO_SCSI_F_T10_PI)) { virtio_scsi_init_hdr_pi(vscsi->vdev, &cmd->req.cmd_pi, sc); memcpy(cmd->req.cmd_pi.cdb, sc->cmnd, sc->cmd_len); req_size = sizeof(cmd->req.cmd_pi); } else #endif { virtio_scsi_init_hdr(vscsi->vdev, &cmd->req.cmd, sc); memcpy(cmd->req.cmd.cdb, sc->cmnd, sc->cmd_len); req_size = sizeof(cmd->req.cmd); } kick = (sc->flags & SCMD_LAST) != 0; ret = virtscsi_add_cmd(req_vq, cmd, req_size, sizeof(cmd->resp.cmd), kick); if (ret == -EIO) { cmd->resp.cmd.response = VIRTIO_SCSI_S_BAD_TARGET; spin_lock_irqsave(&req_vq->vq_lock, flags); virtscsi_complete_cmd(vscsi, cmd); spin_unlock_irqrestore(&req_vq->vq_lock, flags); } else if (ret != 0) { return SCSI_MLQUEUE_HOST_BUSY; } return 0; } static int virtscsi_tmf(struct virtio_scsi *vscsi, struct virtio_scsi_cmd *cmd) { DECLARE_COMPLETION_ONSTACK(comp); int ret = FAILED; cmd->comp = &comp; if (virtscsi_add_cmd(&vscsi->ctrl_vq, cmd, sizeof cmd->req.tmf, sizeof cmd->resp.tmf, true) < 0) goto out; wait_for_completion(&comp); if (cmd->resp.tmf.response == VIRTIO_SCSI_S_OK || cmd->resp.tmf.response == VIRTIO_SCSI_S_FUNCTION_SUCCEEDED) ret = SUCCESS; /* * The spec guarantees that all requests related to the TMF have * been completed, but the callback might not have run yet if * we're using independent interrupts (e.g. MSI). Poll the * virtqueues once. * * In the abort case, scsi_done() will do nothing, because the * command timed out and hence SCMD_STATE_COMPLETE has been set. */ virtscsi_poll_requests(vscsi); out: mempool_free(cmd, virtscsi_cmd_pool); return ret; } static int virtscsi_device_reset(struct scsi_cmnd *sc) { struct virtio_scsi *vscsi = shost_priv(sc->device->host); struct virtio_scsi_cmd *cmd; sdev_printk(KERN_INFO, sc->device, "device reset\n"); cmd = mempool_alloc(virtscsi_cmd_pool, GFP_NOIO); if (!cmd) return FAILED; memset(cmd, 0, sizeof(*cmd)); cmd->req.tmf = (struct virtio_scsi_ctrl_tmf_req){ .type = VIRTIO_SCSI_T_TMF, .subtype = cpu_to_virtio32(vscsi->vdev, VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET), .lun[0] = 1, .lun[1] = sc->device->id, .lun[2] = (sc->device->lun >> 8) | 0x40, .lun[3] = sc->device->lun & 0xff, }; return virtscsi_tmf(vscsi, cmd); } static int virtscsi_device_alloc(struct scsi_device *sdevice) { /* * Passed through SCSI targets (e.g. with qemu's 'scsi-block') * may have transfer limits which come from the host SCSI * controller or something on the host side other than the * target itself. * * To make this work properly, the hypervisor can adjust the * target's VPD information to advertise these limits. But * for that to work, the guest has to look at the VPD pages, * which we won't do by default if it is an SPC-2 device, even * if it does actually support it. * * So, set the blist to always try to read the VPD pages. */ sdevice->sdev_bflags = BLIST_TRY_VPD_PAGES; return 0; } /** * virtscsi_change_queue_depth() - Change a virtscsi target's queue depth * @sdev: Virtscsi target whose queue depth to change * @qdepth: New queue depth */ static int virtscsi_change_queue_depth(struct scsi_device *sdev, int qdepth) { struct Scsi_Host *shost = sdev->host; int max_depth = shost->cmd_per_lun; return scsi_change_queue_depth(sdev, min(max_depth, qdepth)); } static int virtscsi_abort(struct scsi_cmnd *sc) { struct virtio_scsi *vscsi = shost_priv(sc->device->host); struct virtio_scsi_cmd *cmd; scmd_printk(KERN_INFO, sc, "abort\n"); cmd = mempool_alloc(virtscsi_cmd_pool, GFP_NOIO); if (!cmd) return FAILED; memset(cmd, 0, sizeof(*cmd)); cmd->req.tmf = (struct virtio_scsi_ctrl_tmf_req){ .type = VIRTIO_SCSI_T_TMF, .subtype = VIRTIO_SCSI_T_TMF_ABORT_TASK, .lun[0] = 1, .lun[1] = sc->device->id, .lun[2] = (sc->device->lun >> 8) | 0x40, .lun[3] = sc->device->lun & 0xff, .tag = cpu_to_virtio64(vscsi->vdev, (unsigned long)sc), }; return virtscsi_tmf(vscsi, cmd); } static void virtscsi_map_queues(struct Scsi_Host *shost) { struct virtio_scsi *vscsi = shost_priv(shost); int i, qoff; for (i = 0, qoff = 0; i < shost->nr_maps; i++) { struct blk_mq_queue_map *map = &shost->tag_set.map[i]; map->nr_queues = vscsi->io_queues[i]; map->queue_offset = qoff; qoff += map->nr_queues; if (map->nr_queues == 0) continue; /* * Regular queues have interrupts and hence CPU affinity is * defined by the core virtio code, but polling queues have * no interrupts so we let the block layer assign CPU affinity. */ if (i == HCTX_TYPE_POLL) blk_mq_map_queues(map); else blk_mq_virtio_map_queues(map, vscsi->vdev, 2); } } static int virtscsi_mq_poll(struct Scsi_Host *shost, unsigned int queue_num) { struct virtio_scsi *vscsi = shost_priv(shost); struct virtio_scsi_vq *virtscsi_vq = &vscsi->req_vqs[queue_num]; unsigned long flags; unsigned int len; int found = 0; void *buf; spin_lock_irqsave(&virtscsi_vq->vq_lock, flags); while ((buf = virtqueue_get_buf(virtscsi_vq->vq, &len)) != NULL) { virtscsi_complete_cmd(vscsi, buf); found++; } spin_unlock_irqrestore(&virtscsi_vq->vq_lock, flags); return found; } static void virtscsi_commit_rqs(struct Scsi_Host *shost, u16 hwq) { struct virtio_scsi *vscsi = shost_priv(shost); virtscsi_kick_vq(&vscsi->req_vqs[hwq]); } /* * The host guarantees to respond to each command, although I/O * latencies might be higher than on bare metal. Reset the timer * unconditionally to give the host a chance to perform EH. */ static enum scsi_timeout_action virtscsi_eh_timed_out(struct scsi_cmnd *scmnd) { return SCSI_EH_RESET_TIMER; } static const struct scsi_host_template virtscsi_host_template = { .module = THIS_MODULE, .name = "Virtio SCSI HBA", .proc_name = "virtio_scsi", .this_id = -1, .cmd_size = sizeof(struct virtio_scsi_cmd), .queuecommand = virtscsi_queuecommand, .mq_poll = virtscsi_mq_poll, .commit_rqs = virtscsi_commit_rqs, .change_queue_depth = virtscsi_change_queue_depth, .eh_abort_handler = virtscsi_abort, .eh_device_reset_handler = virtscsi_device_reset, .eh_timed_out = virtscsi_eh_timed_out, .slave_alloc = virtscsi_device_alloc, .dma_boundary = UINT_MAX, .map_queues = virtscsi_map_queues, .track_queue_depth = 1, }; #define virtscsi_config_get(vdev, fld) \ ({ \ __virtio_native_type(struct virtio_scsi_config, fld) __val; \ virtio_cread(vdev, struct virtio_scsi_config, fld, &__val); \ __val; \ }) #define virtscsi_config_set(vdev, fld, val) \ do { \ __virtio_native_type(struct virtio_scsi_config, fld) __val = (val); \ virtio_cwrite(vdev, struct virtio_scsi_config, fld, &__val); \ } while(0) static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq, struct virtqueue *vq) { spin_lock_init(&virtscsi_vq->vq_lock); virtscsi_vq->vq = vq; } static void virtscsi_remove_vqs(struct virtio_device *vdev) { /* Stop all the virtqueues. */ virtio_reset_device(vdev); vdev->config->del_vqs(vdev); } static int virtscsi_init(struct virtio_device *vdev, struct virtio_scsi *vscsi) { int err; u32 i; u32 num_vqs, num_poll_vqs, num_req_vqs; vq_callback_t **callbacks; const char **names; struct virtqueue **vqs; struct irq_affinity desc = { .pre_vectors = 2 }; num_req_vqs = vscsi->num_queues; num_vqs = num_req_vqs + VIRTIO_SCSI_VQ_BASE; vqs = kmalloc_array(num_vqs, sizeof(struct virtqueue *), GFP_KERNEL); callbacks = kmalloc_array(num_vqs, sizeof(vq_callback_t *), GFP_KERNEL); names = kmalloc_array(num_vqs, sizeof(char *), GFP_KERNEL); if (!callbacks || !vqs || !names) { err = -ENOMEM; goto out; } num_poll_vqs = min_t(unsigned int, virtscsi_poll_queues, num_req_vqs - 1); vscsi->io_queues[HCTX_TYPE_DEFAULT] = num_req_vqs - num_poll_vqs; vscsi->io_queues[HCTX_TYPE_READ] = 0; vscsi->io_queues[HCTX_TYPE_POLL] = num_poll_vqs; dev_info(&vdev->dev, "%d/%d/%d default/read/poll queues\n", vscsi->io_queues[HCTX_TYPE_DEFAULT], vscsi->io_queues[HCTX_TYPE_READ], vscsi->io_queues[HCTX_TYPE_POLL]); callbacks[0] = virtscsi_ctrl_done; callbacks[1] = virtscsi_event_done; names[0] = "control"; names[1] = "event"; for (i = VIRTIO_SCSI_VQ_BASE; i < num_vqs - num_poll_vqs; i++) { callbacks[i] = virtscsi_req_done; names[i] = "request"; } for (; i < num_vqs; i++) { callbacks[i] = NULL; names[i] = "request_poll"; } /* Discover virtqueues and write information to configuration. */ err = virtio_find_vqs(vdev, num_vqs, vqs, callbacks, names, &desc); if (err) goto out; virtscsi_init_vq(&vscsi->ctrl_vq, vqs[0]); virtscsi_init_vq(&vscsi->event_vq, vqs[1]); for (i = VIRTIO_SCSI_VQ_BASE; i < num_vqs; i++) virtscsi_init_vq(&vscsi->req_vqs[i - VIRTIO_SCSI_VQ_BASE], vqs[i]); virtscsi_config_set(vdev, cdb_size, VIRTIO_SCSI_CDB_SIZE); virtscsi_config_set(vdev, sense_size, VIRTIO_SCSI_SENSE_SIZE); err = 0; out: kfree(names); kfree(callbacks); kfree(vqs); if (err) virtscsi_remove_vqs(vdev); return err; } static int virtscsi_probe(struct virtio_device *vdev) { struct Scsi_Host *shost; struct virtio_scsi *vscsi; int err; u32 sg_elems, num_targets; u32 cmd_per_lun; u32 num_queues; if (!vdev->config->get) { dev_err(&vdev->dev, "%s failure: config access disabled\n", __func__); return -EINVAL; } /* We need to know how many queues before we allocate. */ num_queues = virtscsi_config_get(vdev, num_queues) ? : 1; num_queues = min_t(unsigned int, nr_cpu_ids, num_queues); num_targets = virtscsi_config_get(vdev, max_target) + 1; shost = scsi_host_alloc(&virtscsi_host_template, struct_size(vscsi, req_vqs, num_queues)); if (!shost) return -ENOMEM; sg_elems = virtscsi_config_get(vdev, seg_max) ?: 1; shost->sg_tablesize = sg_elems; shost->nr_maps = 1; vscsi = shost_priv(shost); vscsi->vdev = vdev; vscsi->num_queues = num_queues; vdev->priv = shost; err = virtscsi_init(vdev, vscsi); if (err) goto virtscsi_init_failed; if (vscsi->io_queues[HCTX_TYPE_POLL]) shost->nr_maps = HCTX_TYPE_POLL + 1; shost->can_queue = virtqueue_get_vring_size(vscsi->req_vqs[0].vq); cmd_per_lun = virtscsi_config_get(vdev, cmd_per_lun) ?: 1; shost->cmd_per_lun = min_t(u32, cmd_per_lun, shost->can_queue); shost->max_sectors = virtscsi_config_get(vdev, max_sectors) ?: 0xFFFF; /* LUNs > 256 are reported with format 1, so they go in the range * 16640-32767. */ shost->max_lun = virtscsi_config_get(vdev, max_lun) + 1 + 0x4000; shost->max_id = num_targets; shost->max_channel = 0; shost->max_cmd_len = VIRTIO_SCSI_CDB_SIZE; shost->nr_hw_queues = num_queues; #ifdef CONFIG_BLK_DEV_INTEGRITY if (virtio_has_feature(vdev, VIRTIO_SCSI_F_T10_PI)) { int host_prot; host_prot = SHOST_DIF_TYPE1_PROTECTION | SHOST_DIF_TYPE2_PROTECTION | SHOST_DIF_TYPE3_PROTECTION | SHOST_DIX_TYPE1_PROTECTION | SHOST_DIX_TYPE2_PROTECTION | SHOST_DIX_TYPE3_PROTECTION; scsi_host_set_prot(shost, host_prot); scsi_host_set_guard(shost, SHOST_DIX_GUARD_CRC); } #endif err = scsi_add_host(shost, &vdev->dev); if (err) goto scsi_add_host_failed; virtio_device_ready(vdev); if (virtio_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG)) virtscsi_kick_event_all(vscsi); scsi_scan_host(shost); return 0; scsi_add_host_failed: vdev->config->del_vqs(vdev); virtscsi_init_failed: scsi_host_put(shost); return err; } static void virtscsi_remove(struct virtio_device *vdev) { struct Scsi_Host *shost = virtio_scsi_host(vdev); struct virtio_scsi *vscsi = shost_priv(shost); if (virtio_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG)) virtscsi_cancel_event_work(vscsi); scsi_remove_host(shost); virtscsi_remove_vqs(vdev); scsi_host_put(shost); } #ifdef CONFIG_PM_SLEEP static int virtscsi_freeze(struct virtio_device *vdev) { virtscsi_remove_vqs(vdev); return 0; } static int virtscsi_restore(struct virtio_device *vdev) { struct Scsi_Host *sh = virtio_scsi_host(vdev); struct virtio_scsi *vscsi = shost_priv(sh); int err; err = virtscsi_init(vdev, vscsi); if (err) return err; virtio_device_ready(vdev); if (virtio_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG)) virtscsi_kick_event_all(vscsi); return err; } #endif static struct virtio_device_id id_table[] = { { VIRTIO_ID_SCSI, VIRTIO_DEV_ANY_ID }, { 0 }, }; static unsigned int features[] = { VIRTIO_SCSI_F_HOTPLUG, VIRTIO_SCSI_F_CHANGE, #ifdef CONFIG_BLK_DEV_INTEGRITY VIRTIO_SCSI_F_T10_PI, #endif }; static struct virtio_driver virtio_scsi_driver = { .feature_table = features, .feature_table_size = ARRAY_SIZE(features), .driver.name = KBUILD_MODNAME, .driver.owner = THIS_MODULE, .id_table = id_table, .probe = virtscsi_probe, #ifdef CONFIG_PM_SLEEP .freeze = virtscsi_freeze, .restore = virtscsi_restore, #endif .remove = virtscsi_remove, }; static int __init virtio_scsi_init(void) { int ret = -ENOMEM; virtscsi_cmd_cache = KMEM_CACHE(virtio_scsi_cmd, 0); if (!virtscsi_cmd_cache) { pr_err("kmem_cache_create() for virtscsi_cmd_cache failed\n"); goto error; } virtscsi_cmd_pool = mempool_create_slab_pool(VIRTIO_SCSI_MEMPOOL_SZ, virtscsi_cmd_cache); if (!virtscsi_cmd_pool) { pr_err("mempool_create() for virtscsi_cmd_pool failed\n"); goto error; } ret = register_virtio_driver(&virtio_scsi_driver); if (ret < 0) goto error; return 0; error: mempool_destroy(virtscsi_cmd_pool); virtscsi_cmd_pool = NULL; kmem_cache_destroy(virtscsi_cmd_cache); virtscsi_cmd_cache = NULL; return ret; } static void __exit virtio_scsi_fini(void) { unregister_virtio_driver(&virtio_scsi_driver); mempool_destroy(virtscsi_cmd_pool); kmem_cache_destroy(virtscsi_cmd_cache); } module_init(virtio_scsi_init); module_exit(virtio_scsi_fini); MODULE_DEVICE_TABLE(virtio, id_table); MODULE_DESCRIPTION("Virtio SCSI HBA driver"); MODULE_LICENSE("GPL");
50 69 26 2 76 45 10 35 76 76 74 74 51 36 36 32 32 93 65 102 64 16 3 11 16 55 18 22 55 55 51 23 23 23 22 23 56 35 70 22 99 99 23 99 54 54 3 53 26 52 69 17 23 22 2 42 56 1 54 11 39 71 55 1 140 119 8 69 1 1 106 94 3 69 8 55 104 83 46 89 49 23 23 94 2 24 6 21 21 3 1 4 1 57 57 119 119 54 32 85 49 120 37 107 107 42 34 34 42 42 1 25 25 25 1 76 6 49 74 5 73 15 7 4 1 1 2 1 1 1 1 2 1 1 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 /* * net/tipc/group.c: TIPC group messaging code * * Copyright (c) 2017, Ericsson AB * Copyright (c) 2020, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "addr.h" #include "group.h" #include "bcast.h" #include "topsrv.h" #include "msg.h" #include "socket.h" #include "node.h" #include "name_table.h" #include "subscr.h" #define ADV_UNIT (((MAX_MSG_SIZE + MAX_H_SIZE) / FLOWCTL_BLK_SZ) + 1) #define ADV_IDLE ADV_UNIT #define ADV_ACTIVE (ADV_UNIT * 12) enum mbr_state { MBR_JOINING, MBR_PUBLISHED, MBR_JOINED, MBR_PENDING, MBR_ACTIVE, MBR_RECLAIMING, MBR_REMITTED, MBR_LEAVING }; struct tipc_member { struct rb_node tree_node; struct list_head list; struct list_head small_win; struct sk_buff_head deferredq; struct tipc_group *group; u32 node; u32 port; u32 instance; enum mbr_state state; u16 advertised; u16 window; u16 bc_rcv_nxt; u16 bc_syncpt; u16 bc_acked; }; struct tipc_group { struct rb_root members; struct list_head small_win; struct list_head pending; struct list_head active; struct tipc_nlist dests; struct net *net; int subid; u32 type; u32 instance; u32 scope; u32 portid; u16 member_cnt; u16 active_cnt; u16 max_active; u16 bc_snd_nxt; u16 bc_ackers; bool *open; bool loopback; bool events; }; static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, int mtyp, struct sk_buff_head *xmitq); static void tipc_group_open(struct tipc_member *m, bool *wakeup) { *wakeup = false; if (list_empty(&m->small_win)) return; list_del_init(&m->small_win); *m->group->open = true; *wakeup = true; } static void tipc_group_decr_active(struct tipc_group *grp, struct tipc_member *m) { if (m->state == MBR_ACTIVE || m->state == MBR_RECLAIMING || m->state == MBR_REMITTED) grp->active_cnt--; } static int tipc_group_rcvbuf_limit(struct tipc_group *grp) { int max_active, active_pool, idle_pool; int mcnt = grp->member_cnt + 1; /* Limit simultaneous reception from other members */ max_active = min(mcnt / 8, 64); max_active = max(max_active, 16); grp->max_active = max_active; /* Reserve blocks for active and idle members */ active_pool = max_active * ADV_ACTIVE; idle_pool = (mcnt - max_active) * ADV_IDLE; /* Scale to bytes, considering worst-case truesize/msgsize ratio */ return (active_pool + idle_pool) * FLOWCTL_BLK_SZ * 4; } u16 tipc_group_bc_snd_nxt(struct tipc_group *grp) { return grp->bc_snd_nxt; } static bool tipc_group_is_receiver(struct tipc_member *m) { return m && m->state != MBR_JOINING && m->state != MBR_LEAVING; } static bool tipc_group_is_sender(struct tipc_member *m) { return m && m->state != MBR_JOINING && m->state != MBR_PUBLISHED; } u32 tipc_group_exclude(struct tipc_group *grp) { if (!grp->loopback) return grp->portid; return 0; } struct tipc_group *tipc_group_create(struct net *net, u32 portid, struct tipc_group_req *mreq, bool *group_is_open) { u32 filter = TIPC_SUB_PORTS | TIPC_SUB_NO_STATUS; bool global = mreq->scope != TIPC_NODE_SCOPE; struct tipc_group *grp; u32 type = mreq->type; grp = kzalloc(sizeof(*grp), GFP_ATOMIC); if (!grp) return NULL; tipc_nlist_init(&grp->dests, tipc_own_addr(net)); INIT_LIST_HEAD(&grp->small_win); INIT_LIST_HEAD(&grp->active); INIT_LIST_HEAD(&grp->pending); grp->members = RB_ROOT; grp->net = net; grp->portid = portid; grp->type = type; grp->instance = mreq->instance; grp->scope = mreq->scope; grp->loopback = mreq->flags & TIPC_GROUP_LOOPBACK; grp->events = mreq->flags & TIPC_GROUP_MEMBER_EVTS; grp->open = group_is_open; *grp->open = false; filter |= global ? TIPC_SUB_CLUSTER_SCOPE : TIPC_SUB_NODE_SCOPE; if (tipc_topsrv_kern_subscr(net, portid, type, 0, ~0, filter, &grp->subid)) return grp; kfree(grp); return NULL; } void tipc_group_join(struct net *net, struct tipc_group *grp, int *sk_rcvbuf) { struct rb_root *tree = &grp->members; struct tipc_member *m, *tmp; struct sk_buff_head xmitq; __skb_queue_head_init(&xmitq); rbtree_postorder_for_each_entry_safe(m, tmp, tree, tree_node) { tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, &xmitq); tipc_group_update_member(m, 0); } tipc_node_distr_xmit(net, &xmitq); *sk_rcvbuf = tipc_group_rcvbuf_limit(grp); } void tipc_group_delete(struct net *net, struct tipc_group *grp) { struct rb_root *tree = &grp->members; struct tipc_member *m, *tmp; struct sk_buff_head xmitq; __skb_queue_head_init(&xmitq); rbtree_postorder_for_each_entry_safe(m, tmp, tree, tree_node) { tipc_group_proto_xmit(grp, m, GRP_LEAVE_MSG, &xmitq); __skb_queue_purge(&m->deferredq); list_del(&m->list); kfree(m); } tipc_node_distr_xmit(net, &xmitq); tipc_nlist_purge(&grp->dests); tipc_topsrv_kern_unsubscr(net, grp->subid); kfree(grp); } static struct tipc_member *tipc_group_find_member(struct tipc_group *grp, u32 node, u32 port) { struct rb_node *n = grp->members.rb_node; u64 nkey, key = (u64)node << 32 | port; struct tipc_member *m; while (n) { m = container_of(n, struct tipc_member, tree_node); nkey = (u64)m->node << 32 | m->port; if (key < nkey) n = n->rb_left; else if (key > nkey) n = n->rb_right; else return m; } return NULL; } static struct tipc_member *tipc_group_find_dest(struct tipc_group *grp, u32 node, u32 port) { struct tipc_member *m; m = tipc_group_find_member(grp, node, port); if (m && tipc_group_is_receiver(m)) return m; return NULL; } static struct tipc_member *tipc_group_find_node(struct tipc_group *grp, u32 node) { struct tipc_member *m; struct rb_node *n; for (n = rb_first(&grp->members); n; n = rb_next(n)) { m = container_of(n, struct tipc_member, tree_node); if (m->node == node) return m; } return NULL; } static int tipc_group_add_to_tree(struct tipc_group *grp, struct tipc_member *m) { u64 nkey, key = (u64)m->node << 32 | m->port; struct rb_node **n, *parent = NULL; struct tipc_member *tmp; n = &grp->members.rb_node; while (*n) { tmp = container_of(*n, struct tipc_member, tree_node); parent = *n; tmp = container_of(parent, struct tipc_member, tree_node); nkey = (u64)tmp->node << 32 | tmp->port; if (key < nkey) n = &(*n)->rb_left; else if (key > nkey) n = &(*n)->rb_right; else return -EEXIST; } rb_link_node(&m->tree_node, parent, n); rb_insert_color(&m->tree_node, &grp->members); return 0; } static struct tipc_member *tipc_group_create_member(struct tipc_group *grp, u32 node, u32 port, u32 instance, int state) { struct tipc_member *m; int ret; m = kzalloc(sizeof(*m), GFP_ATOMIC); if (!m) return NULL; INIT_LIST_HEAD(&m->list); INIT_LIST_HEAD(&m->small_win); __skb_queue_head_init(&m->deferredq); m->group = grp; m->node = node; m->port = port; m->instance = instance; m->bc_acked = grp->bc_snd_nxt - 1; ret = tipc_group_add_to_tree(grp, m); if (ret < 0) { kfree(m); return NULL; } grp->member_cnt++; tipc_nlist_add(&grp->dests, m->node); m->state = state; return m; } void tipc_group_add_member(struct tipc_group *grp, u32 node, u32 port, u32 instance) { tipc_group_create_member(grp, node, port, instance, MBR_PUBLISHED); } static void tipc_group_delete_member(struct tipc_group *grp, struct tipc_member *m) { rb_erase(&m->tree_node, &grp->members); grp->member_cnt--; /* Check if we were waiting for replicast ack from this member */ if (grp->bc_ackers && less(m->bc_acked, grp->bc_snd_nxt - 1)) grp->bc_ackers--; list_del_init(&m->list); list_del_init(&m->small_win); tipc_group_decr_active(grp, m); /* If last member on a node, remove node from dest list */ if (!tipc_group_find_node(grp, m->node)) tipc_nlist_del(&grp->dests, m->node); kfree(m); } struct tipc_nlist *tipc_group_dests(struct tipc_group *grp) { return &grp->dests; } void tipc_group_self(struct tipc_group *grp, struct tipc_service_range *seq, int *scope) { seq->type = grp->type; seq->lower = grp->instance; seq->upper = grp->instance; *scope = grp->scope; } void tipc_group_update_member(struct tipc_member *m, int len) { struct tipc_group *grp = m->group; struct tipc_member *_m, *tmp; if (!tipc_group_is_receiver(m)) return; m->window -= len; if (m->window >= ADV_IDLE) return; list_del_init(&m->small_win); /* Sort member into small_window members' list */ list_for_each_entry_safe(_m, tmp, &grp->small_win, small_win) { if (_m->window > m->window) break; } list_add_tail(&m->small_win, &_m->small_win); } void tipc_group_update_bc_members(struct tipc_group *grp, int len, bool ack) { u16 prev = grp->bc_snd_nxt - 1; struct tipc_member *m; struct rb_node *n; u16 ackers = 0; for (n = rb_first(&grp->members); n; n = rb_next(n)) { m = container_of(n, struct tipc_member, tree_node); if (tipc_group_is_receiver(m)) { tipc_group_update_member(m, len); m->bc_acked = prev; ackers++; } } /* Mark number of acknowledges to expect, if any */ if (ack) grp->bc_ackers = ackers; grp->bc_snd_nxt++; } bool tipc_group_cong(struct tipc_group *grp, u32 dnode, u32 dport, int len, struct tipc_member **mbr) { struct sk_buff_head xmitq; struct tipc_member *m; int adv, state; m = tipc_group_find_dest(grp, dnode, dport); if (!tipc_group_is_receiver(m)) { *mbr = NULL; return false; } *mbr = m; if (m->window >= len) return false; *grp->open = false; /* If not fully advertised, do it now to prevent mutual blocking */ adv = m->advertised; state = m->state; if (state == MBR_JOINED && adv == ADV_IDLE) return true; if (state == MBR_ACTIVE && adv == ADV_ACTIVE) return true; if (state == MBR_PENDING && adv == ADV_IDLE) return true; __skb_queue_head_init(&xmitq); tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, &xmitq); tipc_node_distr_xmit(grp->net, &xmitq); return true; } bool tipc_group_bc_cong(struct tipc_group *grp, int len) { struct tipc_member *m = NULL; /* If prev bcast was replicast, reject until all receivers have acked */ if (grp->bc_ackers) { *grp->open = false; return true; } if (list_empty(&grp->small_win)) return false; m = list_first_entry(&grp->small_win, struct tipc_member, small_win); if (m->window >= len) return false; return tipc_group_cong(grp, m->node, m->port, len, &m); } /* tipc_group_sort_msg() - sort msg into queue by bcast sequence number */ static void tipc_group_sort_msg(struct sk_buff *skb, struct sk_buff_head *defq) { struct tipc_msg *_hdr, *hdr = buf_msg(skb); u16 bc_seqno = msg_grp_bc_seqno(hdr); struct sk_buff *_skb, *tmp; int mtyp = msg_type(hdr); /* Bcast/mcast may be bypassed by ucast or other bcast, - sort it in */ if (mtyp == TIPC_GRP_BCAST_MSG || mtyp == TIPC_GRP_MCAST_MSG) { skb_queue_walk_safe(defq, _skb, tmp) { _hdr = buf_msg(_skb); if (!less(bc_seqno, msg_grp_bc_seqno(_hdr))) continue; __skb_queue_before(defq, _skb, skb); return; } /* Bcast was not bypassed, - add to tail */ } /* Unicasts are never bypassed, - always add to tail */ __skb_queue_tail(defq, skb); } /* tipc_group_filter_msg() - determine if we should accept arriving message */ void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq, struct sk_buff_head *xmitq) { struct sk_buff *skb = __skb_dequeue(inputq); bool ack, deliver, update, leave = false; struct sk_buff_head *defq; struct tipc_member *m; struct tipc_msg *hdr; u32 node, port; int mtyp, blks; if (!skb) return; hdr = buf_msg(skb); node = msg_orignode(hdr); port = msg_origport(hdr); if (!msg_in_group(hdr)) goto drop; m = tipc_group_find_member(grp, node, port); if (!tipc_group_is_sender(m)) goto drop; if (less(msg_grp_bc_seqno(hdr), m->bc_rcv_nxt)) goto drop; TIPC_SKB_CB(skb)->orig_member = m->instance; defq = &m->deferredq; tipc_group_sort_msg(skb, defq); while ((skb = skb_peek(defq))) { hdr = buf_msg(skb); mtyp = msg_type(hdr); blks = msg_blocks(hdr); deliver = true; ack = false; update = false; if (more(msg_grp_bc_seqno(hdr), m->bc_rcv_nxt)) break; /* Decide what to do with message */ switch (mtyp) { case TIPC_GRP_MCAST_MSG: if (msg_nameinst(hdr) != grp->instance) { update = true; deliver = false; } fallthrough; case TIPC_GRP_BCAST_MSG: m->bc_rcv_nxt++; ack = msg_grp_bc_ack_req(hdr); break; case TIPC_GRP_UCAST_MSG: break; case TIPC_GRP_MEMBER_EVT: if (m->state == MBR_LEAVING) leave = true; if (!grp->events) deliver = false; break; default: break; } /* Execute decisions */ __skb_dequeue(defq); if (deliver) __skb_queue_tail(inputq, skb); else kfree_skb(skb); if (ack) tipc_group_proto_xmit(grp, m, GRP_ACK_MSG, xmitq); if (leave) { __skb_queue_purge(defq); tipc_group_delete_member(grp, m); break; } if (!update) continue; tipc_group_update_rcv_win(grp, blks, node, port, xmitq); } return; drop: kfree_skb(skb); } void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node, u32 port, struct sk_buff_head *xmitq) { struct list_head *active = &grp->active; int max_active = grp->max_active; int reclaim_limit = max_active * 3 / 4; int active_cnt = grp->active_cnt; struct tipc_member *m, *rm, *pm; m = tipc_group_find_member(grp, node, port); if (!m) return; m->advertised -= blks; switch (m->state) { case MBR_JOINED: /* First, decide if member can go active */ if (active_cnt <= max_active) { m->state = MBR_ACTIVE; list_add_tail(&m->list, active); grp->active_cnt++; tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); } else { m->state = MBR_PENDING; list_add_tail(&m->list, &grp->pending); } if (active_cnt < reclaim_limit) break; /* Reclaim from oldest active member, if possible */ if (!list_empty(active)) { rm = list_first_entry(active, struct tipc_member, list); rm->state = MBR_RECLAIMING; list_del_init(&rm->list); tipc_group_proto_xmit(grp, rm, GRP_RECLAIM_MSG, xmitq); break; } /* Nobody to reclaim from; - revert oldest pending to JOINED */ pm = list_first_entry(&grp->pending, struct tipc_member, list); list_del_init(&pm->list); pm->state = MBR_JOINED; tipc_group_proto_xmit(grp, pm, GRP_ADV_MSG, xmitq); break; case MBR_ACTIVE: if (!list_is_last(&m->list, &grp->active)) list_move_tail(&m->list, &grp->active); if (m->advertised > (ADV_ACTIVE * 3 / 4)) break; tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); break; case MBR_REMITTED: if (m->advertised > ADV_IDLE) break; m->state = MBR_JOINED; grp->active_cnt--; if (m->advertised < ADV_IDLE) { pr_warn_ratelimited("Rcv unexpected msg after REMIT\n"); tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); } if (list_empty(&grp->pending)) return; /* Set oldest pending member to active and advertise */ pm = list_first_entry(&grp->pending, struct tipc_member, list); pm->state = MBR_ACTIVE; list_move_tail(&pm->list, &grp->active); grp->active_cnt++; tipc_group_proto_xmit(grp, pm, GRP_ADV_MSG, xmitq); break; case MBR_RECLAIMING: case MBR_JOINING: case MBR_LEAVING: default: break; } } static void tipc_group_create_event(struct tipc_group *grp, struct tipc_member *m, u32 event, u16 seqno, struct sk_buff_head *inputq) { u32 dnode = tipc_own_addr(grp->net); struct tipc_event evt; struct sk_buff *skb; struct tipc_msg *hdr; memset(&evt, 0, sizeof(evt)); evt.event = event; evt.found_lower = m->instance; evt.found_upper = m->instance; evt.port.ref = m->port; evt.port.node = m->node; evt.s.seq.type = grp->type; evt.s.seq.lower = m->instance; evt.s.seq.upper = m->instance; skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_GRP_MEMBER_EVT, GROUP_H_SIZE, sizeof(evt), dnode, m->node, grp->portid, m->port, 0); if (!skb) return; hdr = buf_msg(skb); msg_set_nametype(hdr, grp->type); msg_set_grp_evt(hdr, event); msg_set_dest_droppable(hdr, true); msg_set_grp_bc_seqno(hdr, seqno); memcpy(msg_data(hdr), &evt, sizeof(evt)); TIPC_SKB_CB(skb)->orig_member = m->instance; __skb_queue_tail(inputq, skb); } static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, int mtyp, struct sk_buff_head *xmitq) { struct tipc_msg *hdr; struct sk_buff *skb; int adv = 0; skb = tipc_msg_create(GROUP_PROTOCOL, mtyp, INT_H_SIZE, 0, m->node, tipc_own_addr(grp->net), m->port, grp->portid, 0); if (!skb) return; if (m->state == MBR_ACTIVE) adv = ADV_ACTIVE - m->advertised; else if (m->state == MBR_JOINED || m->state == MBR_PENDING) adv = ADV_IDLE - m->advertised; hdr = buf_msg(skb); if (mtyp == GRP_JOIN_MSG) { msg_set_grp_bc_syncpt(hdr, grp->bc_snd_nxt); msg_set_adv_win(hdr, adv); m->advertised += adv; } else if (mtyp == GRP_LEAVE_MSG) { msg_set_grp_bc_syncpt(hdr, grp->bc_snd_nxt); } else if (mtyp == GRP_ADV_MSG) { msg_set_adv_win(hdr, adv); m->advertised += adv; } else if (mtyp == GRP_ACK_MSG) { msg_set_grp_bc_acked(hdr, m->bc_rcv_nxt); } else if (mtyp == GRP_REMIT_MSG) { msg_set_grp_remitted(hdr, m->window); } msg_set_dest_droppable(hdr, true); __skb_queue_tail(xmitq, skb); } void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, struct tipc_msg *hdr, struct sk_buff_head *inputq, struct sk_buff_head *xmitq) { u32 node = msg_orignode(hdr); u32 port = msg_origport(hdr); struct tipc_member *m, *pm; u16 remitted, in_flight; if (!grp) return; if (grp->scope == TIPC_NODE_SCOPE && node != tipc_own_addr(grp->net)) return; m = tipc_group_find_member(grp, node, port); switch (msg_type(hdr)) { case GRP_JOIN_MSG: if (!m) m = tipc_group_create_member(grp, node, port, 0, MBR_JOINING); if (!m) return; m->bc_syncpt = msg_grp_bc_syncpt(hdr); m->bc_rcv_nxt = m->bc_syncpt; m->window += msg_adv_win(hdr); /* Wait until PUBLISH event is received if necessary */ if (m->state != MBR_PUBLISHED) return; /* Member can be taken into service */ m->state = MBR_JOINED; tipc_group_open(m, usr_wakeup); tipc_group_update_member(m, 0); tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); tipc_group_create_event(grp, m, TIPC_PUBLISHED, m->bc_syncpt, inputq); return; case GRP_LEAVE_MSG: if (!m) return; m->bc_syncpt = msg_grp_bc_syncpt(hdr); list_del_init(&m->list); tipc_group_open(m, usr_wakeup); tipc_group_decr_active(grp, m); m->state = MBR_LEAVING; tipc_group_create_event(grp, m, TIPC_WITHDRAWN, m->bc_syncpt, inputq); return; case GRP_ADV_MSG: if (!m) return; m->window += msg_adv_win(hdr); tipc_group_open(m, usr_wakeup); return; case GRP_ACK_MSG: if (!m) return; m->bc_acked = msg_grp_bc_acked(hdr); if (--grp->bc_ackers) return; list_del_init(&m->small_win); *m->group->open = true; *usr_wakeup = true; tipc_group_update_member(m, 0); return; case GRP_RECLAIM_MSG: if (!m) return; tipc_group_proto_xmit(grp, m, GRP_REMIT_MSG, xmitq); m->window = ADV_IDLE; tipc_group_open(m, usr_wakeup); return; case GRP_REMIT_MSG: if (!m || m->state != MBR_RECLAIMING) return; remitted = msg_grp_remitted(hdr); /* Messages preceding the REMIT still in receive queue */ if (m->advertised > remitted) { m->state = MBR_REMITTED; in_flight = m->advertised - remitted; m->advertised = ADV_IDLE + in_flight; return; } /* This should never happen */ if (m->advertised < remitted) pr_warn_ratelimited("Unexpected REMIT msg\n"); /* All messages preceding the REMIT have been read */ m->state = MBR_JOINED; grp->active_cnt--; m->advertised = ADV_IDLE; /* Set oldest pending member to active and advertise */ if (list_empty(&grp->pending)) return; pm = list_first_entry(&grp->pending, struct tipc_member, list); pm->state = MBR_ACTIVE; list_move_tail(&pm->list, &grp->active); grp->active_cnt++; if (pm->advertised <= (ADV_ACTIVE * 3 / 4)) tipc_group_proto_xmit(grp, pm, GRP_ADV_MSG, xmitq); return; default: pr_warn("Received unknown GROUP_PROTO message\n"); } } /* tipc_group_member_evt() - receive and handle a member up/down event */ void tipc_group_member_evt(struct tipc_group *grp, bool *usr_wakeup, int *sk_rcvbuf, struct tipc_msg *hdr, struct sk_buff_head *inputq, struct sk_buff_head *xmitq) { struct tipc_event *evt = (void *)msg_data(hdr); u32 instance = evt->found_lower; u32 node = evt->port.node; u32 port = evt->port.ref; int event = evt->event; struct tipc_member *m; struct net *net; u32 self; if (!grp) return; net = grp->net; self = tipc_own_addr(net); if (!grp->loopback && node == self && port == grp->portid) return; m = tipc_group_find_member(grp, node, port); switch (event) { case TIPC_PUBLISHED: /* Send and wait for arrival of JOIN message if necessary */ if (!m) { m = tipc_group_create_member(grp, node, port, instance, MBR_PUBLISHED); if (!m) break; tipc_group_update_member(m, 0); tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, xmitq); break; } if (m->state != MBR_JOINING) break; /* Member can be taken into service */ m->instance = instance; m->state = MBR_JOINED; tipc_group_open(m, usr_wakeup); tipc_group_update_member(m, 0); tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, xmitq); tipc_group_create_event(grp, m, TIPC_PUBLISHED, m->bc_syncpt, inputq); break; case TIPC_WITHDRAWN: if (!m) break; tipc_group_decr_active(grp, m); m->state = MBR_LEAVING; list_del_init(&m->list); tipc_group_open(m, usr_wakeup); /* Only send event if no LEAVE message can be expected */ if (!tipc_node_is_up(net, node)) tipc_group_create_event(grp, m, TIPC_WITHDRAWN, m->bc_rcv_nxt, inputq); break; default: break; } *sk_rcvbuf = tipc_group_rcvbuf_limit(grp); } int tipc_group_fill_sock_diag(struct tipc_group *grp, struct sk_buff *skb) { struct nlattr *group = nla_nest_start_noflag(skb, TIPC_NLA_SOCK_GROUP); if (!group) return -EMSGSIZE; if (nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_ID, grp->type) || nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_INSTANCE, grp->instance) || nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_BC_SEND_NEXT, grp->bc_snd_nxt)) goto group_msg_cancel; if (grp->scope == TIPC_NODE_SCOPE) if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_NODE_SCOPE)) goto group_msg_cancel; if (grp->scope == TIPC_CLUSTER_SCOPE) if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_CLUSTER_SCOPE)) goto group_msg_cancel; if (*grp->open) if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_OPEN)) goto group_msg_cancel; nla_nest_end(skb, group); return 0; group_msg_cancel: nla_nest_cancel(skb, group); return -1; }
1 1 1 1 1 36 36 32 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2011, 2012 STRATO. All rights reserved. */ #include <linux/blkdev.h> #include <linux/ratelimit.h> #include <linux/sched/mm.h> #include <crypto/hash.h> #include "ctree.h" #include "discard.h" #include "volumes.h" #include "disk-io.h" #include "ordered-data.h" #include "transaction.h" #include "backref.h" #include "extent_io.h" #include "dev-replace.h" #include "raid56.h" #include "block-group.h" #include "zoned.h" #include "fs.h" #include "accessors.h" #include "file-item.h" #include "scrub.h" #include "raid-stripe-tree.h" /* * This is only the first step towards a full-features scrub. It reads all * extent and super block and verifies the checksums. In case a bad checksum * is found or the extent cannot be read, good data will be written back if * any can be found. * * Future enhancements: * - In case an unrepairable extent is encountered, track which files are * affected and report them * - track and record media errors, throw out bad devices * - add a mode to also read unallocated space */ struct scrub_ctx; /* * The following value only influences the performance. * * This determines how many stripes would be submitted in one go, * which is 512KiB (BTRFS_STRIPE_LEN * SCRUB_STRIPES_PER_GROUP). */ #define SCRUB_STRIPES_PER_GROUP 8 /* * How many groups we have for each sctx. * * This would be 8M per device, the same value as the old scrub in-flight bios * size limit. */ #define SCRUB_GROUPS_PER_SCTX 16 #define SCRUB_TOTAL_STRIPES (SCRUB_GROUPS_PER_SCTX * SCRUB_STRIPES_PER_GROUP) /* * The following value times PAGE_SIZE needs to be large enough to match the * largest node/leaf/sector size that shall be supported. */ #define SCRUB_MAX_SECTORS_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K) /* Represent one sector and its needed info to verify the content. */ struct scrub_sector_verification { bool is_metadata; union { /* * Csum pointer for data csum verification. Should point to a * sector csum inside scrub_stripe::csums. * * NULL if this data sector has no csum. */ u8 *csum; /* * Extra info for metadata verification. All sectors inside a * tree block share the same generation. */ u64 generation; }; }; enum scrub_stripe_flags { /* Set when @mirror_num, @dev, @physical and @logical are set. */ SCRUB_STRIPE_FLAG_INITIALIZED, /* Set when the read-repair is finished. */ SCRUB_STRIPE_FLAG_REPAIR_DONE, /* * Set for data stripes if it's triggered from P/Q stripe. * During such scrub, we should not report errors in data stripes, nor * update the accounting. */ SCRUB_STRIPE_FLAG_NO_REPORT, }; #define SCRUB_STRIPE_PAGES (BTRFS_STRIPE_LEN / PAGE_SIZE) /* * Represent one contiguous range with a length of BTRFS_STRIPE_LEN. */ struct scrub_stripe { struct scrub_ctx *sctx; struct btrfs_block_group *bg; struct page *pages[SCRUB_STRIPE_PAGES]; struct scrub_sector_verification *sectors; struct btrfs_device *dev; u64 logical; u64 physical; u16 mirror_num; /* Should be BTRFS_STRIPE_LEN / sectorsize. */ u16 nr_sectors; /* * How many data/meta extents are in this stripe. Only for scrub status * reporting purposes. */ u16 nr_data_extents; u16 nr_meta_extents; atomic_t pending_io; wait_queue_head_t io_wait; wait_queue_head_t repair_wait; /* * Indicate the states of the stripe. Bits are defined in * scrub_stripe_flags enum. */ unsigned long state; /* Indicate which sectors are covered by extent items. */ unsigned long extent_sector_bitmap; /* * The errors hit during the initial read of the stripe. * * Would be utilized for error reporting and repair. * * The remaining init_nr_* records the number of errors hit, only used * by error reporting. */ unsigned long init_error_bitmap; unsigned int init_nr_io_errors; unsigned int init_nr_csum_errors; unsigned int init_nr_meta_errors; /* * The following error bitmaps are all for the current status. * Every time we submit a new read, these bitmaps may be updated. * * error_bitmap = io_error_bitmap | csum_error_bitmap | meta_error_bitmap; * * IO and csum errors can happen for both metadata and data. */ unsigned long error_bitmap; unsigned long io_error_bitmap; unsigned long csum_error_bitmap; unsigned long meta_error_bitmap; /* For writeback (repair or replace) error reporting. */ unsigned long write_error_bitmap; /* Writeback can be concurrent, thus we need to protect the bitmap. */ spinlock_t write_error_lock; /* * Checksum for the whole stripe if this stripe is inside a data block * group. */ u8 *csums; struct work_struct work; }; struct scrub_ctx { struct scrub_stripe stripes[SCRUB_TOTAL_STRIPES]; struct scrub_stripe *raid56_data_stripes; struct btrfs_fs_info *fs_info; struct btrfs_path extent_path; struct btrfs_path csum_path; int first_free; int cur_stripe; atomic_t cancel_req; int readonly; /* State of IO submission throttling affecting the associated device */ ktime_t throttle_deadline; u64 throttle_sent; int is_dev_replace; u64 write_pointer; struct mutex wr_lock; struct btrfs_device *wr_tgtdev; /* * statistics */ struct btrfs_scrub_progress stat; spinlock_t stat_lock; /* * Use a ref counter to avoid use-after-free issues. Scrub workers * decrement bios_in_flight and workers_pending and then do a wakeup * on the list_wait wait queue. We must ensure the main scrub task * doesn't free the scrub context before or while the workers are * doing the wakeup() call. */ refcount_t refs; }; struct scrub_warning { struct btrfs_path *path; u64 extent_item_size; const char *errstr; u64 physical; u64 logical; struct btrfs_device *dev; }; static void release_scrub_stripe(struct scrub_stripe *stripe) { if (!stripe) return; for (int i = 0; i < SCRUB_STRIPE_PAGES; i++) { if (stripe->pages[i]) __free_page(stripe->pages[i]); stripe->pages[i] = NULL; } kfree(stripe->sectors); kfree(stripe->csums); stripe->sectors = NULL; stripe->csums = NULL; stripe->sctx = NULL; stripe->state = 0; } static int init_scrub_stripe(struct btrfs_fs_info *fs_info, struct scrub_stripe *stripe) { int ret; memset(stripe, 0, sizeof(*stripe)); stripe->nr_sectors = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits; stripe->state = 0; init_waitqueue_head(&stripe->io_wait); init_waitqueue_head(&stripe->repair_wait); atomic_set(&stripe->pending_io, 0); spin_lock_init(&stripe->write_error_lock); ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages, 0); if (ret < 0) goto error; stripe->sectors = kcalloc(stripe->nr_sectors, sizeof(struct scrub_sector_verification), GFP_KERNEL); if (!stripe->sectors) goto error; stripe->csums = kcalloc(BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits, fs_info->csum_size, GFP_KERNEL); if (!stripe->csums) goto error; return 0; error: release_scrub_stripe(stripe); return -ENOMEM; } static void wait_scrub_stripe_io(struct scrub_stripe *stripe) { wait_event(stripe->io_wait, atomic_read(&stripe->pending_io) == 0); } static void scrub_put_ctx(struct scrub_ctx *sctx); static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) { while (atomic_read(&fs_info->scrub_pause_req)) { mutex_unlock(&fs_info->scrub_lock); wait_event(fs_info->scrub_pause_wait, atomic_read(&fs_info->scrub_pause_req) == 0); mutex_lock(&fs_info->scrub_lock); } } static void scrub_pause_on(struct btrfs_fs_info *fs_info) { atomic_inc(&fs_info->scrubs_paused); wake_up(&fs_info->scrub_pause_wait); } static void scrub_pause_off(struct btrfs_fs_info *fs_info) { mutex_lock(&fs_info->scrub_lock); __scrub_blocked_if_needed(fs_info); atomic_dec(&fs_info->scrubs_paused); mutex_unlock(&fs_info->scrub_lock); wake_up(&fs_info->scrub_pause_wait); } static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) { scrub_pause_on(fs_info); scrub_pause_off(fs_info); } static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) { int i; if (!sctx) return; for (i = 0; i < SCRUB_TOTAL_STRIPES; i++) release_scrub_stripe(&sctx->stripes[i]); kvfree(sctx); } static void scrub_put_ctx(struct scrub_ctx *sctx) { if (refcount_dec_and_test(&sctx->refs)) scrub_free_ctx(sctx); } static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( struct btrfs_fs_info *fs_info, int is_dev_replace) { struct scrub_ctx *sctx; int i; /* Since sctx has inline 128 stripes, it can go beyond 64K easily. Use * kvzalloc(). */ sctx = kvzalloc(sizeof(*sctx), GFP_KERNEL); if (!sctx) goto nomem; refcount_set(&sctx->refs, 1); sctx->is_dev_replace = is_dev_replace; sctx->fs_info = fs_info; sctx->extent_path.search_commit_root = 1; sctx->extent_path.skip_locking = 1; sctx->csum_path.search_commit_root = 1; sctx->csum_path.skip_locking = 1; for (i = 0; i < SCRUB_TOTAL_STRIPES; i++) { int ret; ret = init_scrub_stripe(fs_info, &sctx->stripes[i]); if (ret < 0) goto nomem; sctx->stripes[i].sctx = sctx; } sctx->first_free = 0; atomic_set(&sctx->cancel_req, 0); spin_lock_init(&sctx->stat_lock); sctx->throttle_deadline = 0; mutex_init(&sctx->wr_lock); if (is_dev_replace) { WARN_ON(!fs_info->dev_replace.tgtdev); sctx->wr_tgtdev = fs_info->dev_replace.tgtdev; } return sctx; nomem: scrub_free_ctx(sctx); return ERR_PTR(-ENOMEM); } static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, u64 root, void *warn_ctx) { u32 nlink; int ret; int i; unsigned nofs_flag; struct extent_buffer *eb; struct btrfs_inode_item *inode_item; struct scrub_warning *swarn = warn_ctx; struct btrfs_fs_info *fs_info = swarn->dev->fs_info; struct inode_fs_paths *ipath = NULL; struct btrfs_root *local_root; struct btrfs_key key; local_root = btrfs_get_fs_root(fs_info, root, true); if (IS_ERR(local_root)) { ret = PTR_ERR(local_root); goto err; } /* * this makes the path point to (inum INODE_ITEM ioff) */ key.objectid = inum; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0); if (ret) { btrfs_put_root(local_root); btrfs_release_path(swarn->path); goto err; } eb = swarn->path->nodes[0]; inode_item = btrfs_item_ptr(eb, swarn->path->slots[0], struct btrfs_inode_item); nlink = btrfs_inode_nlink(eb, inode_item); btrfs_release_path(swarn->path); /* * init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub * uses GFP_NOFS in this context, so we keep it consistent but it does * not seem to be strictly necessary. */ nofs_flag = memalloc_nofs_save(); ipath = init_ipath(4096, local_root, swarn->path); memalloc_nofs_restore(nofs_flag); if (IS_ERR(ipath)) { btrfs_put_root(local_root); ret = PTR_ERR(ipath); ipath = NULL; goto err; } ret = paths_from_inode(inum, ipath); if (ret < 0) goto err; /* * we deliberately ignore the bit ipath might have been too small to * hold all of the paths here */ for (i = 0; i < ipath->fspath->elem_cnt; ++i) btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)", swarn->errstr, swarn->logical, btrfs_dev_name(swarn->dev), swarn->physical, root, inum, offset, fs_info->sectorsize, nlink, (char *)(unsigned long)ipath->fspath->val[i]); btrfs_put_root(local_root); free_ipath(ipath); return 0; err: btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d", swarn->errstr, swarn->logical, btrfs_dev_name(swarn->dev), swarn->physical, root, inum, offset, ret); free_ipath(ipath); return 0; } static void scrub_print_common_warning(const char *errstr, struct btrfs_device *dev, bool is_super, u64 logical, u64 physical) { struct btrfs_fs_info *fs_info = dev->fs_info; struct btrfs_path *path; struct btrfs_key found_key; struct extent_buffer *eb; struct btrfs_extent_item *ei; struct scrub_warning swarn; u64 flags = 0; u32 item_size; int ret; /* Super block error, no need to search extent tree. */ if (is_super) { btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu", errstr, btrfs_dev_name(dev), physical); return; } path = btrfs_alloc_path(); if (!path) return; swarn.physical = physical; swarn.logical = logical; swarn.errstr = errstr; swarn.dev = NULL; ret = extent_from_logical(fs_info, swarn.logical, path, &found_key, &flags); if (ret < 0) goto out; swarn.extent_item_size = found_key.offset; eb = path->nodes[0]; ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); item_size = btrfs_item_size(eb, path->slots[0]); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { unsigned long ptr = 0; u8 ref_level; u64 ref_root; while (true) { ret = tree_backref_for_extent(&ptr, eb, &found_key, ei, item_size, &ref_root, &ref_level); if (ret < 0) { btrfs_warn(fs_info, "failed to resolve tree backref for logical %llu: %d", swarn.logical, ret); break; } if (ret > 0) break; btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu", errstr, swarn.logical, btrfs_dev_name(dev), swarn.physical, (ref_level ? "node" : "leaf"), ref_level, ref_root); } btrfs_release_path(path); } else { struct btrfs_backref_walk_ctx ctx = { 0 }; btrfs_release_path(path); ctx.bytenr = found_key.objectid; ctx.extent_item_pos = swarn.logical - found_key.objectid; ctx.fs_info = fs_info; swarn.path = path; swarn.dev = dev; iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn); } out: btrfs_free_path(path); } static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) { int ret = 0; u64 length; if (!btrfs_is_zoned(sctx->fs_info)) return 0; if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) return 0; if (sctx->write_pointer < physical) { length = physical - sctx->write_pointer; ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev, sctx->write_pointer, length); if (!ret) sctx->write_pointer = physical; } return ret; } static struct page *scrub_stripe_get_page(struct scrub_stripe *stripe, int sector_nr) { struct btrfs_fs_info *fs_info = stripe->bg->fs_info; int page_index = (sector_nr << fs_info->sectorsize_bits) >> PAGE_SHIFT; return stripe->pages[page_index]; } static unsigned int scrub_stripe_get_page_offset(struct scrub_stripe *stripe, int sector_nr) { struct btrfs_fs_info *fs_info = stripe->bg->fs_info; return offset_in_page(sector_nr << fs_info->sectorsize_bits); } static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr) { struct btrfs_fs_info *fs_info = stripe->bg->fs_info; const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits; const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits); const struct page *first_page = scrub_stripe_get_page(stripe, sector_nr); const unsigned int first_off = scrub_stripe_get_page_offset(stripe, sector_nr); SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); u8 on_disk_csum[BTRFS_CSUM_SIZE]; u8 calculated_csum[BTRFS_CSUM_SIZE]; struct btrfs_header *header; /* * Here we don't have a good way to attach the pages (and subpages) * to a dummy extent buffer, thus we have to directly grab the members * from pages. */ header = (struct btrfs_header *)(page_address(first_page) + first_off); memcpy(on_disk_csum, header->csum, fs_info->csum_size); if (logical != btrfs_stack_header_bytenr(header)) { bitmap_set(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree); bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, "tree block %llu mirror %u has bad bytenr, has %llu want %llu", logical, stripe->mirror_num, btrfs_stack_header_bytenr(header), logical); return; } if (memcmp(header->fsid, fs_info->fs_devices->metadata_uuid, BTRFS_FSID_SIZE) != 0) { bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, "tree block %llu mirror %u has bad fsid, has %pU want %pU", logical, stripe->mirror_num, header->fsid, fs_info->fs_devices->fsid); return; } if (memcmp(header->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE) != 0) { bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU", logical, stripe->mirror_num, header->chunk_tree_uuid, fs_info->chunk_tree_uuid); return; } /* Now check tree block csum. */ shash->tfm = fs_info->csum_shash; crypto_shash_init(shash); crypto_shash_update(shash, page_address(first_page) + first_off + BTRFS_CSUM_SIZE, fs_info->sectorsize - BTRFS_CSUM_SIZE); for (int i = sector_nr + 1; i < sector_nr + sectors_per_tree; i++) { struct page *page = scrub_stripe_get_page(stripe, i); unsigned int page_off = scrub_stripe_get_page_offset(stripe, i); crypto_shash_update(shash, page_address(page) + page_off, fs_info->sectorsize); } crypto_shash_final(shash, calculated_csum); if (memcmp(calculated_csum, on_disk_csum, fs_info->csum_size) != 0) { bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT, logical, stripe->mirror_num, CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum), CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum)); return; } if (stripe->sectors[sector_nr].generation != btrfs_stack_header_generation(header)) { bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, "tree block %llu mirror %u has bad generation, has %llu want %llu", logical, stripe->mirror_num, btrfs_stack_header_generation(header), stripe->sectors[sector_nr].generation); return; } bitmap_clear(&stripe->error_bitmap, sector_nr, sectors_per_tree); bitmap_clear(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree); bitmap_clear(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); } static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) { struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct scrub_sector_verification *sector = &stripe->sectors[sector_nr]; const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits; struct page *page = scrub_stripe_get_page(stripe, sector_nr); unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr); u8 csum_buf[BTRFS_CSUM_SIZE]; int ret; ASSERT(sector_nr >= 0 && sector_nr < stripe->nr_sectors); /* Sector not utilized, skip it. */ if (!test_bit(sector_nr, &stripe->extent_sector_bitmap)) return; /* IO error, no need to check. */ if (test_bit(sector_nr, &stripe->io_error_bitmap)) return; /* Metadata, verify the full tree block. */ if (sector->is_metadata) { /* * Check if the tree block crosses the stripe boundary. If * crossed the boundary, we cannot verify it but only give a * warning. * * This can only happen on a very old filesystem where chunks * are not ensured to be stripe aligned. */ if (unlikely(sector_nr + sectors_per_tree > stripe->nr_sectors)) { btrfs_warn_rl(fs_info, "tree block at %llu crosses stripe boundary %llu", stripe->logical + (sector_nr << fs_info->sectorsize_bits), stripe->logical); return; } scrub_verify_one_metadata(stripe, sector_nr); return; } /* * Data is easier, we just verify the data csum (if we have it). For * cases without csum, we have no other choice but to trust it. */ if (!sector->csum) { clear_bit(sector_nr, &stripe->error_bitmap); return; } ret = btrfs_check_sector_csum(fs_info, page, pgoff, csum_buf, sector->csum); if (ret < 0) { set_bit(sector_nr, &stripe->csum_error_bitmap); set_bit(sector_nr, &stripe->error_bitmap); } else { clear_bit(sector_nr, &stripe->csum_error_bitmap); clear_bit(sector_nr, &stripe->error_bitmap); } } /* Verify specified sectors of a stripe. */ static void scrub_verify_one_stripe(struct scrub_stripe *stripe, unsigned long bitmap) { struct btrfs_fs_info *fs_info = stripe->bg->fs_info; const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits; int sector_nr; for_each_set_bit(sector_nr, &bitmap, stripe->nr_sectors) { scrub_verify_one_sector(stripe, sector_nr); if (stripe->sectors[sector_nr].is_metadata) sector_nr += sectors_per_tree - 1; } } static int calc_sector_number(struct scrub_stripe *stripe, struct bio_vec *first_bvec) { int i; for (i = 0; i < stripe->nr_sectors; i++) { if (scrub_stripe_get_page(stripe, i) == first_bvec->bv_page && scrub_stripe_get_page_offset(stripe, i) == first_bvec->bv_offset) break; } ASSERT(i < stripe->nr_sectors); return i; } /* * Repair read is different to the regular read: * * - Only reads the failed sectors * - May have extra blocksize limits */ static void scrub_repair_read_endio(struct btrfs_bio *bbio) { struct scrub_stripe *stripe = bbio->private; struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct bio_vec *bvec; int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio)); u32 bio_size = 0; int i; ASSERT(sector_nr < stripe->nr_sectors); bio_for_each_bvec_all(bvec, &bbio->bio, i) bio_size += bvec->bv_len; if (bbio->bio.bi_status) { bitmap_set(&stripe->io_error_bitmap, sector_nr, bio_size >> fs_info->sectorsize_bits); bitmap_set(&stripe->error_bitmap, sector_nr, bio_size >> fs_info->sectorsize_bits); } else { bitmap_clear(&stripe->io_error_bitmap, sector_nr, bio_size >> fs_info->sectorsize_bits); } bio_put(&bbio->bio); if (atomic_dec_and_test(&stripe->pending_io)) wake_up(&stripe->io_wait); } static int calc_next_mirror(int mirror, int num_copies) { ASSERT(mirror <= num_copies); return (mirror + 1 > num_copies) ? 1 : mirror + 1; } static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, int mirror, int blocksize, bool wait) { struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct btrfs_bio *bbio = NULL; const unsigned long old_error_bitmap = stripe->error_bitmap; int i; ASSERT(stripe->mirror_num >= 1); ASSERT(atomic_read(&stripe->pending_io) == 0); for_each_set_bit(i, &old_error_bitmap, stripe->nr_sectors) { struct page *page; int pgoff; int ret; page = scrub_stripe_get_page(stripe, i); pgoff = scrub_stripe_get_page_offset(stripe, i); /* The current sector cannot be merged, submit the bio. */ if (bbio && ((i > 0 && !test_bit(i - 1, &stripe->error_bitmap)) || bbio->bio.bi_iter.bi_size >= blocksize)) { ASSERT(bbio->bio.bi_iter.bi_size); atomic_inc(&stripe->pending_io); btrfs_submit_bio(bbio, mirror); if (wait) wait_scrub_stripe_io(stripe); bbio = NULL; } if (!bbio) { bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ, fs_info, scrub_repair_read_endio, stripe); bbio->bio.bi_iter.bi_sector = (stripe->logical + (i << fs_info->sectorsize_bits)) >> SECTOR_SHIFT; } ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); ASSERT(ret == fs_info->sectorsize); } if (bbio) { ASSERT(bbio->bio.bi_iter.bi_size); atomic_inc(&stripe->pending_io); btrfs_submit_bio(bbio, mirror); if (wait) wait_scrub_stripe_io(stripe); } } static void scrub_stripe_report_errors(struct scrub_ctx *sctx, struct scrub_stripe *stripe) { static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_device *dev = NULL; u64 physical = 0; int nr_data_sectors = 0; int nr_meta_sectors = 0; int nr_nodatacsum_sectors = 0; int nr_repaired_sectors = 0; int sector_nr; if (test_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state)) return; /* * Init needed infos for error reporting. * * Although our scrub_stripe infrastructure is mostly based on btrfs_submit_bio() * thus no need for dev/physical, error reporting still needs dev and physical. */ if (!bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) { u64 mapped_len = fs_info->sectorsize; struct btrfs_io_context *bioc = NULL; int stripe_index = stripe->mirror_num - 1; int ret; /* For scrub, our mirror_num should always start at 1. */ ASSERT(stripe->mirror_num >= 1); ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, stripe->logical, &mapped_len, &bioc, NULL, NULL); /* * If we failed, dev will be NULL, and later detailed reports * will just be skipped. */ if (ret < 0) goto skip; physical = bioc->stripes[stripe_index].physical; dev = bioc->stripes[stripe_index].dev; btrfs_put_bioc(bioc); } skip: for_each_set_bit(sector_nr, &stripe->extent_sector_bitmap, stripe->nr_sectors) { bool repaired = false; if (stripe->sectors[sector_nr].is_metadata) { nr_meta_sectors++; } else { nr_data_sectors++; if (!stripe->sectors[sector_nr].csum) nr_nodatacsum_sectors++; } if (test_bit(sector_nr, &stripe->init_error_bitmap) && !test_bit(sector_nr, &stripe->error_bitmap)) { nr_repaired_sectors++; repaired = true; } /* Good sector from the beginning, nothing need to be done. */ if (!test_bit(sector_nr, &stripe->init_error_bitmap)) continue; /* * Report error for the corrupted sectors. If repaired, just * output the message of repaired message. */ if (repaired) { if (dev) { btrfs_err_rl_in_rcu(fs_info, "fixed up error at logical %llu on dev %s physical %llu", stripe->logical, btrfs_dev_name(dev), physical); } else { btrfs_err_rl_in_rcu(fs_info, "fixed up error at logical %llu on mirror %u", stripe->logical, stripe->mirror_num); } continue; } /* The remaining are all for unrepaired. */ if (dev) { btrfs_err_rl_in_rcu(fs_info, "unable to fixup (regular) error at logical %llu on dev %s physical %llu", stripe->logical, btrfs_dev_name(dev), physical); } else { btrfs_err_rl_in_rcu(fs_info, "unable to fixup (regular) error at logical %llu on mirror %u", stripe->logical, stripe->mirror_num); } if (test_bit(sector_nr, &stripe->io_error_bitmap)) if (__ratelimit(&rs) && dev) scrub_print_common_warning("i/o error", dev, false, stripe->logical, physical); if (test_bit(sector_nr, &stripe->csum_error_bitmap)) if (__ratelimit(&rs) && dev) scrub_print_common_warning("checksum error", dev, false, stripe->logical, physical); if (test_bit(sector_nr, &stripe->meta_error_bitmap)) if (__ratelimit(&rs) && dev) scrub_print_common_warning("header error", dev, false, stripe->logical, physical); } spin_lock(&sctx->stat_lock); sctx->stat.data_extents_scrubbed += stripe->nr_data_extents; sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents; sctx->stat.data_bytes_scrubbed += nr_data_sectors << fs_info->sectorsize_bits; sctx->stat.tree_bytes_scrubbed += nr_meta_sectors << fs_info->sectorsize_bits; sctx->stat.no_csum += nr_nodatacsum_sectors; sctx->stat.read_errors += stripe->init_nr_io_errors; sctx->stat.csum_errors += stripe->init_nr_csum_errors; sctx->stat.verify_errors += stripe->init_nr_meta_errors; sctx->stat.uncorrectable_errors += bitmap_weight(&stripe->error_bitmap, stripe->nr_sectors); sctx->stat.corrected_errors += nr_repaired_sectors; spin_unlock(&sctx->stat_lock); } static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe, unsigned long write_bitmap, bool dev_replace); /* * The main entrance for all read related scrub work, including: * * - Wait for the initial read to finish * - Verify and locate any bad sectors * - Go through the remaining mirrors and try to read as large blocksize as * possible * - Go through all mirrors (including the failed mirror) sector-by-sector * - Submit writeback for repaired sectors * * Writeback for dev-replace does not happen here, it needs extra * synchronization for zoned devices. */ static void scrub_stripe_read_repair_worker(struct work_struct *work) { struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work); struct scrub_ctx *sctx = stripe->sctx; struct btrfs_fs_info *fs_info = sctx->fs_info; int num_copies = btrfs_num_copies(fs_info, stripe->bg->start, stripe->bg->length); int mirror; int i; ASSERT(stripe->mirror_num > 0); wait_scrub_stripe_io(stripe); scrub_verify_one_stripe(stripe, stripe->extent_sector_bitmap); /* Save the initial failed bitmap for later repair and report usage. */ stripe->init_error_bitmap = stripe->error_bitmap; stripe->init_nr_io_errors = bitmap_weight(&stripe->io_error_bitmap, stripe->nr_sectors); stripe->init_nr_csum_errors = bitmap_weight(&stripe->csum_error_bitmap, stripe->nr_sectors); stripe->init_nr_meta_errors = bitmap_weight(&stripe->meta_error_bitmap, stripe->nr_sectors); if (bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) goto out; /* * Try all remaining mirrors. * * Here we still try to read as large block as possible, as this is * faster and we have extra safety nets to rely on. */ for (mirror = calc_next_mirror(stripe->mirror_num, num_copies); mirror != stripe->mirror_num; mirror = calc_next_mirror(mirror, num_copies)) { const unsigned long old_error_bitmap = stripe->error_bitmap; scrub_stripe_submit_repair_read(stripe, mirror, BTRFS_STRIPE_LEN, false); wait_scrub_stripe_io(stripe); scrub_verify_one_stripe(stripe, old_error_bitmap); if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) goto out; } /* * Last safety net, try re-checking all mirrors, including the failed * one, sector-by-sector. * * As if one sector failed the drive's internal csum, the whole read * containing the offending sector would be marked as error. * Thus here we do sector-by-sector read. * * This can be slow, thus we only try it as the last resort. */ for (i = 0, mirror = stripe->mirror_num; i < num_copies; i++, mirror = calc_next_mirror(mirror, num_copies)) { const unsigned long old_error_bitmap = stripe->error_bitmap; scrub_stripe_submit_repair_read(stripe, mirror, fs_info->sectorsize, true); wait_scrub_stripe_io(stripe); scrub_verify_one_stripe(stripe, old_error_bitmap); if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) goto out; } out: /* * Submit the repaired sectors. For zoned case, we cannot do repair * in-place, but queue the bg to be relocated. */ if (btrfs_is_zoned(fs_info)) { if (!bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) btrfs_repair_one_zone(fs_info, sctx->stripes[0].bg->start); } else if (!sctx->readonly) { unsigned long repaired; bitmap_andnot(&repaired, &stripe->init_error_bitmap, &stripe->error_bitmap, stripe->nr_sectors); scrub_write_sectors(sctx, stripe, repaired, false); wait_scrub_stripe_io(stripe); } scrub_stripe_report_errors(sctx, stripe); set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state); wake_up(&stripe->repair_wait); } static void scrub_read_endio(struct btrfs_bio *bbio) { struct scrub_stripe *stripe = bbio->private; struct bio_vec *bvec; int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio)); int num_sectors; u32 bio_size = 0; int i; ASSERT(sector_nr < stripe->nr_sectors); bio_for_each_bvec_all(bvec, &bbio->bio, i) bio_size += bvec->bv_len; num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits; if (bbio->bio.bi_status) { bitmap_set(&stripe->io_error_bitmap, sector_nr, num_sectors); bitmap_set(&stripe->error_bitmap, sector_nr, num_sectors); } else { bitmap_clear(&stripe->io_error_bitmap, sector_nr, num_sectors); } bio_put(&bbio->bio); if (atomic_dec_and_test(&stripe->pending_io)) { wake_up(&stripe->io_wait); INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker); queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work); } } static void scrub_write_endio(struct btrfs_bio *bbio) { struct scrub_stripe *stripe = bbio->private; struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct bio_vec *bvec; int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio)); u32 bio_size = 0; int i; bio_for_each_bvec_all(bvec, &bbio->bio, i) bio_size += bvec->bv_len; if (bbio->bio.bi_status) { unsigned long flags; spin_lock_irqsave(&stripe->write_error_lock, flags); bitmap_set(&stripe->write_error_bitmap, sector_nr, bio_size >> fs_info->sectorsize_bits); spin_unlock_irqrestore(&stripe->write_error_lock, flags); } bio_put(&bbio->bio); if (atomic_dec_and_test(&stripe->pending_io)) wake_up(&stripe->io_wait); } static void scrub_submit_write_bio(struct scrub_ctx *sctx, struct scrub_stripe *stripe, struct btrfs_bio *bbio, bool dev_replace) { struct btrfs_fs_info *fs_info = sctx->fs_info; u32 bio_len = bbio->bio.bi_iter.bi_size; u32 bio_off = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT) - stripe->logical; fill_writer_pointer_gap(sctx, stripe->physical + bio_off); atomic_inc(&stripe->pending_io); btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace); if (!btrfs_is_zoned(fs_info)) return; /* * For zoned writeback, queue depth must be 1, thus we must wait for * the write to finish before the next write. */ wait_scrub_stripe_io(stripe); /* * And also need to update the write pointer if write finished * successfully. */ if (!test_bit(bio_off >> fs_info->sectorsize_bits, &stripe->write_error_bitmap)) sctx->write_pointer += bio_len; } /* * Submit the write bio(s) for the sectors specified by @write_bitmap. * * Here we utilize btrfs_submit_repair_write(), which has some extra benefits: * * - Only needs logical bytenr and mirror_num * Just like the scrub read path * * - Would only result in writes to the specified mirror * Unlike the regular writeback path, which would write back to all stripes * * - Handle dev-replace and read-repair writeback differently */ static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe, unsigned long write_bitmap, bool dev_replace) { struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct btrfs_bio *bbio = NULL; int sector_nr; for_each_set_bit(sector_nr, &write_bitmap, stripe->nr_sectors) { struct page *page = scrub_stripe_get_page(stripe, sector_nr); unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr); int ret; /* We should only writeback sectors covered by an extent. */ ASSERT(test_bit(sector_nr, &stripe->extent_sector_bitmap)); /* Cannot merge with previous sector, submit the current one. */ if (bbio && sector_nr && !test_bit(sector_nr - 1, &write_bitmap)) { scrub_submit_write_bio(sctx, stripe, bbio, dev_replace); bbio = NULL; } if (!bbio) { bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_WRITE, fs_info, scrub_write_endio, stripe); bbio->bio.bi_iter.bi_sector = (stripe->logical + (sector_nr << fs_info->sectorsize_bits)) >> SECTOR_SHIFT; } ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); ASSERT(ret == fs_info->sectorsize); } if (bbio) scrub_submit_write_bio(sctx, stripe, bbio, dev_replace); } /* * Throttling of IO submission, bandwidth-limit based, the timeslice is 1 * second. Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max. */ static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *device, unsigned int bio_size) { const int time_slice = 1000; s64 delta; ktime_t now; u32 div; u64 bwlimit; bwlimit = READ_ONCE(device->scrub_speed_max); if (bwlimit == 0) return; /* * Slice is divided into intervals when the IO is submitted, adjust by * bwlimit and maximum of 64 intervals. */ div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024))); div = min_t(u32, 64, div); /* Start new epoch, set deadline */ now = ktime_get(); if (sctx->throttle_deadline == 0) { sctx->throttle_deadline = ktime_add_ms(now, time_slice / div); sctx->throttle_sent = 0; } /* Still in the time to send? */ if (ktime_before(now, sctx->throttle_deadline)) { /* If current bio is within the limit, send it */ sctx->throttle_sent += bio_size; if (sctx->throttle_sent <= div_u64(bwlimit, div)) return; /* We're over the limit, sleep until the rest of the slice */ delta = ktime_ms_delta(sctx->throttle_deadline, now); } else { /* New request after deadline, start new epoch */ delta = 0; } if (delta) { long timeout; timeout = div_u64(delta * HZ, 1000); schedule_timeout_interruptible(timeout); } /* Next call will start the deadline period */ sctx->throttle_deadline = 0; } /* * Given a physical address, this will calculate it's * logical offset. if this is a parity stripe, it will return * the most left data stripe's logical offset. * * return 0 if it is a data stripe, 1 means parity stripe. */ static int get_raid56_logic_offset(u64 physical, int num, struct btrfs_chunk_map *map, u64 *offset, u64 *stripe_start) { int i; int j = 0; u64 last_offset; const int data_stripes = nr_data_stripes(map); last_offset = (physical - map->stripes[num].physical) * data_stripes; if (stripe_start) *stripe_start = last_offset; *offset = last_offset; for (i = 0; i < data_stripes; i++) { u32 stripe_nr; u32 stripe_index; u32 rot; *offset = last_offset + btrfs_stripe_nr_to_offset(i); stripe_nr = (u32)(*offset >> BTRFS_STRIPE_LEN_SHIFT) / data_stripes; /* Work out the disk rotation on this stripe-set */ rot = stripe_nr % map->num_stripes; /* calculate which stripe this data locates */ rot += i; stripe_index = rot % map->num_stripes; if (stripe_index == num) return 0; if (stripe_index < num) j++; } *offset = last_offset + btrfs_stripe_nr_to_offset(j); return 1; } /* * Return 0 if the extent item range covers any byte of the range. * Return <0 if the extent item is before @search_start. * Return >0 if the extent item is after @start_start + @search_len. */ static int compare_extent_item_range(struct btrfs_path *path, u64 search_start, u64 search_len) { struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info; u64 len; struct btrfs_key key; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY || key.type == BTRFS_METADATA_ITEM_KEY); if (key.type == BTRFS_METADATA_ITEM_KEY) len = fs_info->nodesize; else len = key.offset; if (key.objectid + len <= search_start) return -1; if (key.objectid >= search_start + search_len) return 1; return 0; } /* * Locate one extent item which covers any byte in range * [@search_start, @search_start + @search_length) * * If the path is not initialized, we will initialize the search by doing * a btrfs_search_slot(). * If the path is already initialized, we will use the path as the initial * slot, to avoid duplicated btrfs_search_slot() calls. * * NOTE: If an extent item starts before @search_start, we will still * return the extent item. This is for data extent crossing stripe boundary. * * Return 0 if we found such extent item, and @path will point to the extent item. * Return >0 if no such extent item can be found, and @path will be released. * Return <0 if hit fatal error, and @path will be released. */ static int find_first_extent_item(struct btrfs_root *extent_root, struct btrfs_path *path, u64 search_start, u64 search_len) { struct btrfs_fs_info *fs_info = extent_root->fs_info; struct btrfs_key key; int ret; /* Continue using the existing path */ if (path->nodes[0]) goto search_forward; if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) key.type = BTRFS_METADATA_ITEM_KEY; else key.type = BTRFS_EXTENT_ITEM_KEY; key.objectid = search_start; key.offset = (u64)-1; ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; if (ret == 0) { /* * Key with offset -1 found, there would have to exist an extent * item with such offset, but this is out of the valid range. */ btrfs_release_path(path); return -EUCLEAN; } /* * Here we intentionally pass 0 as @min_objectid, as there could be * an extent item starting before @search_start. */ ret = btrfs_previous_extent_item(extent_root, path, 0); if (ret < 0) return ret; /* * No matter whether we have found an extent item, the next loop will * properly do every check on the key. */ search_forward: while (true) { btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.objectid >= search_start + search_len) break; if (key.type != BTRFS_METADATA_ITEM_KEY && key.type != BTRFS_EXTENT_ITEM_KEY) goto next; ret = compare_extent_item_range(path, search_start, search_len); if (ret == 0) return ret; if (ret > 0) break; next: ret = btrfs_next_item(extent_root, path); if (ret) { /* Either no more items or a fatal error. */ btrfs_release_path(path); return ret; } } btrfs_release_path(path); return 1; } static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret, u64 *size_ret, u64 *flags_ret, u64 *generation_ret) { struct btrfs_key key; struct btrfs_extent_item *ei; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); ASSERT(key.type == BTRFS_METADATA_ITEM_KEY || key.type == BTRFS_EXTENT_ITEM_KEY); *extent_start_ret = key.objectid; if (key.type == BTRFS_METADATA_ITEM_KEY) *size_ret = path->nodes[0]->fs_info->nodesize; else *size_ret = key.offset; ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item); *flags_ret = btrfs_extent_flags(path->nodes[0], ei); *generation_ret = btrfs_extent_generation(path->nodes[0], ei); } static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical, u64 physical, u64 physical_end) { struct btrfs_fs_info *fs_info = sctx->fs_info; int ret = 0; if (!btrfs_is_zoned(fs_info)) return 0; mutex_lock(&sctx->wr_lock); if (sctx->write_pointer < physical_end) { ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical, physical, sctx->write_pointer); if (ret) btrfs_err(fs_info, "zoned: failed to recover write pointer"); } mutex_unlock(&sctx->wr_lock); btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical); return ret; } static void fill_one_extent_info(struct btrfs_fs_info *fs_info, struct scrub_stripe *stripe, u64 extent_start, u64 extent_len, u64 extent_flags, u64 extent_gen) { for (u64 cur_logical = max(stripe->logical, extent_start); cur_logical < min(stripe->logical + BTRFS_STRIPE_LEN, extent_start + extent_len); cur_logical += fs_info->sectorsize) { const int nr_sector = (cur_logical - stripe->logical) >> fs_info->sectorsize_bits; struct scrub_sector_verification *sector = &stripe->sectors[nr_sector]; set_bit(nr_sector, &stripe->extent_sector_bitmap); if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { sector->is_metadata = true; sector->generation = extent_gen; } } } static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe) { stripe->extent_sector_bitmap = 0; stripe->init_error_bitmap = 0; stripe->init_nr_io_errors = 0; stripe->init_nr_csum_errors = 0; stripe->init_nr_meta_errors = 0; stripe->error_bitmap = 0; stripe->io_error_bitmap = 0; stripe->csum_error_bitmap = 0; stripe->meta_error_bitmap = 0; } /* * Locate one stripe which has at least one extent in its range. * * Return 0 if found such stripe, and store its info into @stripe. * Return >0 if there is no such stripe in the specified range. * Return <0 for error. */ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, struct btrfs_path *extent_path, struct btrfs_path *csum_path, struct btrfs_device *dev, u64 physical, int mirror_num, u64 logical_start, u32 logical_len, struct scrub_stripe *stripe) { struct btrfs_fs_info *fs_info = bg->fs_info; struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bg->start); struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bg->start); const u64 logical_end = logical_start + logical_len; u64 cur_logical = logical_start; u64 stripe_end; u64 extent_start; u64 extent_len; u64 extent_flags; u64 extent_gen; int ret; memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) * stripe->nr_sectors); scrub_stripe_reset_bitmaps(stripe); /* The range must be inside the bg. */ ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length); ret = find_first_extent_item(extent_root, extent_path, logical_start, logical_len); /* Either error or not found. */ if (ret) goto out; get_extent_info(extent_path, &extent_start, &extent_len, &extent_flags, &extent_gen); if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) stripe->nr_meta_extents++; if (extent_flags & BTRFS_EXTENT_FLAG_DATA) stripe->nr_data_extents++; cur_logical = max(extent_start, cur_logical); /* * Round down to stripe boundary. * * The extra calculation against bg->start is to handle block groups * whose logical bytenr is not BTRFS_STRIPE_LEN aligned. */ stripe->logical = round_down(cur_logical - bg->start, BTRFS_STRIPE_LEN) + bg->start; stripe->physical = physical + stripe->logical - logical_start; stripe->dev = dev; stripe->bg = bg; stripe->mirror_num = mirror_num; stripe_end = stripe->logical + BTRFS_STRIPE_LEN - 1; /* Fill the first extent info into stripe->sectors[] array. */ fill_one_extent_info(fs_info, stripe, extent_start, extent_len, extent_flags, extent_gen); cur_logical = extent_start + extent_len; /* Fill the extent info for the remaining sectors. */ while (cur_logical <= stripe_end) { ret = find_first_extent_item(extent_root, extent_path, cur_logical, stripe_end - cur_logical + 1); if (ret < 0) goto out; if (ret > 0) { ret = 0; break; } get_extent_info(extent_path, &extent_start, &extent_len, &extent_flags, &extent_gen); if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) stripe->nr_meta_extents++; if (extent_flags & BTRFS_EXTENT_FLAG_DATA) stripe->nr_data_extents++; fill_one_extent_info(fs_info, stripe, extent_start, extent_len, extent_flags, extent_gen); cur_logical = extent_start + extent_len; } /* Now fill the data csum. */ if (bg->flags & BTRFS_BLOCK_GROUP_DATA) { int sector_nr; unsigned long csum_bitmap = 0; /* Csum space should have already been allocated. */ ASSERT(stripe->csums); /* * Our csum bitmap should be large enough, as BTRFS_STRIPE_LEN * should contain at most 16 sectors. */ ASSERT(BITS_PER_LONG >= BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); ret = btrfs_lookup_csums_bitmap(csum_root, csum_path, stripe->logical, stripe_end, stripe->csums, &csum_bitmap); if (ret < 0) goto out; if (ret > 0) ret = 0; for_each_set_bit(sector_nr, &csum_bitmap, stripe->nr_sectors) { stripe->sectors[sector_nr].csum = stripe->csums + sector_nr * fs_info->csum_size; } } set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state); out: return ret; } static void scrub_reset_stripe(struct scrub_stripe *stripe) { scrub_stripe_reset_bitmaps(stripe); stripe->nr_meta_extents = 0; stripe->nr_data_extents = 0; stripe->state = 0; for (int i = 0; i < stripe->nr_sectors; i++) { stripe->sectors[i].is_metadata = false; stripe->sectors[i].csum = NULL; stripe->sectors[i].generation = 0; } } static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, struct scrub_stripe *stripe) { struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct btrfs_bio *bbio = NULL; unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start + stripe->bg->length - stripe->logical) >> fs_info->sectorsize_bits; u64 stripe_len = BTRFS_STRIPE_LEN; int mirror = stripe->mirror_num; int i; atomic_inc(&stripe->pending_io); for_each_set_bit(i, &stripe->extent_sector_bitmap, stripe->nr_sectors) { struct page *page = scrub_stripe_get_page(stripe, i); unsigned int pgoff = scrub_stripe_get_page_offset(stripe, i); /* We're beyond the chunk boundary, no need to read anymore. */ if (i >= nr_sectors) break; /* The current sector cannot be merged, submit the bio. */ if (bbio && ((i > 0 && !test_bit(i - 1, &stripe->extent_sector_bitmap)) || bbio->bio.bi_iter.bi_size >= stripe_len)) { ASSERT(bbio->bio.bi_iter.bi_size); atomic_inc(&stripe->pending_io); btrfs_submit_bio(bbio, mirror); bbio = NULL; } if (!bbio) { struct btrfs_io_stripe io_stripe = {}; struct btrfs_io_context *bioc = NULL; const u64 logical = stripe->logical + (i << fs_info->sectorsize_bits); int err; bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ, fs_info, scrub_read_endio, stripe); bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT; io_stripe.is_scrub = true; err = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, &stripe_len, &bioc, &io_stripe, &mirror); btrfs_put_bioc(bioc); if (err) { btrfs_bio_end_io(bbio, errno_to_blk_status(err)); return; } } __bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); } if (bbio) { ASSERT(bbio->bio.bi_iter.bi_size); atomic_inc(&stripe->pending_io); btrfs_submit_bio(bbio, mirror); } if (atomic_dec_and_test(&stripe->pending_io)) { wake_up(&stripe->io_wait); INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker); queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work); } } static void scrub_submit_initial_read(struct scrub_ctx *sctx, struct scrub_stripe *stripe) { struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_bio *bbio; unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start + stripe->bg->length - stripe->logical) >> fs_info->sectorsize_bits; int mirror = stripe->mirror_num; ASSERT(stripe->bg); ASSERT(stripe->mirror_num > 0); ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state)); if (btrfs_need_stripe_tree_update(fs_info, stripe->bg->flags)) { scrub_submit_extent_sector_read(sctx, stripe); return; } bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info, scrub_read_endio, stripe); bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT; /* Read the whole range inside the chunk boundary. */ for (unsigned int cur = 0; cur < nr_sectors; cur++) { struct page *page = scrub_stripe_get_page(stripe, cur); unsigned int pgoff = scrub_stripe_get_page_offset(stripe, cur); int ret; ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); /* We should have allocated enough bio vectors. */ ASSERT(ret == fs_info->sectorsize); } atomic_inc(&stripe->pending_io); /* * For dev-replace, either user asks to avoid the source dev, or * the device is missing, we try the next mirror instead. */ if (sctx->is_dev_replace && (fs_info->dev_replace.cont_reading_from_srcdev_mode == BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID || !stripe->dev->bdev)) { int num_copies = btrfs_num_copies(fs_info, stripe->bg->start, stripe->bg->length); mirror = calc_next_mirror(mirror, num_copies); } btrfs_submit_bio(bbio, mirror); } static bool stripe_has_metadata_error(struct scrub_stripe *stripe) { int i; for_each_set_bit(i, &stripe->error_bitmap, stripe->nr_sectors) { if (stripe->sectors[i].is_metadata) { struct btrfs_fs_info *fs_info = stripe->bg->fs_info; btrfs_err(fs_info, "stripe %llu has unrepaired metadata sector at %llu", stripe->logical, stripe->logical + (i << fs_info->sectorsize_bits)); return true; } } return false; } static void submit_initial_group_read(struct scrub_ctx *sctx, unsigned int first_slot, unsigned int nr_stripes) { struct blk_plug plug; ASSERT(first_slot < SCRUB_TOTAL_STRIPES); ASSERT(first_slot + nr_stripes <= SCRUB_TOTAL_STRIPES); scrub_throttle_dev_io(sctx, sctx->stripes[0].dev, btrfs_stripe_nr_to_offset(nr_stripes)); blk_start_plug(&plug); for (int i = 0; i < nr_stripes; i++) { struct scrub_stripe *stripe = &sctx->stripes[first_slot + i]; /* Those stripes should be initialized. */ ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state)); scrub_submit_initial_read(sctx, stripe); } blk_finish_plug(&plug); } static int flush_scrub_stripes(struct scrub_ctx *sctx) { struct btrfs_fs_info *fs_info = sctx->fs_info; struct scrub_stripe *stripe; const int nr_stripes = sctx->cur_stripe; int ret = 0; if (!nr_stripes) return 0; ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &sctx->stripes[0].state)); /* Submit the stripes which are populated but not submitted. */ if (nr_stripes % SCRUB_STRIPES_PER_GROUP) { const int first_slot = round_down(nr_stripes, SCRUB_STRIPES_PER_GROUP); submit_initial_group_read(sctx, first_slot, nr_stripes - first_slot); } for (int i = 0; i < nr_stripes; i++) { stripe = &sctx->stripes[i]; wait_event(stripe->repair_wait, test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state)); } /* Submit for dev-replace. */ if (sctx->is_dev_replace) { /* * For dev-replace, if we know there is something wrong with * metadata, we should immediately abort. */ for (int i = 0; i < nr_stripes; i++) { if (stripe_has_metadata_error(&sctx->stripes[i])) { ret = -EIO; goto out; } } for (int i = 0; i < nr_stripes; i++) { unsigned long good; stripe = &sctx->stripes[i]; ASSERT(stripe->dev == fs_info->dev_replace.srcdev); bitmap_andnot(&good, &stripe->extent_sector_bitmap, &stripe->error_bitmap, stripe->nr_sectors); scrub_write_sectors(sctx, stripe, good, true); } } /* Wait for the above writebacks to finish. */ for (int i = 0; i < nr_stripes; i++) { stripe = &sctx->stripes[i]; wait_scrub_stripe_io(stripe); scrub_reset_stripe(stripe); } out: sctx->cur_stripe = 0; return ret; } static void raid56_scrub_wait_endio(struct bio *bio) { complete(bio->bi_private); } static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg, struct btrfs_device *dev, int mirror_num, u64 logical, u32 length, u64 physical, u64 *found_logical_ret) { struct scrub_stripe *stripe; int ret; /* * There should always be one slot left, as caller filling the last * slot should flush them all. */ ASSERT(sctx->cur_stripe < SCRUB_TOTAL_STRIPES); /* @found_logical_ret must be specified. */ ASSERT(found_logical_ret); stripe = &sctx->stripes[sctx->cur_stripe]; scrub_reset_stripe(stripe); ret = scrub_find_fill_first_stripe(bg, &sctx->extent_path, &sctx->csum_path, dev, physical, mirror_num, logical, length, stripe); /* Either >0 as no more extents or <0 for error. */ if (ret) return ret; *found_logical_ret = stripe->logical; sctx->cur_stripe++; /* We filled one group, submit it. */ if (sctx->cur_stripe % SCRUB_STRIPES_PER_GROUP == 0) { const int first_slot = sctx->cur_stripe - SCRUB_STRIPES_PER_GROUP; submit_initial_group_read(sctx, first_slot, SCRUB_STRIPES_PER_GROUP); } /* Last slot used, flush them all. */ if (sctx->cur_stripe == SCRUB_TOTAL_STRIPES) return flush_scrub_stripes(sctx); return 0; } static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, struct btrfs_device *scrub_dev, struct btrfs_block_group *bg, struct btrfs_chunk_map *map, u64 full_stripe_start) { DECLARE_COMPLETION_ONSTACK(io_done); struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_raid_bio *rbio; struct btrfs_io_context *bioc = NULL; struct btrfs_path extent_path = { 0 }; struct btrfs_path csum_path = { 0 }; struct bio *bio; struct scrub_stripe *stripe; bool all_empty = true; const int data_stripes = nr_data_stripes(map); unsigned long extent_bitmap = 0; u64 length = btrfs_stripe_nr_to_offset(data_stripes); int ret; ASSERT(sctx->raid56_data_stripes); /* * For data stripe search, we cannot re-use the same extent/csum paths, * as the data stripe bytenr may be smaller than previous extent. Thus * we have to use our own extent/csum paths. */ extent_path.search_commit_root = 1; extent_path.skip_locking = 1; csum_path.search_commit_root = 1; csum_path.skip_locking = 1; for (int i = 0; i < data_stripes; i++) { int stripe_index; int rot; u64 physical; stripe = &sctx->raid56_data_stripes[i]; rot = div_u64(full_stripe_start - bg->start, data_stripes) >> BTRFS_STRIPE_LEN_SHIFT; stripe_index = (i + rot) % map->num_stripes; physical = map->stripes[stripe_index].physical + btrfs_stripe_nr_to_offset(rot); scrub_reset_stripe(stripe); set_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state); ret = scrub_find_fill_first_stripe(bg, &extent_path, &csum_path, map->stripes[stripe_index].dev, physical, 1, full_stripe_start + btrfs_stripe_nr_to_offset(i), BTRFS_STRIPE_LEN, stripe); if (ret < 0) goto out; /* * No extent in this data stripe, need to manually mark them * initialized to make later read submission happy. */ if (ret > 0) { stripe->logical = full_stripe_start + btrfs_stripe_nr_to_offset(i); stripe->dev = map->stripes[stripe_index].dev; stripe->mirror_num = 1; set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state); } } /* Check if all data stripes are empty. */ for (int i = 0; i < data_stripes; i++) { stripe = &sctx->raid56_data_stripes[i]; if (!bitmap_empty(&stripe->extent_sector_bitmap, stripe->nr_sectors)) { all_empty = false; break; } } if (all_empty) { ret = 0; goto out; } for (int i = 0; i < data_stripes; i++) { stripe = &sctx->raid56_data_stripes[i]; scrub_submit_initial_read(sctx, stripe); } for (int i = 0; i < data_stripes; i++) { stripe = &sctx->raid56_data_stripes[i]; wait_event(stripe->repair_wait, test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state)); } /* For now, no zoned support for RAID56. */ ASSERT(!btrfs_is_zoned(sctx->fs_info)); /* * Now all data stripes are properly verified. Check if we have any * unrepaired, if so abort immediately or we could further corrupt the * P/Q stripes. * * During the loop, also populate extent_bitmap. */ for (int i = 0; i < data_stripes; i++) { unsigned long error; stripe = &sctx->raid56_data_stripes[i]; /* * We should only check the errors where there is an extent. * As we may hit an empty data stripe while it's missing. */ bitmap_and(&error, &stripe->error_bitmap, &stripe->extent_sector_bitmap, stripe->nr_sectors); if (!bitmap_empty(&error, stripe->nr_sectors)) { btrfs_err(fs_info, "unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl", full_stripe_start, i, stripe->nr_sectors, &error); ret = -EIO; goto out; } bitmap_or(&extent_bitmap, &extent_bitmap, &stripe->extent_sector_bitmap, stripe->nr_sectors); } /* Now we can check and regenerate the P/Q stripe. */ bio = bio_alloc(NULL, 1, REQ_OP_READ, GFP_NOFS); bio->bi_iter.bi_sector = full_stripe_start >> SECTOR_SHIFT; bio->bi_private = &io_done; bio->bi_end_io = raid56_scrub_wait_endio; btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start, &length, &bioc, NULL, NULL); if (ret < 0) { btrfs_put_bioc(bioc); btrfs_bio_counter_dec(fs_info); goto out; } rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, scrub_dev, &extent_bitmap, BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); btrfs_put_bioc(bioc); if (!rbio) { ret = -ENOMEM; btrfs_bio_counter_dec(fs_info); goto out; } /* Use the recovered stripes as cache to avoid read them from disk again. */ for (int i = 0; i < data_stripes; i++) { stripe = &sctx->raid56_data_stripes[i]; raid56_parity_cache_data_pages(rbio, stripe->pages, full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT)); } raid56_parity_submit_scrub_rbio(rbio); wait_for_completion_io(&io_done); ret = blk_status_to_errno(bio->bi_status); bio_put(bio); btrfs_bio_counter_dec(fs_info); btrfs_release_path(&extent_path); btrfs_release_path(&csum_path); out: return ret; } /* * Scrub one range which can only has simple mirror based profile. * (Including all range in SINGLE/DUP/RAID1/RAID1C*, and each stripe in * RAID0/RAID10). * * Since we may need to handle a subset of block group, we need @logical_start * and @logical_length parameter. */ static int scrub_simple_mirror(struct scrub_ctx *sctx, struct btrfs_block_group *bg, struct btrfs_chunk_map *map, u64 logical_start, u64 logical_length, struct btrfs_device *device, u64 physical, int mirror_num) { struct btrfs_fs_info *fs_info = sctx->fs_info; const u64 logical_end = logical_start + logical_length; u64 cur_logical = logical_start; int ret; /* The range must be inside the bg */ ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length); /* Go through each extent items inside the logical range */ while (cur_logical < logical_end) { u64 found_logical = U64_MAX; u64 cur_physical = physical + cur_logical - logical_start; /* Canceled? */ if (atomic_read(&fs_info->scrub_cancel_req) || atomic_read(&sctx->cancel_req)) { ret = -ECANCELED; break; } /* Paused? */ if (atomic_read(&fs_info->scrub_pause_req)) { /* Push queued extents */ scrub_blocked_if_needed(fs_info); } /* Block group removed? */ spin_lock(&bg->lock); if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) { spin_unlock(&bg->lock); ret = 0; break; } spin_unlock(&bg->lock); ret = queue_scrub_stripe(sctx, bg, device, mirror_num, cur_logical, logical_end - cur_logical, cur_physical, &found_logical); if (ret > 0) { /* No more extent, just update the accounting */ sctx->stat.last_physical = physical + logical_length; ret = 0; break; } if (ret < 0) break; /* queue_scrub_stripe() returned 0, @found_logical must be updated. */ ASSERT(found_logical != U64_MAX); cur_logical = found_logical + BTRFS_STRIPE_LEN; /* Don't hold CPU for too long time */ cond_resched(); } return ret; } /* Calculate the full stripe length for simple stripe based profiles */ static u64 simple_stripe_full_stripe_len(const struct btrfs_chunk_map *map) { ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)); return btrfs_stripe_nr_to_offset(map->num_stripes / map->sub_stripes); } /* Get the logical bytenr for the stripe */ static u64 simple_stripe_get_logical(struct btrfs_chunk_map *map, struct btrfs_block_group *bg, int stripe_index) { ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)); ASSERT(stripe_index < map->num_stripes); /* * (stripe_index / sub_stripes) gives how many data stripes we need to * skip. */ return btrfs_stripe_nr_to_offset(stripe_index / map->sub_stripes) + bg->start; } /* Get the mirror number for the stripe */ static int simple_stripe_mirror_num(struct btrfs_chunk_map *map, int stripe_index) { ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)); ASSERT(stripe_index < map->num_stripes); /* For RAID0, it's fixed to 1, for RAID10 it's 0,1,0,1... */ return stripe_index % map->sub_stripes + 1; } static int scrub_simple_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg, struct btrfs_chunk_map *map, struct btrfs_device *device, int stripe_index) { const u64 logical_increment = simple_stripe_full_stripe_len(map); const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index); const u64 orig_physical = map->stripes[stripe_index].physical; const int mirror_num = simple_stripe_mirror_num(map, stripe_index); u64 cur_logical = orig_logical; u64 cur_physical = orig_physical; int ret = 0; while (cur_logical < bg->start + bg->length) { /* * Inside each stripe, RAID0 is just SINGLE, and RAID10 is * just RAID1, so we can reuse scrub_simple_mirror() to scrub * this stripe. */ ret = scrub_simple_mirror(sctx, bg, map, cur_logical, BTRFS_STRIPE_LEN, device, cur_physical, mirror_num); if (ret) return ret; /* Skip to next stripe which belongs to the target device */ cur_logical += logical_increment; /* For physical offset, we just go to next stripe */ cur_physical += BTRFS_STRIPE_LEN; } return ret; } static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg, struct btrfs_chunk_map *map, struct btrfs_device *scrub_dev, int stripe_index) { struct btrfs_fs_info *fs_info = sctx->fs_info; const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK; const u64 chunk_logical = bg->start; int ret; int ret2; u64 physical = map->stripes[stripe_index].physical; const u64 dev_stripe_len = btrfs_calc_stripe_length(map); const u64 physical_end = physical + dev_stripe_len; u64 logical; u64 logic_end; /* The logical increment after finishing one stripe */ u64 increment; /* Offset inside the chunk */ u64 offset; u64 stripe_logical; int stop_loop = 0; /* Extent_path should be released by now. */ ASSERT(sctx->extent_path.nodes[0] == NULL); scrub_blocked_if_needed(fs_info); if (sctx->is_dev_replace && btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) { mutex_lock(&sctx->wr_lock); sctx->write_pointer = physical; mutex_unlock(&sctx->wr_lock); } /* Prepare the extra data stripes used by RAID56. */ if (profile & BTRFS_BLOCK_GROUP_RAID56_MASK) { ASSERT(sctx->raid56_data_stripes == NULL); sctx->raid56_data_stripes = kcalloc(nr_data_stripes(map), sizeof(struct scrub_stripe), GFP_KERNEL); if (!sctx->raid56_data_stripes) { ret = -ENOMEM; goto out; } for (int i = 0; i < nr_data_stripes(map); i++) { ret = init_scrub_stripe(fs_info, &sctx->raid56_data_stripes[i]); if (ret < 0) goto out; sctx->raid56_data_stripes[i].bg = bg; sctx->raid56_data_stripes[i].sctx = sctx; } } /* * There used to be a big double loop to handle all profiles using the * same routine, which grows larger and more gross over time. * * So here we handle each profile differently, so simpler profiles * have simpler scrubbing function. */ if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_RAID56_MASK))) { /* * Above check rules out all complex profile, the remaining * profiles are SINGLE|DUP|RAID1|RAID1C*, which is simple * mirrored duplication without stripe. * * Only @physical and @mirror_num needs to calculated using * @stripe_index. */ ret = scrub_simple_mirror(sctx, bg, map, bg->start, bg->length, scrub_dev, map->stripes[stripe_index].physical, stripe_index + 1); offset = 0; goto out; } if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { ret = scrub_simple_stripe(sctx, bg, map, scrub_dev, stripe_index); offset = btrfs_stripe_nr_to_offset(stripe_index / map->sub_stripes); goto out; } /* Only RAID56 goes through the old code */ ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK); ret = 0; /* Calculate the logical end of the stripe */ get_raid56_logic_offset(physical_end, stripe_index, map, &logic_end, NULL); logic_end += chunk_logical; /* Initialize @offset in case we need to go to out: label */ get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL); increment = btrfs_stripe_nr_to_offset(nr_data_stripes(map)); /* * Due to the rotation, for RAID56 it's better to iterate each stripe * using their physical offset. */ while (physical < physical_end) { ret = get_raid56_logic_offset(physical, stripe_index, map, &logical, &stripe_logical); logical += chunk_logical; if (ret) { /* it is parity strip */ stripe_logical += chunk_logical; ret = scrub_raid56_parity_stripe(sctx, scrub_dev, bg, map, stripe_logical); if (ret) goto out; goto next; } /* * Now we're at a data stripe, scrub each extents in the range. * * At this stage, if we ignore the repair part, inside each data * stripe it is no different than SINGLE profile. * We can reuse scrub_simple_mirror() here, as the repair part * is still based on @mirror_num. */ ret = scrub_simple_mirror(sctx, bg, map, logical, BTRFS_STRIPE_LEN, scrub_dev, physical, 1); if (ret < 0) goto out; next: logical += increment; physical += BTRFS_STRIPE_LEN; spin_lock(&sctx->stat_lock); if (stop_loop) sctx->stat.last_physical = map->stripes[stripe_index].physical + dev_stripe_len; else sctx->stat.last_physical = physical; spin_unlock(&sctx->stat_lock); if (stop_loop) break; } out: ret2 = flush_scrub_stripes(sctx); if (!ret) ret = ret2; btrfs_release_path(&sctx->extent_path); btrfs_release_path(&sctx->csum_path); if (sctx->raid56_data_stripes) { for (int i = 0; i < nr_data_stripes(map); i++) release_scrub_stripe(&sctx->raid56_data_stripes[i]); kfree(sctx->raid56_data_stripes); sctx->raid56_data_stripes = NULL; } if (sctx->is_dev_replace && ret >= 0) { int ret2; ret2 = sync_write_pointer_for_zoned(sctx, chunk_logical + offset, map->stripes[stripe_index].physical, physical_end); if (ret2) ret = ret2; } return ret < 0 ? ret : 0; } static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, struct btrfs_block_group *bg, struct btrfs_device *scrub_dev, u64 dev_offset, u64 dev_extent_len) { struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_chunk_map *map; int i; int ret = 0; map = btrfs_find_chunk_map(fs_info, bg->start, bg->length); if (!map) { /* * Might have been an unused block group deleted by the cleaner * kthread or relocation. */ spin_lock(&bg->lock); if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) ret = -EINVAL; spin_unlock(&bg->lock); return ret; } if (map->start != bg->start) goto out; if (map->chunk_len < dev_extent_len) goto out; for (i = 0; i < map->num_stripes; ++i) { if (map->stripes[i].dev->bdev == scrub_dev->bdev && map->stripes[i].physical == dev_offset) { ret = scrub_stripe(sctx, bg, map, scrub_dev, i); if (ret) goto out; } } out: btrfs_free_chunk_map(map); return ret; } static int finish_extent_writes_for_zoned(struct btrfs_root *root, struct btrfs_block_group *cache) { struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_trans_handle *trans; if (!btrfs_is_zoned(fs_info)) return 0; btrfs_wait_block_group_reservations(cache); btrfs_wait_nocow_writers(cache); btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length); trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); return btrfs_commit_transaction(trans); } static noinline_for_stack int scrub_enumerate_chunks(struct scrub_ctx *sctx, struct btrfs_device *scrub_dev, u64 start, u64 end) { struct btrfs_dev_extent *dev_extent = NULL; struct btrfs_path *path; struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_root *root = fs_info->dev_root; u64 chunk_offset; int ret = 0; int ro_set; int slot; struct extent_buffer *l; struct btrfs_key key; struct btrfs_key found_key; struct btrfs_block_group *cache; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; path = btrfs_alloc_path(); if (!path) return -ENOMEM; path->reada = READA_FORWARD; path->search_commit_root = 1; path->skip_locking = 1; key.objectid = scrub_dev->devid; key.offset = 0ull; key.type = BTRFS_DEV_EXTENT_KEY; while (1) { u64 dev_extent_len; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) break; if (ret > 0) { if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { ret = btrfs_next_leaf(root, path); if (ret < 0) break; if (ret > 0) { ret = 0; break; } } else { ret = 0; } } l = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(l, &found_key, slot); if (found_key.objectid != scrub_dev->devid) break; if (found_key.type != BTRFS_DEV_EXTENT_KEY) break; if (found_key.offset >= end) break; if (found_key.offset < key.offset) break; dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); dev_extent_len = btrfs_dev_extent_length(l, dev_extent); if (found_key.offset + dev_extent_len <= start) goto skip; chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); /* * get a reference on the corresponding block group to prevent * the chunk from going away while we scrub it */ cache = btrfs_lookup_block_group(fs_info, chunk_offset); /* some chunks are removed but not committed to disk yet, * continue scrubbing */ if (!cache) goto skip; ASSERT(cache->start <= chunk_offset); /* * We are using the commit root to search for device extents, so * that means we could have found a device extent item from a * block group that was deleted in the current transaction. The * logical start offset of the deleted block group, stored at * @chunk_offset, might be part of the logical address range of * a new block group (which uses different physical extents). * In this case btrfs_lookup_block_group() has returned the new * block group, and its start address is less than @chunk_offset. * * We skip such new block groups, because it's pointless to * process them, as we won't find their extents because we search * for them using the commit root of the extent tree. For a device * replace it's also fine to skip it, we won't miss copying them * to the target device because we have the write duplication * setup through the regular write path (by btrfs_map_block()), * and we have committed a transaction when we started the device * replace, right after setting up the device replace state. */ if (cache->start < chunk_offset) { btrfs_put_block_group(cache); goto skip; } if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) { if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags)) { btrfs_put_block_group(cache); goto skip; } } /* * Make sure that while we are scrubbing the corresponding block * group doesn't get its logical address and its device extents * reused for another block group, which can possibly be of a * different type and different profile. We do this to prevent * false error detections and crashes due to bogus attempts to * repair extents. */ spin_lock(&cache->lock); if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) { spin_unlock(&cache->lock); btrfs_put_block_group(cache); goto skip; } btrfs_freeze_block_group(cache); spin_unlock(&cache->lock); /* * we need call btrfs_inc_block_group_ro() with scrubs_paused, * to avoid deadlock caused by: * btrfs_inc_block_group_ro() * -> btrfs_wait_for_commit() * -> btrfs_commit_transaction() * -> btrfs_scrub_pause() */ scrub_pause_on(fs_info); /* * Don't do chunk preallocation for scrub. * * This is especially important for SYSTEM bgs, or we can hit * -EFBIG from btrfs_finish_chunk_alloc() like: * 1. The only SYSTEM bg is marked RO. * Since SYSTEM bg is small, that's pretty common. * 2. New SYSTEM bg will be allocated * Due to regular version will allocate new chunk. * 3. New SYSTEM bg is empty and will get cleaned up * Before cleanup really happens, it's marked RO again. * 4. Empty SYSTEM bg get scrubbed * We go back to 2. * * This can easily boost the amount of SYSTEM chunks if cleaner * thread can't be triggered fast enough, and use up all space * of btrfs_super_block::sys_chunk_array * * While for dev replace, we need to try our best to mark block * group RO, to prevent race between: * - Write duplication * Contains latest data * - Scrub copy * Contains data from commit tree * * If target block group is not marked RO, nocow writes can * be overwritten by scrub copy, causing data corruption. * So for dev-replace, it's not allowed to continue if a block * group is not RO. */ ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace); if (!ret && sctx->is_dev_replace) { ret = finish_extent_writes_for_zoned(root, cache); if (ret) { btrfs_dec_block_group_ro(cache); scrub_pause_off(fs_info); btrfs_put_block_group(cache); break; } } if (ret == 0) { ro_set = 1; } else if (ret == -ENOSPC && !sctx->is_dev_replace && !(cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) { /* * btrfs_inc_block_group_ro return -ENOSPC when it * failed in creating new chunk for metadata. * It is not a problem for scrub, because * metadata are always cowed, and our scrub paused * commit_transactions. * * For RAID56 chunks, we have to mark them read-only * for scrub, as later we would use our own cache * out of RAID56 realm. * Thus we want the RAID56 bg to be marked RO to * prevent RMW from screwing up out cache. */ ro_set = 0; } else if (ret == -ETXTBSY) { btrfs_warn(fs_info, "skipping scrub of block group %llu due to active swapfile", cache->start); scrub_pause_off(fs_info); ret = 0; goto skip_unfreeze; } else { btrfs_warn(fs_info, "failed setting block group ro: %d", ret); btrfs_unfreeze_block_group(cache); btrfs_put_block_group(cache); scrub_pause_off(fs_info); break; } /* * Now the target block is marked RO, wait for nocow writes to * finish before dev-replace. * COW is fine, as COW never overwrites extents in commit tree. */ if (sctx->is_dev_replace) { btrfs_wait_nocow_writers(cache); btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length); } scrub_pause_off(fs_info); down_write(&dev_replace->rwsem); dev_replace->cursor_right = found_key.offset + dev_extent_len; dev_replace->cursor_left = found_key.offset; dev_replace->item_needs_writeback = 1; up_write(&dev_replace->rwsem); ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset, dev_extent_len); if (sctx->is_dev_replace && !btrfs_finish_block_group_to_copy(dev_replace->srcdev, cache, found_key.offset)) ro_set = 0; down_write(&dev_replace->rwsem); dev_replace->cursor_left = dev_replace->cursor_right; dev_replace->item_needs_writeback = 1; up_write(&dev_replace->rwsem); if (ro_set) btrfs_dec_block_group_ro(cache); /* * We might have prevented the cleaner kthread from deleting * this block group if it was already unused because we raced * and set it to RO mode first. So add it back to the unused * list, otherwise it might not ever be deleted unless a manual * balance is triggered or it becomes used and unused again. */ spin_lock(&cache->lock); if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags) && !cache->ro && cache->reserved == 0 && cache->used == 0) { spin_unlock(&cache->lock); if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) btrfs_discard_queue_work(&fs_info->discard_ctl, cache); else btrfs_mark_bg_unused(cache); } else { spin_unlock(&cache->lock); } skip_unfreeze: btrfs_unfreeze_block_group(cache); btrfs_put_block_group(cache); if (ret) break; if (sctx->is_dev_replace && atomic64_read(&dev_replace->num_write_errors) > 0) { ret = -EIO; break; } if (sctx->stat.malloc_errors > 0) { ret = -ENOMEM; break; } skip: key.offset = found_key.offset + dev_extent_len; btrfs_release_path(path); } btrfs_free_path(path); return ret; } static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev, struct page *page, u64 physical, u64 generation) { struct btrfs_fs_info *fs_info = sctx->fs_info; struct bio_vec bvec; struct bio bio; struct btrfs_super_block *sb = page_address(page); int ret; bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_READ); bio.bi_iter.bi_sector = physical >> SECTOR_SHIFT; __bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE, 0); ret = submit_bio_wait(&bio); bio_uninit(&bio); if (ret < 0) return ret; ret = btrfs_check_super_csum(fs_info, sb); if (ret != 0) { btrfs_err_rl(fs_info, "super block at physical %llu devid %llu has bad csum", physical, dev->devid); return -EIO; } if (btrfs_super_generation(sb) != generation) { btrfs_err_rl(fs_info, "super block at physical %llu devid %llu has bad generation %llu expect %llu", physical, dev->devid, btrfs_super_generation(sb), generation); return -EUCLEAN; } return btrfs_validate_super(fs_info, sb, -1); } static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, struct btrfs_device *scrub_dev) { int i; u64 bytenr; u64 gen; int ret = 0; struct page *page; struct btrfs_fs_info *fs_info = sctx->fs_info; if (BTRFS_FS_ERROR(fs_info)) return -EROFS; page = alloc_page(GFP_KERNEL); if (!page) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; spin_unlock(&sctx->stat_lock); return -ENOMEM; } /* Seed devices of a new filesystem has their own generation. */ if (scrub_dev->fs_devices != fs_info->fs_devices) gen = scrub_dev->generation; else gen = btrfs_get_last_trans_committed(fs_info); for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->commit_total_bytes) break; if (!btrfs_check_super_location(scrub_dev, bytenr)) continue; ret = scrub_one_super(sctx, scrub_dev, page, bytenr, gen); if (ret) { spin_lock(&sctx->stat_lock); sctx->stat.super_errors++; spin_unlock(&sctx->stat_lock); } } __free_page(page); return 0; } static void scrub_workers_put(struct btrfs_fs_info *fs_info) { if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt, &fs_info->scrub_lock)) { struct workqueue_struct *scrub_workers = fs_info->scrub_workers; fs_info->scrub_workers = NULL; mutex_unlock(&fs_info->scrub_lock); if (scrub_workers) destroy_workqueue(scrub_workers); } } /* * get a reference count on fs_info->scrub_workers. start worker if necessary */ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info) { struct workqueue_struct *scrub_workers = NULL; unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; int max_active = fs_info->thread_pool_size; int ret = -ENOMEM; if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt)) return 0; scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active); if (!scrub_workers) return -ENOMEM; mutex_lock(&fs_info->scrub_lock); if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) { ASSERT(fs_info->scrub_workers == NULL); fs_info->scrub_workers = scrub_workers; refcount_set(&fs_info->scrub_workers_refcnt, 1); mutex_unlock(&fs_info->scrub_lock); return 0; } /* Other thread raced in and created the workers for us */ refcount_inc(&fs_info->scrub_workers_refcnt); mutex_unlock(&fs_info->scrub_lock); ret = 0; destroy_workqueue(scrub_workers); return ret; } int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, int readonly, int is_dev_replace) { struct btrfs_dev_lookup_args args = { .devid = devid }; struct scrub_ctx *sctx; int ret; struct btrfs_device *dev; unsigned int nofs_flag; bool need_commit = false; if (btrfs_fs_closing(fs_info)) return -EAGAIN; /* At mount time we have ensured nodesize is in the range of [4K, 64K]. */ ASSERT(fs_info->nodesize <= BTRFS_STRIPE_LEN); /* * SCRUB_MAX_SECTORS_PER_BLOCK is calculated using the largest possible * value (max nodesize / min sectorsize), thus nodesize should always * be fine. */ ASSERT(fs_info->nodesize <= SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits); /* Allocate outside of device_list_mutex */ sctx = scrub_setup_ctx(fs_info, is_dev_replace); if (IS_ERR(sctx)) return PTR_ERR(sctx); ret = scrub_workers_get(fs_info); if (ret) goto out_free_ctx; mutex_lock(&fs_info->fs_devices->device_list_mutex); dev = btrfs_find_device(fs_info->fs_devices, &args); if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) && !is_dev_replace)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); ret = -ENODEV; goto out; } if (!is_dev_replace && !readonly && !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); btrfs_err_in_rcu(fs_info, "scrub on devid %llu: filesystem on %s is not writable", devid, btrfs_dev_name(dev)); ret = -EROFS; goto out; } mutex_lock(&fs_info->scrub_lock); if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) { mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); ret = -EIO; goto out; } down_read(&fs_info->dev_replace.rwsem); if (dev->scrub_ctx || (!is_dev_replace && btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) { up_read(&fs_info->dev_replace.rwsem); mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); ret = -EINPROGRESS; goto out; } up_read(&fs_info->dev_replace.rwsem); sctx->readonly = readonly; dev->scrub_ctx = sctx; mutex_unlock(&fs_info->fs_devices->device_list_mutex); /* * checking @scrub_pause_req here, we can avoid * race between committing transaction and scrubbing. */ __scrub_blocked_if_needed(fs_info); atomic_inc(&fs_info->scrubs_running); mutex_unlock(&fs_info->scrub_lock); /* * In order to avoid deadlock with reclaim when there is a transaction * trying to pause scrub, make sure we use GFP_NOFS for all the * allocations done at btrfs_scrub_sectors() and scrub_sectors_for_parity() * invoked by our callees. The pausing request is done when the * transaction commit starts, and it blocks the transaction until scrub * is paused (done at specific points at scrub_stripe() or right above * before incrementing fs_info->scrubs_running). */ nofs_flag = memalloc_nofs_save(); if (!is_dev_replace) { u64 old_super_errors; spin_lock(&sctx->stat_lock); old_super_errors = sctx->stat.super_errors; spin_unlock(&sctx->stat_lock); btrfs_info(fs_info, "scrub: started on devid %llu", devid); /* * by holding device list mutex, we can * kick off writing super in log tree sync. */ mutex_lock(&fs_info->fs_devices->device_list_mutex); ret = scrub_supers(sctx, dev); mutex_unlock(&fs_info->fs_devices->device_list_mutex); spin_lock(&sctx->stat_lock); /* * Super block errors found, but we can not commit transaction * at current context, since btrfs_commit_transaction() needs * to pause the current running scrub (hold by ourselves). */ if (sctx->stat.super_errors > old_super_errors && !sctx->readonly) need_commit = true; spin_unlock(&sctx->stat_lock); } if (!ret) ret = scrub_enumerate_chunks(sctx, dev, start, end); memalloc_nofs_restore(nofs_flag); atomic_dec(&fs_info->scrubs_running); wake_up(&fs_info->scrub_pause_wait); if (progress) memcpy(progress, &sctx->stat, sizeof(*progress)); if (!is_dev_replace) btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d", ret ? "not finished" : "finished", devid, ret); mutex_lock(&fs_info->scrub_lock); dev->scrub_ctx = NULL; mutex_unlock(&fs_info->scrub_lock); scrub_workers_put(fs_info); scrub_put_ctx(sctx); /* * We found some super block errors before, now try to force a * transaction commit, as scrub has finished. */ if (need_commit) { struct btrfs_trans_handle *trans; trans = btrfs_start_transaction(fs_info->tree_root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); btrfs_err(fs_info, "scrub: failed to start transaction to fix super block errors: %d", ret); return ret; } ret = btrfs_commit_transaction(trans); if (ret < 0) btrfs_err(fs_info, "scrub: failed to commit transaction to fix super block errors: %d", ret); } return ret; out: scrub_workers_put(fs_info); out_free_ctx: scrub_free_ctx(sctx); return ret; } void btrfs_scrub_pause(struct btrfs_fs_info *fs_info) { mutex_lock(&fs_info->scrub_lock); atomic_inc(&fs_info->scrub_pause_req); while (atomic_read(&fs_info->scrubs_paused) != atomic_read(&fs_info->scrubs_running)) { mutex_unlock(&fs_info->scrub_lock); wait_event(fs_info->scrub_pause_wait, atomic_read(&fs_info->scrubs_paused) == atomic_read(&fs_info->scrubs_running)); mutex_lock(&fs_info->scrub_lock); } mutex_unlock(&fs_info->scrub_lock); } void btrfs_scrub_continue(struct btrfs_fs_info *fs_info) { atomic_dec(&fs_info->scrub_pause_req); wake_up(&fs_info->scrub_pause_wait); } int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) { mutex_lock(&fs_info->scrub_lock); if (!atomic_read(&fs_info->scrubs_running)) { mutex_unlock(&fs_info->scrub_lock); return -ENOTCONN; } atomic_inc(&fs_info->scrub_cancel_req); while (atomic_read(&fs_info->scrubs_running)) { mutex_unlock(&fs_info->scrub_lock); wait_event(fs_info->scrub_pause_wait, atomic_read(&fs_info->scrubs_running) == 0); mutex_lock(&fs_info->scrub_lock); } atomic_dec(&fs_info->scrub_cancel_req); mutex_unlock(&fs_info->scrub_lock); return 0; } int btrfs_scrub_cancel_dev(struct btrfs_device *dev) { struct btrfs_fs_info *fs_info = dev->fs_info; struct scrub_ctx *sctx; mutex_lock(&fs_info->scrub_lock); sctx = dev->scrub_ctx; if (!sctx) { mutex_unlock(&fs_info->scrub_lock); return -ENOTCONN; } atomic_inc(&sctx->cancel_req); while (dev->scrub_ctx) { mutex_unlock(&fs_info->scrub_lock); wait_event(fs_info->scrub_pause_wait, dev->scrub_ctx == NULL); mutex_lock(&fs_info->scrub_lock); } mutex_unlock(&fs_info->scrub_lock); return 0; } int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, struct btrfs_scrub_progress *progress) { struct btrfs_dev_lookup_args args = { .devid = devid }; struct btrfs_device *dev; struct scrub_ctx *sctx = NULL; mutex_lock(&fs_info->fs_devices->device_list_mutex); dev = btrfs_find_device(fs_info->fs_devices, &args); if (dev) sctx = dev->scrub_ctx; if (sctx) memcpy(progress, &sctx->stat, sizeof(*progress)); mutex_unlock(&fs_info->fs_devices->device_list_mutex); return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV; }
34 35 35 5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 // SPDX-License-Identifier: GPL-2.0-only /* * x86 APERF/MPERF KHz calculation for * /sys/.../cpufreq/scaling_cur_freq * * Copyright (C) 2017 Intel Corp. * Author: Len Brown <len.brown@intel.com> */ #include <linux/cpufreq.h> #include <linux/delay.h> #include <linux/ktime.h> #include <linux/math64.h> #include <linux/percpu.h> #include <linux/rcupdate.h> #include <linux/sched/isolation.h> #include <linux/sched/topology.h> #include <linux/smp.h> #include <linux/syscore_ops.h> #include <asm/cpu.h> #include <asm/cpu_device_id.h> #include <asm/intel-family.h> #include "cpu.h" struct aperfmperf { seqcount_t seq; unsigned long last_update; u64 acnt; u64 mcnt; u64 aperf; u64 mperf; }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = { .seq = SEQCNT_ZERO(cpu_samples.seq) }; static void init_counter_refs(void) { u64 aperf, mperf; rdmsrl(MSR_IA32_APERF, aperf); rdmsrl(MSR_IA32_MPERF, mperf); this_cpu_write(cpu_samples.aperf, aperf); this_cpu_write(cpu_samples.mperf, mperf); } #if defined(CONFIG_X86_64) && defined(CONFIG_SMP) /* * APERF/MPERF frequency ratio computation. * * The scheduler wants to do frequency invariant accounting and needs a <1 * ratio to account for the 'current' frequency, corresponding to * freq_curr / freq_max. * * Since the frequency freq_curr on x86 is controlled by micro-controller and * our P-state setting is little more than a request/hint, we need to observe * the effective frequency 'BusyMHz', i.e. the average frequency over a time * interval after discarding idle time. This is given by: * * BusyMHz = delta_APERF / delta_MPERF * freq_base * * where freq_base is the max non-turbo P-state. * * The freq_max term has to be set to a somewhat arbitrary value, because we * can't know which turbo states will be available at a given point in time: * it all depends on the thermal headroom of the entire package. We set it to * the turbo level with 4 cores active. * * Benchmarks show that's a good compromise between the 1C turbo ratio * (freq_curr/freq_max would rarely reach 1) and something close to freq_base, * which would ignore the entire turbo range (a conspicuous part, making * freq_curr/freq_max always maxed out). * * An exception to the heuristic above is the Atom uarch, where we choose the * highest turbo level for freq_max since Atom's are generally oriented towards * power efficiency. * * Setting freq_max to anything less than the 1C turbo ratio makes the ratio * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1. */ DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key); static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE; static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE; void arch_set_max_freq_ratio(bool turbo_disabled) { arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE : arch_turbo_freq_ratio; } EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio); static bool __init turbo_disabled(void) { u64 misc_en; int err; err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en); if (err) return false; return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE); } static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) { int err; err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq); if (err) return false; err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq); if (err) return false; *base_freq = (*base_freq >> 16) & 0x3F; /* max P state */ *turbo_freq = *turbo_freq & 0x3F; /* 1C turbo */ return true; } #define X86_MATCH(model) \ X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \ INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL) static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = { X86_MATCH(XEON_PHI_KNL), X86_MATCH(XEON_PHI_KNM), {} }; static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = { X86_MATCH(SKYLAKE_X), {} }; static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = { X86_MATCH(ATOM_GOLDMONT), X86_MATCH(ATOM_GOLDMONT_D), X86_MATCH(ATOM_GOLDMONT_PLUS), {} }; static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int num_delta_fratio) { int fratio, delta_fratio, found; int err, i; u64 msr; err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); if (err) return false; *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); if (err) return false; fratio = (msr >> 8) & 0xFF; i = 16; found = 0; do { if (found >= num_delta_fratio) { *turbo_freq = fratio; return true; } delta_fratio = (msr >> (i + 5)) & 0x7; if (delta_fratio) { found += 1; fratio -= delta_fratio; } i += 8; } while (i < 64); return true; } static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size) { u64 ratios, counts; u32 group_size; int err, i; err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); if (err) return false; *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios); if (err) return false; err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts); if (err) return false; for (i = 0; i < 64; i += 8) { group_size = (counts >> i) & 0xFF; if (group_size >= size) { *turbo_freq = (ratios >> i) & 0xFF; return true; } } return false; } static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) { u64 msr; int err; err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); if (err) return false; err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); if (err) return false; *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ *turbo_freq = (msr >> 24) & 0xFF; /* 4C turbo */ /* The CPU may have less than 4 cores */ if (!*turbo_freq) *turbo_freq = msr & 0xFF; /* 1C turbo */ return true; } static bool __init intel_set_max_freq_ratio(void) { u64 base_freq, turbo_freq; u64 turbo_ratio; if (slv_set_max_freq_ratio(&base_freq, &turbo_freq)) goto out; if (x86_match_cpu(has_glm_turbo_ratio_limits) && skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) goto out; if (x86_match_cpu(has_knl_turbo_ratio_limits) && knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) goto out; if (x86_match_cpu(has_skx_turbo_ratio_limits) && skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4)) goto out; if (core_set_max_freq_ratio(&base_freq, &turbo_freq)) goto out; return false; out: /* * Some hypervisors advertise X86_FEATURE_APERFMPERF * but then fill all MSR's with zeroes. * Some CPUs have turbo boost but don't declare any turbo ratio * in MSR_TURBO_RATIO_LIMIT. */ if (!base_freq || !turbo_freq) { pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n"); return false; } turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq); if (!turbo_ratio) { pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n"); return false; } arch_turbo_freq_ratio = turbo_ratio; arch_set_max_freq_ratio(turbo_disabled()); return true; } #ifdef CONFIG_PM_SLEEP static struct syscore_ops freq_invariance_syscore_ops = { .resume = init_counter_refs, }; static void register_freq_invariance_syscore_ops(void) { register_syscore_ops(&freq_invariance_syscore_ops); } #else static inline void register_freq_invariance_syscore_ops(void) {} #endif static void freq_invariance_enable(void) { if (static_branch_unlikely(&arch_scale_freq_key)) { WARN_ON_ONCE(1); return; } static_branch_enable(&arch_scale_freq_key); register_freq_invariance_syscore_ops(); pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio); } void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) { arch_turbo_freq_ratio = ratio; arch_set_max_freq_ratio(turbo_disabled); freq_invariance_enable(); } static void __init bp_init_freq_invariance(void) { if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return; if (intel_set_max_freq_ratio()) freq_invariance_enable(); } static void disable_freq_invariance_workfn(struct work_struct *work) { int cpu; static_branch_disable(&arch_scale_freq_key); /* * Set arch_freq_scale to a default value on all cpus * This negates the effect of scaling */ for_each_possible_cpu(cpu) per_cpu(arch_freq_scale, cpu) = SCHED_CAPACITY_SCALE; } static DECLARE_WORK(disable_freq_invariance_work, disable_freq_invariance_workfn); DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; static void scale_freq_tick(u64 acnt, u64 mcnt) { u64 freq_scale; if (!arch_scale_freq_invariant()) return; if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt)) goto error; if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt) goto error; freq_scale = div64_u64(acnt, mcnt); if (!freq_scale) goto error; if (freq_scale > SCHED_CAPACITY_SCALE) freq_scale = SCHED_CAPACITY_SCALE; this_cpu_write(arch_freq_scale, freq_scale); return; error: pr_warn("Scheduler frequency invariance went wobbly, disabling!\n"); schedule_work(&disable_freq_invariance_work); } #else static inline void bp_init_freq_invariance(void) { } static inline void scale_freq_tick(u64 acnt, u64 mcnt) { } #endif /* CONFIG_X86_64 && CONFIG_SMP */ void arch_scale_freq_tick(void) { struct aperfmperf *s = this_cpu_ptr(&cpu_samples); u64 acnt, mcnt, aperf, mperf; if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) return; rdmsrl(MSR_IA32_APERF, aperf); rdmsrl(MSR_IA32_MPERF, mperf); acnt = aperf - s->aperf; mcnt = mperf - s->mperf; s->aperf = aperf; s->mperf = mperf; raw_write_seqcount_begin(&s->seq); s->last_update = jiffies; s->acnt = acnt; s->mcnt = mcnt; raw_write_seqcount_end(&s->seq); scale_freq_tick(acnt, mcnt); } /* * Discard samples older than the define maximum sample age of 20ms. There * is no point in sending IPIs in such a case. If the scheduler tick was * not running then the CPU is either idle or isolated. */ #define MAX_SAMPLE_AGE ((unsigned long)HZ / 50) unsigned int arch_freq_get_on_cpu(int cpu) { struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu); unsigned int seq, freq; unsigned long last; u64 acnt, mcnt; if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) goto fallback; do { seq = raw_read_seqcount_begin(&s->seq); last = s->last_update; acnt = s->acnt; mcnt = s->mcnt; } while (read_seqcount_retry(&s->seq, seq)); /* * Bail on invalid count and when the last update was too long ago, * which covers idle and NOHZ full CPUs. */ if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE) goto fallback; return div64_u64((cpu_khz * acnt), mcnt); fallback: freq = cpufreq_quick_get(cpu); return freq ? freq : cpu_khz; } static int __init bp_init_aperfmperf(void) { if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) return 0; init_counter_refs(); bp_init_freq_invariance(); return 0; } early_initcall(bp_init_aperfmperf); void ap_init_aperfmperf(void) { if (cpu_feature_enabled(X86_FEATURE_APERFMPERF)) init_counter_refs(); }
30 30 30 30 30 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2019 Christoph Hellwig. */ #include "xfs.h" static inline unsigned int bio_max_vecs(unsigned int count) { return bio_max_segs(howmany(count, PAGE_SIZE)); } int xfs_rw_bdev( struct block_device *bdev, sector_t sector, unsigned int count, char *data, enum req_op op) { unsigned int is_vmalloc = is_vmalloc_addr(data); unsigned int left = count; int error; struct bio *bio; if (is_vmalloc && op == REQ_OP_WRITE) flush_kernel_vmap_range(data, count); bio = bio_alloc(bdev, bio_max_vecs(left), op | REQ_META | REQ_SYNC, GFP_KERNEL); bio->bi_iter.bi_sector = sector; do { struct page *page = kmem_to_page(data); unsigned int off = offset_in_page(data); unsigned int len = min_t(unsigned, left, PAGE_SIZE - off); while (bio_add_page(bio, page, len, off) != len) { struct bio *prev = bio; bio = bio_alloc(prev->bi_bdev, bio_max_vecs(left), prev->bi_opf, GFP_KERNEL); bio->bi_iter.bi_sector = bio_end_sector(prev); bio_chain(prev, bio); submit_bio(prev); } data += len; left -= len; } while (left > 0); error = submit_bio_wait(bio); bio_put(bio); if (is_vmalloc && op == REQ_OP_READ) invalidate_kernel_vmap_range(data, count); return error; }
2 11 1 1 8 1 5 1 4 3 1 4 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/file.h> #include <linux/io_uring/cmd.h> #include <linux/security.h> #include <linux/nospec.h> #include <net/sock.h> #include <uapi/linux/io_uring.h> #include <asm/ioctls.h> #include "io_uring.h" #include "rsrc.h" #include "uring_cmd.h" static void io_uring_cmd_del_cancelable(struct io_uring_cmd *cmd, unsigned int issue_flags) { struct io_kiocb *req = cmd_to_io_kiocb(cmd); struct io_ring_ctx *ctx = req->ctx; if (!(cmd->flags & IORING_URING_CMD_CANCELABLE)) return; cmd->flags &= ~IORING_URING_CMD_CANCELABLE; io_ring_submit_lock(ctx, issue_flags); hlist_del(&req->hash_node); io_ring_submit_unlock(ctx, issue_flags); } /* * Mark this command as concelable, then io_uring_try_cancel_uring_cmd() * will try to cancel this issued command by sending ->uring_cmd() with * issue_flags of IO_URING_F_CANCEL. * * The command is guaranteed to not be done when calling ->uring_cmd() * with IO_URING_F_CANCEL, but it is driver's responsibility to deal * with race between io_uring canceling and normal completion. */ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, unsigned int issue_flags) { struct io_kiocb *req = cmd_to_io_kiocb(cmd); struct io_ring_ctx *ctx = req->ctx; if (!(cmd->flags & IORING_URING_CMD_CANCELABLE)) { cmd->flags |= IORING_URING_CMD_CANCELABLE; io_ring_submit_lock(ctx, issue_flags); hlist_add_head(&req->hash_node, &ctx->cancelable_uring_cmd); io_ring_submit_unlock(ctx, issue_flags); } } EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable); static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED; ioucmd->task_work_cb(ioucmd, issue_flags); } void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, void (*task_work_cb)(struct io_uring_cmd *, unsigned), unsigned flags) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); ioucmd->task_work_cb = task_work_cb; req->io_task_work.func = io_uring_cmd_work; __io_req_task_work_add(req, flags); } EXPORT_SYMBOL_GPL(__io_uring_cmd_do_in_task); static inline void io_req_set_cqe32_extra(struct io_kiocb *req, u64 extra1, u64 extra2) { req->big_cqe.extra1 = extra1; req->big_cqe.extra2 = extra2; } /* * Called by consumers of io_uring_cmd, if they originally returned * -EIOCBQUEUED upon receiving the command. */ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2, unsigned issue_flags) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); io_uring_cmd_del_cancelable(ioucmd, issue_flags); if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); if (req->ctx->flags & IORING_SETUP_CQE32) io_req_set_cqe32_extra(req, res2, 0); if (req->ctx->flags & IORING_SETUP_IOPOLL) { /* order with io_iopoll_req_issued() checking ->iopoll_complete */ smp_store_release(&req->iopoll_completed, 1); } else { struct io_tw_state ts = { .locked = !(issue_flags & IO_URING_F_UNLOCKED), }; io_req_task_complete(req, &ts); } } EXPORT_SYMBOL_GPL(io_uring_cmd_done); int io_uring_cmd_prep_async(struct io_kiocb *req) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); memcpy(req->async_data, ioucmd->sqe, uring_sqe_size(req->ctx)); ioucmd->sqe = req->async_data; return 0; } int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); if (sqe->__pad1) return -EINVAL; ioucmd->flags = READ_ONCE(sqe->uring_cmd_flags); if (ioucmd->flags & ~IORING_URING_CMD_MASK) return -EINVAL; if (ioucmd->flags & IORING_URING_CMD_FIXED) { struct io_ring_ctx *ctx = req->ctx; u16 index; req->buf_index = READ_ONCE(sqe->buf_index); if (unlikely(req->buf_index >= ctx->nr_user_bufs)) return -EFAULT; index = array_index_nospec(req->buf_index, ctx->nr_user_bufs); req->imu = ctx->user_bufs[index]; io_req_set_rsrc_node(req, ctx, 0); } ioucmd->sqe = sqe; ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); return 0; } int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); struct io_ring_ctx *ctx = req->ctx; struct file *file = req->file; int ret; if (!file->f_op->uring_cmd) return -EOPNOTSUPP; ret = security_uring_cmd(ioucmd); if (ret) return ret; if (ctx->flags & IORING_SETUP_SQE128) issue_flags |= IO_URING_F_SQE128; if (ctx->flags & IORING_SETUP_CQE32) issue_flags |= IO_URING_F_CQE32; if (ctx->compat) issue_flags |= IO_URING_F_COMPAT; if (ctx->flags & IORING_SETUP_IOPOLL) { if (!file->f_op->uring_cmd_iopoll) return -EOPNOTSUPP; issue_flags |= IO_URING_F_IOPOLL; req->iopoll_completed = 0; } ret = file->f_op->uring_cmd(ioucmd, issue_flags); if (ret == -EAGAIN) { if (!req_has_async_data(req)) { if (io_alloc_async_data(req)) return -ENOMEM; io_uring_cmd_prep_async(req); } return -EAGAIN; } if (ret != -EIOCBQUEUED) { if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); return ret; } return IOU_ISSUE_SKIP_COMPLETE; } int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, struct iov_iter *iter, void *ioucmd) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); return io_import_fixed(rw, iter, req->imu, ubuf, len); } EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed); static inline int io_uring_cmd_getsockopt(struct socket *sock, struct io_uring_cmd *cmd, unsigned int issue_flags) { bool compat = !!(issue_flags & IO_URING_F_COMPAT); int optlen, optname, level, err; void __user *optval; level = READ_ONCE(cmd->sqe->level); if (level != SOL_SOCKET) return -EOPNOTSUPP; optval = u64_to_user_ptr(READ_ONCE(cmd->sqe->optval)); optname = READ_ONCE(cmd->sqe->optname); optlen = READ_ONCE(cmd->sqe->optlen); err = do_sock_getsockopt(sock, compat, level, optname, USER_SOCKPTR(optval), KERNEL_SOCKPTR(&optlen)); if (err) return err; /* On success, return optlen */ return optlen; } static inline int io_uring_cmd_setsockopt(struct socket *sock, struct io_uring_cmd *cmd, unsigned int issue_flags) { bool compat = !!(issue_flags & IO_URING_F_COMPAT); int optname, optlen, level; void __user *optval; sockptr_t optval_s; optval = u64_to_user_ptr(READ_ONCE(cmd->sqe->optval)); optname = READ_ONCE(cmd->sqe->optname); optlen = READ_ONCE(cmd->sqe->optlen); level = READ_ONCE(cmd->sqe->level); optval_s = USER_SOCKPTR(optval); return do_sock_setsockopt(sock, compat, level, optname, optval_s, optlen); } #if defined(CONFIG_NET) int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) { struct socket *sock = cmd->file->private_data; struct sock *sk = sock->sk; struct proto *prot = READ_ONCE(sk->sk_prot); int ret, arg = 0; if (!prot || !prot->ioctl) return -EOPNOTSUPP; switch (cmd->sqe->cmd_op) { case SOCKET_URING_OP_SIOCINQ: ret = prot->ioctl(sk, SIOCINQ, &arg); if (ret) return ret; return arg; case SOCKET_URING_OP_SIOCOUTQ: ret = prot->ioctl(sk, SIOCOUTQ, &arg); if (ret) return ret; return arg; case SOCKET_URING_OP_GETSOCKOPT: return io_uring_cmd_getsockopt(sock, cmd, issue_flags); case SOCKET_URING_OP_SETSOCKOPT: return io_uring_cmd_setsockopt(sock, cmd, issue_flags); default: return -EOPNOTSUPP; } } EXPORT_SYMBOL_GPL(io_uring_cmd_sock); #endif
47 28 4 14 1 1 28 1 1 13 13 2 42 1 21 30 2 3 16 35 21 52 27 12 1 14 14 2 2 11 13 13 13 16 16 27 14 1 12 25 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 // SPDX-License-Identifier: GPL-2.0 #include <linux/file.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/utime.h> #include <linux/syscalls.h> #include <linux/uaccess.h> #include <linux/compat.h> #include <asm/unistd.h> #include <linux/filelock.h> static bool nsec_valid(long nsec) { if (nsec == UTIME_OMIT || nsec == UTIME_NOW) return true; return nsec >= 0 && nsec <= 999999999; } int vfs_utimes(const struct path *path, struct timespec64 *times) { int error; struct iattr newattrs; struct inode *inode = path->dentry->d_inode; struct inode *delegated_inode = NULL; if (times) { if (!nsec_valid(times[0].tv_nsec) || !nsec_valid(times[1].tv_nsec)) return -EINVAL; if (times[0].tv_nsec == UTIME_NOW && times[1].tv_nsec == UTIME_NOW) times = NULL; } error = mnt_want_write(path->mnt); if (error) goto out; newattrs.ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_ATIME; if (times) { if (times[0].tv_nsec == UTIME_OMIT) newattrs.ia_valid &= ~ATTR_ATIME; else if (times[0].tv_nsec != UTIME_NOW) { newattrs.ia_atime = times[0]; newattrs.ia_valid |= ATTR_ATIME_SET; } if (times[1].tv_nsec == UTIME_OMIT) newattrs.ia_valid &= ~ATTR_MTIME; else if (times[1].tv_nsec != UTIME_NOW) { newattrs.ia_mtime = times[1]; newattrs.ia_valid |= ATTR_MTIME_SET; } /* * Tell setattr_prepare(), that this is an explicit time * update, even if neither ATTR_ATIME_SET nor ATTR_MTIME_SET * were used. */ newattrs.ia_valid |= ATTR_TIMES_SET; } else { newattrs.ia_valid |= ATTR_TOUCH; } retry_deleg: inode_lock(inode); error = notify_change(mnt_idmap(path->mnt), path->dentry, &newattrs, &delegated_inode); inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } mnt_drop_write(path->mnt); out: return error; } static int do_utimes_path(int dfd, const char __user *filename, struct timespec64 *times, int flags) { struct path path; int lookup_flags = 0, error; if (flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) return -EINVAL; if (!(flags & AT_SYMLINK_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; if (flags & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; retry: error = user_path_at(dfd, filename, lookup_flags, &path); if (error) return error; error = vfs_utimes(&path, times); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } static int do_utimes_fd(int fd, struct timespec64 *times, int flags) { struct fd f; int error; if (flags) return -EINVAL; f = fdget(fd); if (!f.file) return -EBADF; error = vfs_utimes(&f.file->f_path, times); fdput(f); return error; } /* * do_utimes - change times on filename or file descriptor * @dfd: open file descriptor, -1 or AT_FDCWD * @filename: path name or NULL * @times: new times or NULL * @flags: zero or more flags (only AT_SYMLINK_NOFOLLOW for the moment) * * If filename is NULL and dfd refers to an open file, then operate on * the file. Otherwise look up filename, possibly using dfd as a * starting point. * * If times==NULL, set access and modification to current time, * must be owner or have write permission. * Else, update from *times, must be owner or super user. */ long do_utimes(int dfd, const char __user *filename, struct timespec64 *times, int flags) { if (filename == NULL && dfd != AT_FDCWD) return do_utimes_fd(dfd, times, flags); return do_utimes_path(dfd, filename, times, flags); } SYSCALL_DEFINE4(utimensat, int, dfd, const char __user *, filename, struct __kernel_timespec __user *, utimes, int, flags) { struct timespec64 tstimes[2]; if (utimes) { if ((get_timespec64(&tstimes[0], &utimes[0]) || get_timespec64(&tstimes[1], &utimes[1]))) return -EFAULT; /* Nothing to do, we must not even check the path. */ if (tstimes[0].tv_nsec == UTIME_OMIT && tstimes[1].tv_nsec == UTIME_OMIT) return 0; } return do_utimes(dfd, filename, utimes ? tstimes : NULL, flags); } #ifdef __ARCH_WANT_SYS_UTIME /* * futimesat(), utimes() and utime() are older versions of utimensat() * that are provided for compatibility with traditional C libraries. * On modern architectures, we always use libc wrappers around * utimensat() instead. */ static long do_futimesat(int dfd, const char __user *filename, struct __kernel_old_timeval __user *utimes) { struct __kernel_old_timeval times[2]; struct timespec64 tstimes[2]; if (utimes) { if (copy_from_user(&times, utimes, sizeof(times))) return -EFAULT; /* This test is needed to catch all invalid values. If we would test only in do_utimes we would miss those invalid values truncated by the multiplication with 1000. Note that we also catch UTIME_{NOW,OMIT} here which are only valid for utimensat. */ if (times[0].tv_usec >= 1000000 || times[0].tv_usec < 0 || times[1].tv_usec >= 1000000 || times[1].tv_usec < 0) return -EINVAL; tstimes[0].tv_sec = times[0].tv_sec; tstimes[0].tv_nsec = 1000 * times[0].tv_usec; tstimes[1].tv_sec = times[1].tv_sec; tstimes[1].tv_nsec = 1000 * times[1].tv_usec; } return do_utimes(dfd, filename, utimes ? tstimes : NULL, 0); } SYSCALL_DEFINE3(futimesat, int, dfd, const char __user *, filename, struct __kernel_old_timeval __user *, utimes) { return do_futimesat(dfd, filename, utimes); } SYSCALL_DEFINE2(utimes, char __user *, filename, struct __kernel_old_timeval __user *, utimes) { return do_futimesat(AT_FDCWD, filename, utimes); } SYSCALL_DEFINE2(utime, char __user *, filename, struct utimbuf __user *, times) { struct timespec64 tv[2]; if (times) { if (get_user(tv[0].tv_sec, &times->actime) || get_user(tv[1].tv_sec, &times->modtime)) return -EFAULT; tv[0].tv_nsec = 0; tv[1].tv_nsec = 0; } return do_utimes(AT_FDCWD, filename, times ? tv : NULL, 0); } #endif #ifdef CONFIG_COMPAT_32BIT_TIME /* * Not all architectures have sys_utime, so implement this in terms * of sys_utimes. */ #ifdef __ARCH_WANT_SYS_UTIME32 SYSCALL_DEFINE2(utime32, const char __user *, filename, struct old_utimbuf32 __user *, t) { struct timespec64 tv[2]; if (t) { if (get_user(tv[0].tv_sec, &t->actime) || get_user(tv[1].tv_sec, &t->modtime)) return -EFAULT; tv[0].tv_nsec = 0; tv[1].tv_nsec = 0; } return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0); } #endif SYSCALL_DEFINE4(utimensat_time32, unsigned int, dfd, const char __user *, filename, struct old_timespec32 __user *, t, int, flags) { struct timespec64 tv[2]; if (t) { if (get_old_timespec32(&tv[0], &t[0]) || get_old_timespec32(&tv[1], &t[1])) return -EFAULT; if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT) return 0; } return do_utimes(dfd, filename, t ? tv : NULL, flags); } #ifdef __ARCH_WANT_SYS_UTIME32 static long do_compat_futimesat(unsigned int dfd, const char __user *filename, struct old_timeval32 __user *t) { struct timespec64 tv[2]; if (t) { if (get_user(tv[0].tv_sec, &t[0].tv_sec) || get_user(tv[0].tv_nsec, &t[0].tv_usec) || get_user(tv[1].tv_sec, &t[1].tv_sec) || get_user(tv[1].tv_nsec, &t[1].tv_usec)) return -EFAULT; if (tv[0].tv_nsec >= 1000000 || tv[0].tv_nsec < 0 || tv[1].tv_nsec >= 1000000 || tv[1].tv_nsec < 0) return -EINVAL; tv[0].tv_nsec *= 1000; tv[1].tv_nsec *= 1000; } return do_utimes(dfd, filename, t ? tv : NULL, 0); } SYSCALL_DEFINE3(futimesat_time32, unsigned int, dfd, const char __user *, filename, struct old_timeval32 __user *, t) { return do_compat_futimesat(dfd, filename, t); } SYSCALL_DEFINE2(utimes_time32, const char __user *, filename, struct old_timeval32 __user *, t) { return do_compat_futimesat(AT_FDCWD, filename, t); } #endif #endif
45 130 129 84 7 6 48 47 75 60 95 95 95 13 83 95 95 69 13 81 37 8 72 72 51 48 3 51 8 48 1 2 51 5 14 45 129 5 130 52 4 52 4 51 51 52 52 52 21 52 6 21 4 48 34 34 34 34 248 7 248 2 6 4 72 72 3304 3308 3278 111 3306 77 72 72 2 6 5 77 77 77 77 77 77 126 92 14 14 31 30 25 7 7 2 3 7 32 32 12 12 12 3192 3194 8 8 8 1 8 8 81 25 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com> */ /* * fsnotify inode mark locking/lifetime/and refcnting * * REFCNT: * The group->recnt and mark->refcnt tell how many "things" in the kernel * currently are referencing the objects. Both kind of objects typically will * live inside the kernel with a refcnt of 2, one for its creation and one for * the reference a group and a mark hold to each other. * If you are holding the appropriate locks, you can take a reference and the * object itself is guaranteed to survive until the reference is dropped. * * LOCKING: * There are 3 locks involved with fsnotify inode marks and they MUST be taken * in order as follows: * * group->mark_mutex * mark->lock * mark->connector->lock * * group->mark_mutex protects the marks_list anchored inside a given group and * each mark is hooked via the g_list. It also protects the groups private * data (i.e group limits). * mark->lock protects the marks attributes like its masks and flags. * Furthermore it protects the access to a reference of the group that the mark * is assigned to as well as the access to a reference of the inode/vfsmount * that is being watched by the mark. * * mark->connector->lock protects the list of marks anchored inside an * inode / vfsmount and each mark is hooked via the i_list. * * A list of notification marks relating to inode / mnt is contained in * fsnotify_mark_connector. That structure is alive as long as there are any * marks in the list and is also protected by fsnotify_mark_srcu. A mark gets * detached from fsnotify_mark_connector when last reference to the mark is * dropped. Thus having mark reference is enough to protect mark->connector * pointer and to make sure fsnotify_mark_connector cannot disappear. Also * because we remove mark from g_list before dropping mark reference associated * with that, any mark found through g_list is guaranteed to have * mark->connector set until we drop group->mark_mutex. * * LIFETIME: * Inode marks survive between when they are added to an inode and when their * refcnt==0. Marks are also protected by fsnotify_mark_srcu. * * The inode mark can be cleared for a number of different reasons including: * - The inode is unlinked for the last time. (fsnotify_inode_remove) * - The inode is being evicted from cache. (fsnotify_inode_delete) * - The fs the inode is on is unmounted. (fsnotify_inode_delete/fsnotify_unmount_inodes) * - Something explicitly requests that it be removed. (fsnotify_destroy_mark) * - The fsnotify_group associated with the mark is going away and all such marks * need to be cleaned up. (fsnotify_clear_marks_by_group) * * This has the very interesting property of being able to run concurrently with * any (or all) other directions. */ #include <linux/fs.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/kthread.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/srcu.h> #include <linux/ratelimit.h> #include <linux/atomic.h> #include <linux/fsnotify_backend.h> #include "fsnotify.h" #define FSNOTIFY_REAPER_DELAY (1) /* 1 jiffy */ struct srcu_struct fsnotify_mark_srcu; struct kmem_cache *fsnotify_mark_connector_cachep; static DEFINE_SPINLOCK(destroy_lock); static LIST_HEAD(destroy_list); static struct fsnotify_mark_connector *connector_destroy_list; static void fsnotify_mark_destroy_workfn(struct work_struct *work); static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn); static void fsnotify_connector_destroy_workfn(struct work_struct *work); static DECLARE_WORK(connector_reaper_work, fsnotify_connector_destroy_workfn); void fsnotify_get_mark(struct fsnotify_mark *mark) { WARN_ON_ONCE(!refcount_read(&mark->refcnt)); refcount_inc(&mark->refcnt); } static __u32 *fsnotify_conn_mask_p(struct fsnotify_mark_connector *conn) { if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) return &fsnotify_conn_inode(conn)->i_fsnotify_mask; else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) return &fsnotify_conn_mount(conn)->mnt_fsnotify_mask; else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) return &fsnotify_conn_sb(conn)->s_fsnotify_mask; return NULL; } __u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn) { if (WARN_ON(!fsnotify_valid_obj_type(conn->type))) return 0; return *fsnotify_conn_mask_p(conn); } static void fsnotify_get_inode_ref(struct inode *inode) { ihold(inode); atomic_long_inc(&inode->i_sb->s_fsnotify_connectors); } /* * Grab or drop inode reference for the connector if needed. * * When it's time to drop the reference, we only clear the HAS_IREF flag and * return the inode object. fsnotify_drop_object() will be resonsible for doing * iput() outside of spinlocks. This happens when last mark that wanted iref is * detached. */ static struct inode *fsnotify_update_iref(struct fsnotify_mark_connector *conn, bool want_iref) { bool has_iref = conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF; struct inode *inode = NULL; if (conn->type != FSNOTIFY_OBJ_TYPE_INODE || want_iref == has_iref) return NULL; if (want_iref) { /* Pin inode if any mark wants inode refcount held */ fsnotify_get_inode_ref(fsnotify_conn_inode(conn)); conn->flags |= FSNOTIFY_CONN_FLAG_HAS_IREF; } else { /* Unpin inode after detach of last mark that wanted iref */ inode = fsnotify_conn_inode(conn); conn->flags &= ~FSNOTIFY_CONN_FLAG_HAS_IREF; } return inode; } static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) { u32 new_mask = 0; bool want_iref = false; struct fsnotify_mark *mark; assert_spin_locked(&conn->lock); /* We can get detached connector here when inode is getting unlinked. */ if (!fsnotify_valid_obj_type(conn->type)) return NULL; hlist_for_each_entry(mark, &conn->list, obj_list) { if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) continue; new_mask |= fsnotify_calc_mask(mark); if (conn->type == FSNOTIFY_OBJ_TYPE_INODE && !(mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)) want_iref = true; } *fsnotify_conn_mask_p(conn) = new_mask; return fsnotify_update_iref(conn, want_iref); } /* * Calculate mask of events for a list of marks. The caller must make sure * connector and connector->obj cannot disappear under us. Callers achieve * this by holding a mark->lock or mark->group->mark_mutex for a mark on this * list. */ void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) { if (!conn) return; spin_lock(&conn->lock); __fsnotify_recalc_mask(conn); spin_unlock(&conn->lock); if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) __fsnotify_update_child_dentry_flags( fsnotify_conn_inode(conn)); } /* Free all connectors queued for freeing once SRCU period ends */ static void fsnotify_connector_destroy_workfn(struct work_struct *work) { struct fsnotify_mark_connector *conn, *free; spin_lock(&destroy_lock); conn = connector_destroy_list; connector_destroy_list = NULL; spin_unlock(&destroy_lock); synchronize_srcu(&fsnotify_mark_srcu); while (conn) { free = conn; conn = conn->destroy_next; kmem_cache_free(fsnotify_mark_connector_cachep, free); } } static void fsnotify_put_inode_ref(struct inode *inode) { struct super_block *sb = inode->i_sb; iput(inode); if (atomic_long_dec_and_test(&sb->s_fsnotify_connectors)) wake_up_var(&sb->s_fsnotify_connectors); } static void fsnotify_get_sb_connectors(struct fsnotify_mark_connector *conn) { struct super_block *sb = fsnotify_connector_sb(conn); if (sb) atomic_long_inc(&sb->s_fsnotify_connectors); } static void fsnotify_put_sb_connectors(struct fsnotify_mark_connector *conn) { struct super_block *sb = fsnotify_connector_sb(conn); if (sb && atomic_long_dec_and_test(&sb->s_fsnotify_connectors)) wake_up_var(&sb->s_fsnotify_connectors); } static void *fsnotify_detach_connector_from_object( struct fsnotify_mark_connector *conn, unsigned int *type) { struct inode *inode = NULL; *type = conn->type; if (conn->type == FSNOTIFY_OBJ_TYPE_DETACHED) return NULL; if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) { inode = fsnotify_conn_inode(conn); inode->i_fsnotify_mask = 0; /* Unpin inode when detaching from connector */ if (!(conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF)) inode = NULL; } else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) { fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0; } else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) { fsnotify_conn_sb(conn)->s_fsnotify_mask = 0; } fsnotify_put_sb_connectors(conn); rcu_assign_pointer(*(conn->obj), NULL); conn->obj = NULL; conn->type = FSNOTIFY_OBJ_TYPE_DETACHED; return inode; } static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark) { struct fsnotify_group *group = mark->group; if (WARN_ON_ONCE(!group)) return; group->ops->free_mark(mark); fsnotify_put_group(group); } /* Drop object reference originally held by a connector */ static void fsnotify_drop_object(unsigned int type, void *objp) { if (!objp) return; /* Currently only inode references are passed to be dropped */ if (WARN_ON_ONCE(type != FSNOTIFY_OBJ_TYPE_INODE)) return; fsnotify_put_inode_ref(objp); } void fsnotify_put_mark(struct fsnotify_mark *mark) { struct fsnotify_mark_connector *conn = READ_ONCE(mark->connector); void *objp = NULL; unsigned int type = FSNOTIFY_OBJ_TYPE_DETACHED; bool free_conn = false; /* Catch marks that were actually never attached to object */ if (!conn) { if (refcount_dec_and_test(&mark->refcnt)) fsnotify_final_mark_destroy(mark); return; } /* * We have to be careful so that traversals of obj_list under lock can * safely grab mark reference. */ if (!refcount_dec_and_lock(&mark->refcnt, &conn->lock)) return; hlist_del_init_rcu(&mark->obj_list); if (hlist_empty(&conn->list)) { objp = fsnotify_detach_connector_from_object(conn, &type); free_conn = true; } else { objp = __fsnotify_recalc_mask(conn); type = conn->type; } WRITE_ONCE(mark->connector, NULL); spin_unlock(&conn->lock); fsnotify_drop_object(type, objp); if (free_conn) { spin_lock(&destroy_lock); conn->destroy_next = connector_destroy_list; connector_destroy_list = conn; spin_unlock(&destroy_lock); queue_work(system_unbound_wq, &connector_reaper_work); } /* * Note that we didn't update flags telling whether inode cares about * what's happening with children. We update these flags from * __fsnotify_parent() lazily when next event happens on one of our * children. */ spin_lock(&destroy_lock); list_add(&mark->g_list, &destroy_list); spin_unlock(&destroy_lock); queue_delayed_work(system_unbound_wq, &reaper_work, FSNOTIFY_REAPER_DELAY); } EXPORT_SYMBOL_GPL(fsnotify_put_mark); /* * Get mark reference when we found the mark via lockless traversal of object * list. Mark can be already removed from the list by now and on its way to be * destroyed once SRCU period ends. * * Also pin the group so it doesn't disappear under us. */ static bool fsnotify_get_mark_safe(struct fsnotify_mark *mark) { if (!mark) return true; if (refcount_inc_not_zero(&mark->refcnt)) { spin_lock(&mark->lock); if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) { /* mark is attached, group is still alive then */ atomic_inc(&mark->group->user_waits); spin_unlock(&mark->lock); return true; } spin_unlock(&mark->lock); fsnotify_put_mark(mark); } return false; } /* * Puts marks and wakes up group destruction if necessary. * * Pairs with fsnotify_get_mark_safe() */ static void fsnotify_put_mark_wake(struct fsnotify_mark *mark) { if (mark) { struct fsnotify_group *group = mark->group; fsnotify_put_mark(mark); /* * We abuse notification_waitq on group shutdown for waiting for * all marks pinned when waiting for userspace. */ if (atomic_dec_and_test(&group->user_waits) && group->shutdown) wake_up(&group->notification_waitq); } } bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info) __releases(&fsnotify_mark_srcu) { int type; fsnotify_foreach_iter_type(type) { /* This can fail if mark is being removed */ if (!fsnotify_get_mark_safe(iter_info->marks[type])) { __release(&fsnotify_mark_srcu); goto fail; } } /* * Now that both marks are pinned by refcount in the inode / vfsmount * lists, we can drop SRCU lock, and safely resume the list iteration * once userspace returns. */ srcu_read_unlock(&fsnotify_mark_srcu, iter_info->srcu_idx); return true; fail: for (type--; type >= 0; type--) fsnotify_put_mark_wake(iter_info->marks[type]); return false; } void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info) __acquires(&fsnotify_mark_srcu) { int type; iter_info->srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); fsnotify_foreach_iter_type(type) fsnotify_put_mark_wake(iter_info->marks[type]); } /* * Mark mark as detached, remove it from group list. Mark still stays in object * list until its last reference is dropped. Note that we rely on mark being * removed from group list before corresponding reference to it is dropped. In * particular we rely on mark->connector being valid while we hold * group->mark_mutex if we found the mark through g_list. * * Must be called with group->mark_mutex held. The caller must either hold * reference to the mark or be protected by fsnotify_mark_srcu. */ void fsnotify_detach_mark(struct fsnotify_mark *mark) { fsnotify_group_assert_locked(mark->group); WARN_ON_ONCE(!srcu_read_lock_held(&fsnotify_mark_srcu) && refcount_read(&mark->refcnt) < 1 + !!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)); spin_lock(&mark->lock); /* something else already called this function on this mark */ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { spin_unlock(&mark->lock); return; } mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED; list_del_init(&mark->g_list); spin_unlock(&mark->lock); /* Drop mark reference acquired in fsnotify_add_mark_locked() */ fsnotify_put_mark(mark); } /* * Free fsnotify mark. The mark is actually only marked as being freed. The * freeing is actually happening only once last reference to the mark is * dropped from a workqueue which first waits for srcu period end. * * Caller must have a reference to the mark or be protected by * fsnotify_mark_srcu. */ void fsnotify_free_mark(struct fsnotify_mark *mark) { struct fsnotify_group *group = mark->group; spin_lock(&mark->lock); /* something else already called this function on this mark */ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { spin_unlock(&mark->lock); return; } mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; spin_unlock(&mark->lock); /* * Some groups like to know that marks are being freed. This is a * callback to the group function to let it know that this mark * is being freed. */ if (group->ops->freeing_mark) group->ops->freeing_mark(mark, group); } void fsnotify_destroy_mark(struct fsnotify_mark *mark, struct fsnotify_group *group) { fsnotify_group_lock(group); fsnotify_detach_mark(mark); fsnotify_group_unlock(group); fsnotify_free_mark(mark); } EXPORT_SYMBOL_GPL(fsnotify_destroy_mark); /* * Sorting function for lists of fsnotify marks. * * Fanotify supports different notification classes (reflected as priority of * notification group). Events shall be passed to notification groups in * decreasing priority order. To achieve this marks in notification lists for * inodes and vfsmounts are sorted so that priorities of corresponding groups * are descending. * * Furthermore correct handling of the ignore mask requires processing inode * and vfsmount marks of each group together. Using the group address as * further sort criterion provides a unique sorting order and thus we can * merge inode and vfsmount lists of marks in linear time and find groups * present in both lists. * * A return value of 1 signifies that b has priority over a. * A return value of 0 signifies that the two marks have to be handled together. * A return value of -1 signifies that a has priority over b. */ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) { if (a == b) return 0; if (!a) return 1; if (!b) return -1; if (a->priority < b->priority) return 1; if (a->priority > b->priority) return -1; if (a < b) return 1; return -1; } static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, unsigned int obj_type) { struct fsnotify_mark_connector *conn; conn = kmem_cache_alloc(fsnotify_mark_connector_cachep, GFP_KERNEL); if (!conn) return -ENOMEM; spin_lock_init(&conn->lock); INIT_HLIST_HEAD(&conn->list); conn->flags = 0; conn->type = obj_type; conn->obj = connp; conn->flags = 0; fsnotify_get_sb_connectors(conn); /* * cmpxchg() provides the barrier so that readers of *connp can see * only initialized structure */ if (cmpxchg(connp, NULL, conn)) { /* Someone else created list structure for us */ fsnotify_put_sb_connectors(conn); kmem_cache_free(fsnotify_mark_connector_cachep, conn); } return 0; } /* * Get mark connector, make sure it is alive and return with its lock held. * This is for users that get connector pointer from inode or mount. Users that * hold reference to a mark on the list may directly lock connector->lock as * they are sure list cannot go away under them. */ static struct fsnotify_mark_connector *fsnotify_grab_connector( fsnotify_connp_t *connp) { struct fsnotify_mark_connector *conn; int idx; idx = srcu_read_lock(&fsnotify_mark_srcu); conn = srcu_dereference(*connp, &fsnotify_mark_srcu); if (!conn) goto out; spin_lock(&conn->lock); if (conn->type == FSNOTIFY_OBJ_TYPE_DETACHED) { spin_unlock(&conn->lock); srcu_read_unlock(&fsnotify_mark_srcu, idx); return NULL; } out: srcu_read_unlock(&fsnotify_mark_srcu, idx); return conn; } /* * Add mark into proper place in given list of marks. These marks may be used * for the fsnotify backend to determine which event types should be delivered * to which group and for which inodes. These marks are ordered according to * priority, highest number first, and then by the group's location in memory. */ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, fsnotify_connp_t *connp, unsigned int obj_type, int add_flags) { struct fsnotify_mark *lmark, *last = NULL; struct fsnotify_mark_connector *conn; int cmp; int err = 0; if (WARN_ON(!fsnotify_valid_obj_type(obj_type))) return -EINVAL; restart: spin_lock(&mark->lock); conn = fsnotify_grab_connector(connp); if (!conn) { spin_unlock(&mark->lock); err = fsnotify_attach_connector_to_object(connp, obj_type); if (err) return err; goto restart; } /* is mark the first mark? */ if (hlist_empty(&conn->list)) { hlist_add_head_rcu(&mark->obj_list, &conn->list); goto added; } /* should mark be in the middle of the current list? */ hlist_for_each_entry(lmark, &conn->list, obj_list) { last = lmark; if ((lmark->group == mark->group) && (lmark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) && !(mark->group->flags & FSNOTIFY_GROUP_DUPS)) { err = -EEXIST; goto out_err; } cmp = fsnotify_compare_groups(lmark->group, mark->group); if (cmp >= 0) { hlist_add_before_rcu(&mark->obj_list, &lmark->obj_list); goto added; } } BUG_ON(last == NULL); /* mark should be the last entry. last is the current last entry */ hlist_add_behind_rcu(&mark->obj_list, &last->obj_list); added: /* * Since connector is attached to object using cmpxchg() we are * guaranteed that connector initialization is fully visible by anyone * seeing mark->connector set. */ WRITE_ONCE(mark->connector, conn); out_err: spin_unlock(&conn->lock); spin_unlock(&mark->lock); return err; } /* * Attach an initialized mark to a given group and fs object. * These marks may be used for the fsnotify backend to determine which * event types should be delivered to which group. */ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, fsnotify_connp_t *connp, unsigned int obj_type, int add_flags) { struct fsnotify_group *group = mark->group; int ret = 0; fsnotify_group_assert_locked(group); /* * LOCKING ORDER!!!! * group->mark_mutex * mark->lock * mark->connector->lock */ spin_lock(&mark->lock); mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED; list_add(&mark->g_list, &group->marks_list); fsnotify_get_mark(mark); /* for g_list */ spin_unlock(&mark->lock); ret = fsnotify_add_mark_list(mark, connp, obj_type, add_flags); if (ret) goto err; fsnotify_recalc_mask(mark->connector); return ret; err: spin_lock(&mark->lock); mark->flags &= ~(FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED); list_del_init(&mark->g_list); spin_unlock(&mark->lock); fsnotify_put_mark(mark); return ret; } int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp, unsigned int obj_type, int add_flags) { int ret; struct fsnotify_group *group = mark->group; fsnotify_group_lock(group); ret = fsnotify_add_mark_locked(mark, connp, obj_type, add_flags); fsnotify_group_unlock(group); return ret; } EXPORT_SYMBOL_GPL(fsnotify_add_mark); /* * Given a list of marks, find the mark associated with given group. If found * take a reference to that mark and return it, else return NULL. */ struct fsnotify_mark *fsnotify_find_mark(fsnotify_connp_t *connp, struct fsnotify_group *group) { struct fsnotify_mark_connector *conn; struct fsnotify_mark *mark; conn = fsnotify_grab_connector(connp); if (!conn) return NULL; hlist_for_each_entry(mark, &conn->list, obj_list) { if (mark->group == group && (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { fsnotify_get_mark(mark); spin_unlock(&conn->lock); return mark; } } spin_unlock(&conn->lock); return NULL; } EXPORT_SYMBOL_GPL(fsnotify_find_mark); /* Clear any marks in a group with given type mask */ void fsnotify_clear_marks_by_group(struct fsnotify_group *group, unsigned int obj_type) { struct fsnotify_mark *lmark, *mark; LIST_HEAD(to_free); struct list_head *head = &to_free; /* Skip selection step if we want to clear all marks. */ if (obj_type == FSNOTIFY_OBJ_TYPE_ANY) { head = &group->marks_list; goto clear; } /* * We have to be really careful here. Anytime we drop mark_mutex, e.g. * fsnotify_clear_marks_by_inode() can come and free marks. Even in our * to_free list so we have to use mark_mutex even when accessing that * list. And freeing mark requires us to drop mark_mutex. So we can * reliably free only the first mark in the list. That's why we first * move marks to free to to_free list in one go and then free marks in * to_free list one by one. */ fsnotify_group_lock(group); list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) { if (mark->connector->type == obj_type) list_move(&mark->g_list, &to_free); } fsnotify_group_unlock(group); clear: while (1) { fsnotify_group_lock(group); if (list_empty(head)) { fsnotify_group_unlock(group); break; } mark = list_first_entry(head, struct fsnotify_mark, g_list); fsnotify_get_mark(mark); fsnotify_detach_mark(mark); fsnotify_group_unlock(group); fsnotify_free_mark(mark); fsnotify_put_mark(mark); } } /* Destroy all marks attached to an object via connector */ void fsnotify_destroy_marks(fsnotify_connp_t *connp) { struct fsnotify_mark_connector *conn; struct fsnotify_mark *mark, *old_mark = NULL; void *objp; unsigned int type; conn = fsnotify_grab_connector(connp); if (!conn) return; /* * We have to be careful since we can race with e.g. * fsnotify_clear_marks_by_group() and once we drop the conn->lock, the * list can get modified. However we are holding mark reference and * thus our mark cannot be removed from obj_list so we can continue * iteration after regaining conn->lock. */ hlist_for_each_entry(mark, &conn->list, obj_list) { fsnotify_get_mark(mark); spin_unlock(&conn->lock); if (old_mark) fsnotify_put_mark(old_mark); old_mark = mark; fsnotify_destroy_mark(mark, mark->group); spin_lock(&conn->lock); } /* * Detach list from object now so that we don't pin inode until all * mark references get dropped. It would lead to strange results such * as delaying inode deletion or blocking unmount. */ objp = fsnotify_detach_connector_from_object(conn, &type); spin_unlock(&conn->lock); if (old_mark) fsnotify_put_mark(old_mark); fsnotify_drop_object(type, objp); } /* * Nothing fancy, just initialize lists and locks and counters. */ void fsnotify_init_mark(struct fsnotify_mark *mark, struct fsnotify_group *group) { memset(mark, 0, sizeof(*mark)); spin_lock_init(&mark->lock); refcount_set(&mark->refcnt, 1); fsnotify_get_group(group); mark->group = group; WRITE_ONCE(mark->connector, NULL); } EXPORT_SYMBOL_GPL(fsnotify_init_mark); /* * Destroy all marks in destroy_list, waits for SRCU period to finish before * actually freeing marks. */ static void fsnotify_mark_destroy_workfn(struct work_struct *work) { struct fsnotify_mark *mark, *next; struct list_head private_destroy_list; spin_lock(&destroy_lock); /* exchange the list head */ list_replace_init(&destroy_list, &private_destroy_list); spin_unlock(&destroy_lock); synchronize_srcu(&fsnotify_mark_srcu); list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) { list_del_init(&mark->g_list); fsnotify_final_mark_destroy(mark); } } /* Wait for all marks queued for destruction to be actually destroyed */ void fsnotify_wait_marks_destroyed(void) { flush_delayed_work(&reaper_work); } EXPORT_SYMBOL_GPL(fsnotify_wait_marks_destroyed);
9 5 1 5 4 3 37 2 33 18 18 38 38 38 38 38 38 26 13 38 6 16 17 17 17 1 8 8 9 17 14 15 14 9 9 8 8 1 7 7 7 7 7 1 1 1 1 2 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPV6 GSO/GRO offload support * Linux INET6 implementation */ #include <linux/kernel.h> #include <linux/socket.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/printk.h> #include <net/protocol.h> #include <net/ipv6.h> #include <net/inet_common.h> #include <net/tcp.h> #include <net/udp.h> #include <net/gro.h> #include <net/gso.h> #include "ip6_offload.h" /* All GRO functions are always builtin, except UDP over ipv6, which lays in * ipv6 module, as it depends on UDPv6 lookup function, so we need special care * when ipv6 is built as a module */ #if IS_BUILTIN(CONFIG_IPV6) #define INDIRECT_CALL_L4(f, f2, f1, ...) INDIRECT_CALL_2(f, f2, f1, __VA_ARGS__) #else #define INDIRECT_CALL_L4(f, f2, f1, ...) INDIRECT_CALL_1(f, f2, __VA_ARGS__) #endif #define indirect_call_gro_receive_l4(f2, f1, cb, head, skb) \ ({ \ unlikely(gro_recursion_inc_test(skb)) ? \ NAPI_GRO_CB(skb)->flush |= 1, NULL : \ INDIRECT_CALL_L4(cb, f2, f1, head, skb); \ }) static int ipv6_gro_pull_exthdrs(struct sk_buff *skb, int off, int proto) { const struct net_offload *ops = NULL; struct ipv6_opt_hdr *opth; for (;;) { int len; ops = rcu_dereference(inet6_offloads[proto]); if (unlikely(!ops)) break; if (!(ops->flags & INET6_PROTO_GSO_EXTHDR)) break; opth = skb_gro_header(skb, off + sizeof(*opth), off); if (unlikely(!opth)) break; len = ipv6_optlen(opth); opth = skb_gro_header(skb, off + len, off); if (unlikely(!opth)) break; proto = opth->nexthdr; off += len; } skb_gro_pull(skb, off - skb_network_offset(skb)); return proto; } static int ipv6_gso_pull_exthdrs(struct sk_buff *skb, int proto) { const struct net_offload *ops = NULL; for (;;) { struct ipv6_opt_hdr *opth; int len; ops = rcu_dereference(inet6_offloads[proto]); if (unlikely(!ops)) break; if (!(ops->flags & INET6_PROTO_GSO_EXTHDR)) break; if (unlikely(!pskb_may_pull(skb, 8))) break; opth = (void *)skb->data; len = ipv6_optlen(opth); if (unlikely(!pskb_may_pull(skb, len))) break; opth = (void *)skb->data; proto = opth->nexthdr; __skb_pull(skb, len); } return proto; } static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); struct ipv6hdr *ipv6h; const struct net_offload *ops; int proto, err; struct frag_hdr *fptr; unsigned int payload_len; u8 *prevhdr; int offset = 0; bool encap, udpfrag; int nhoff; bool gso_partial; skb_reset_network_header(skb); err = ipv6_hopopt_jumbo_remove(skb); if (err) return ERR_PTR(err); nhoff = skb_network_header(skb) - skb_mac_header(skb); if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) goto out; encap = SKB_GSO_CB(skb)->encap_level > 0; if (encap) features &= skb->dev->hw_enc_features; SKB_GSO_CB(skb)->encap_level += sizeof(*ipv6h); ipv6h = ipv6_hdr(skb); __skb_pull(skb, sizeof(*ipv6h)); segs = ERR_PTR(-EPROTONOSUPPORT); proto = ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr); if (skb->encapsulation && skb_shinfo(skb)->gso_type & (SKB_GSO_IPXIP4 | SKB_GSO_IPXIP6)) udpfrag = proto == IPPROTO_UDP && encap && (skb_shinfo(skb)->gso_type & SKB_GSO_UDP); else udpfrag = proto == IPPROTO_UDP && !skb->encapsulation && (skb_shinfo(skb)->gso_type & SKB_GSO_UDP); ops = rcu_dereference(inet6_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) { skb_reset_transport_header(skb); segs = ops->callbacks.gso_segment(skb, features); if (!segs) skb->network_header = skb_mac_header(skb) + nhoff - skb->head; } if (IS_ERR_OR_NULL(segs)) goto out; gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL); for (skb = segs; skb; skb = skb->next) { ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff); if (gso_partial && skb_is_gso(skb)) payload_len = skb_shinfo(skb)->gso_size + SKB_GSO_CB(skb)->data_offset + skb->head - (unsigned char *)(ipv6h + 1); else payload_len = skb->len - nhoff - sizeof(*ipv6h); ipv6h->payload_len = htons(payload_len); skb->network_header = (u8 *)ipv6h - skb->head; skb_reset_mac_len(skb); if (udpfrag) { int err = ip6_find_1stfragopt(skb, &prevhdr); if (err < 0) { kfree_skb_list(segs); return ERR_PTR(err); } fptr = (struct frag_hdr *)((u8 *)ipv6h + err); fptr->frag_off = htons(offset); if (skb->next) fptr->frag_off |= htons(IP6_MF); offset += (ntohs(ipv6h->payload_len) - sizeof(struct frag_hdr)); } if (encap) skb_reset_inner_headers(skb); } out: return segs; } /* Return the total length of all the extension hdrs, following the same * logic in ipv6_gso_pull_exthdrs() when parsing ext-hdrs. */ static int ipv6_exthdrs_len(struct ipv6hdr *iph, const struct net_offload **opps) { struct ipv6_opt_hdr *opth = (void *)iph; int len = 0, proto, optlen = sizeof(*iph); proto = iph->nexthdr; for (;;) { *opps = rcu_dereference(inet6_offloads[proto]); if (unlikely(!(*opps))) break; if (!((*opps)->flags & INET6_PROTO_GSO_EXTHDR)) break; opth = (void *)opth + optlen; optlen = ipv6_optlen(opth); len += optlen; proto = opth->nexthdr; } return len; } INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head, struct sk_buff *skb) { const struct net_offload *ops; struct sk_buff *pp = NULL; struct sk_buff *p; struct ipv6hdr *iph; unsigned int nlen; unsigned int hlen; unsigned int off; u16 flush = 1; int proto; off = skb_gro_offset(skb); hlen = off + sizeof(*iph); iph = skb_gro_header(skb, hlen, off); if (unlikely(!iph)) goto out; skb_set_network_header(skb, off); flush += ntohs(iph->payload_len) != skb->len - hlen; proto = iph->nexthdr; ops = rcu_dereference(inet6_offloads[proto]); if (!ops || !ops->callbacks.gro_receive) { proto = ipv6_gro_pull_exthdrs(skb, hlen, proto); ops = rcu_dereference(inet6_offloads[proto]); if (!ops || !ops->callbacks.gro_receive) goto out; iph = skb_gro_network_header(skb); } else { skb_gro_pull(skb, sizeof(*iph)); } skb_set_transport_header(skb, skb_gro_offset(skb)); NAPI_GRO_CB(skb)->proto = proto; flush--; nlen = skb_network_header_len(skb); list_for_each_entry(p, head, list) { const struct ipv6hdr *iph2; __be32 first_word; /* <Version:4><Traffic_Class:8><Flow_Label:20> */ if (!NAPI_GRO_CB(p)->same_flow) continue; iph2 = (struct ipv6hdr *)(p->data + off); first_word = *(__be32 *)iph ^ *(__be32 *)iph2; /* All fields must match except length and Traffic Class. * XXX skbs on the gro_list have all been parsed and pulled * already so we don't need to compare nlen * (nlen != (sizeof(*iph2) + ipv6_exthdrs_len(iph2, &ops))) * memcmp() alone below is sufficient, right? */ if ((first_word & htonl(0xF00FFFFF)) || !ipv6_addr_equal(&iph->saddr, &iph2->saddr) || !ipv6_addr_equal(&iph->daddr, &iph2->daddr) || iph->nexthdr != iph2->nexthdr) { not_same_flow: NAPI_GRO_CB(p)->same_flow = 0; continue; } if (unlikely(nlen > sizeof(struct ipv6hdr))) { if (memcmp(iph + 1, iph2 + 1, nlen - sizeof(struct ipv6hdr))) goto not_same_flow; } /* flush if Traffic Class fields are different */ NAPI_GRO_CB(p)->flush |= !!((first_word & htonl(0x0FF00000)) | (__force __be32)(iph->hop_limit ^ iph2->hop_limit)); NAPI_GRO_CB(p)->flush |= flush; /* If the previous IP ID value was based on an atomic * datagram we can overwrite the value and ignore it. */ if (NAPI_GRO_CB(skb)->is_atomic) NAPI_GRO_CB(p)->flush_id = 0; } NAPI_GRO_CB(skb)->is_atomic = true; NAPI_GRO_CB(skb)->flush |= flush; skb_gro_postpull_rcsum(skb, iph, nlen); pp = indirect_call_gro_receive_l4(tcp6_gro_receive, udp6_gro_receive, ops->callbacks.gro_receive, head, skb); out: skb_gro_flush_final(skb, pp, flush); return pp; } static struct sk_buff *sit_ip6ip6_gro_receive(struct list_head *head, struct sk_buff *skb) { /* Common GRO receive for SIT and IP6IP6 */ if (NAPI_GRO_CB(skb)->encap_mark) { NAPI_GRO_CB(skb)->flush = 1; return NULL; } NAPI_GRO_CB(skb)->encap_mark = 1; return ipv6_gro_receive(head, skb); } static struct sk_buff *ip4ip6_gro_receive(struct list_head *head, struct sk_buff *skb) { /* Common GRO receive for SIT and IP6IP6 */ if (NAPI_GRO_CB(skb)->encap_mark) { NAPI_GRO_CB(skb)->flush = 1; return NULL; } NAPI_GRO_CB(skb)->encap_mark = 1; return inet_gro_receive(head, skb); } INDIRECT_CALLABLE_SCOPE int ipv6_gro_complete(struct sk_buff *skb, int nhoff) { const struct net_offload *ops; struct ipv6hdr *iph; int err = -ENOSYS; u32 payload_len; if (skb->encapsulation) { skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IPV6)); skb_set_inner_network_header(skb, nhoff); } payload_len = skb->len - nhoff - sizeof(*iph); if (unlikely(payload_len > IPV6_MAXPLEN)) { struct hop_jumbo_hdr *hop_jumbo; int hoplen = sizeof(*hop_jumbo); /* Move network header left */ memmove(skb_mac_header(skb) - hoplen, skb_mac_header(skb), skb->transport_header - skb->mac_header); skb->data -= hoplen; skb->len += hoplen; skb->mac_header -= hoplen; skb->network_header -= hoplen; iph = (struct ipv6hdr *)(skb->data + nhoff); hop_jumbo = (struct hop_jumbo_hdr *)(iph + 1); /* Build hop-by-hop options */ hop_jumbo->nexthdr = iph->nexthdr; hop_jumbo->hdrlen = 0; hop_jumbo->tlv_type = IPV6_TLV_JUMBO; hop_jumbo->tlv_len = 4; hop_jumbo->jumbo_payload_len = htonl(payload_len + hoplen); iph->nexthdr = NEXTHDR_HOP; iph->payload_len = 0; } else { iph = (struct ipv6hdr *)(skb->data + nhoff); iph->payload_len = htons(payload_len); } nhoff += sizeof(*iph) + ipv6_exthdrs_len(iph, &ops); if (WARN_ON(!ops || !ops->callbacks.gro_complete)) goto out; err = INDIRECT_CALL_L4(ops->callbacks.gro_complete, tcp6_gro_complete, udp6_gro_complete, skb, nhoff); out: return err; } static int sit_gro_complete(struct sk_buff *skb, int nhoff) { skb->encapsulation = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4; return ipv6_gro_complete(skb, nhoff); } static int ip6ip6_gro_complete(struct sk_buff *skb, int nhoff) { skb->encapsulation = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP6; return ipv6_gro_complete(skb, nhoff); } static int ip4ip6_gro_complete(struct sk_buff *skb, int nhoff) { skb->encapsulation = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP6; return inet_gro_complete(skb, nhoff); } static struct packet_offload ipv6_packet_offload __read_mostly = { .type = cpu_to_be16(ETH_P_IPV6), .callbacks = { .gso_segment = ipv6_gso_segment, .gro_receive = ipv6_gro_receive, .gro_complete = ipv6_gro_complete, }, }; static struct sk_buff *sit_gso_segment(struct sk_buff *skb, netdev_features_t features) { if (!(skb_shinfo(skb)->gso_type & SKB_GSO_IPXIP4)) return ERR_PTR(-EINVAL); return ipv6_gso_segment(skb, features); } static struct sk_buff *ip4ip6_gso_segment(struct sk_buff *skb, netdev_features_t features) { if (!(skb_shinfo(skb)->gso_type & SKB_GSO_IPXIP6)) return ERR_PTR(-EINVAL); return inet_gso_segment(skb, features); } static struct sk_buff *ip6ip6_gso_segment(struct sk_buff *skb, netdev_features_t features) { if (!(skb_shinfo(skb)->gso_type & SKB_GSO_IPXIP6)) return ERR_PTR(-EINVAL); return ipv6_gso_segment(skb, features); } static const struct net_offload sit_offload = { .callbacks = { .gso_segment = sit_gso_segment, .gro_receive = sit_ip6ip6_gro_receive, .gro_complete = sit_gro_complete, }, }; static const struct net_offload ip4ip6_offload = { .callbacks = { .gso_segment = ip4ip6_gso_segment, .gro_receive = ip4ip6_gro_receive, .gro_complete = ip4ip6_gro_complete, }, }; static const struct net_offload ip6ip6_offload = { .callbacks = { .gso_segment = ip6ip6_gso_segment, .gro_receive = sit_ip6ip6_gro_receive, .gro_complete = ip6ip6_gro_complete, }, }; static int __init ipv6_offload_init(void) { if (tcpv6_offload_init() < 0) pr_crit("%s: Cannot add TCP protocol offload\n", __func__); if (ipv6_exthdrs_offload_init() < 0) pr_crit("%s: Cannot add EXTHDRS protocol offload\n", __func__); dev_add_offload(&ipv6_packet_offload); inet_add_offload(&sit_offload, IPPROTO_IPV6); inet6_add_offload(&ip6ip6_offload, IPPROTO_IPV6); inet6_add_offload(&ip4ip6_offload, IPPROTO_IPIP); return 0; } fs_initcall(ipv6_offload_init);
5 201 202 450 450 12 2 1 450 445 5 201 202 18 7 12 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/errno.h> #include <linux/unistd.h> #include <linux/uaccess.h> #include <linux/syscalls.h> #include <asm/ucontext.h> #include <asm/fpu/signal.h> #include <asm/sighandling.h> #include <asm/syscall.h> #include <asm/sigframe.h> #include <asm/signal.h> /* * If regs->ss will cause an IRET fault, change it. Otherwise leave it * alone. Using this generally makes no sense unless * user_64bit_mode(regs) would return true. */ static void force_valid_ss(struct pt_regs *regs) { u32 ar; asm volatile ("lar %[old_ss], %[ar]\n\t" "jz 1f\n\t" /* If invalid: */ "xorl %[ar], %[ar]\n\t" /* set ar = 0 */ "1:" : [ar] "=r" (ar) : [old_ss] "rm" ((u16)regs->ss)); /* * For a valid 64-bit user context, we need DPL 3, type * read-write data or read-write exp-down data, and S and P * set. We can't use VERW because VERW doesn't check the * P bit. */ ar &= AR_DPL_MASK | AR_S | AR_P | AR_TYPE_MASK; if (ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA) && ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA_EXPDOWN)) regs->ss = __USER_DS; } static bool restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *usc, unsigned long uc_flags) { struct sigcontext sc; /* Always make any pending restarted system calls return -EINTR */ current->restart_block.fn = do_no_restart_syscall; if (copy_from_user(&sc, usc, offsetof(struct sigcontext, reserved1))) return false; regs->bx = sc.bx; regs->cx = sc.cx; regs->dx = sc.dx; regs->si = sc.si; regs->di = sc.di; regs->bp = sc.bp; regs->ax = sc.ax; regs->sp = sc.sp; regs->ip = sc.ip; regs->r8 = sc.r8; regs->r9 = sc.r9; regs->r10 = sc.r10; regs->r11 = sc.r11; regs->r12 = sc.r12; regs->r13 = sc.r13; regs->r14 = sc.r14; regs->r15 = sc.r15; /* Get CS/SS and force CPL3 */ regs->cs = sc.cs | 0x03; regs->ss = sc.ss | 0x03; regs->flags = (regs->flags & ~FIX_EFLAGS) | (sc.flags & FIX_EFLAGS); /* disable syscall checks */ regs->orig_ax = -1; /* * Fix up SS if needed for the benefit of old DOSEMU and * CRIU. */ if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && user_64bit_mode(regs))) force_valid_ss(regs); return fpu__restore_sig((void __user *)sc.fpstate, 0); } static __always_inline int __unsafe_setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, struct pt_regs *regs, unsigned long mask) { unsafe_put_user(regs->di, &sc->di, Efault); unsafe_put_user(regs->si, &sc->si, Efault); unsafe_put_user(regs->bp, &sc->bp, Efault); unsafe_put_user(regs->sp, &sc->sp, Efault); unsafe_put_user(regs->bx, &sc->bx, Efault); unsafe_put_user(regs->dx, &sc->dx, Efault); unsafe_put_user(regs->cx, &sc->cx, Efault); unsafe_put_user(regs->ax, &sc->ax, Efault); unsafe_put_user(regs->r8, &sc->r8, Efault); unsafe_put_user(regs->r9, &sc->r9, Efault); unsafe_put_user(regs->r10, &sc->r10, Efault); unsafe_put_user(regs->r11, &sc->r11, Efault); unsafe_put_user(regs->r12, &sc->r12, Efault); unsafe_put_user(regs->r13, &sc->r13, Efault); unsafe_put_user(regs->r14, &sc->r14, Efault); unsafe_put_user(regs->r15, &sc->r15, Efault); unsafe_put_user(current->thread.trap_nr, &sc->trapno, Efault); unsafe_put_user(current->thread.error_code, &sc->err, Efault); unsafe_put_user(regs->ip, &sc->ip, Efault); unsafe_put_user(regs->flags, &sc->flags, Efault); unsafe_put_user(regs->cs, &sc->cs, Efault); unsafe_put_user(0, &sc->gs, Efault); unsafe_put_user(0, &sc->fs, Efault); unsafe_put_user(regs->ss, &sc->ss, Efault); unsafe_put_user(fpstate, (unsigned long __user *)&sc->fpstate, Efault); /* non-iBCS2 extensions.. */ unsafe_put_user(mask, &sc->oldmask, Efault); unsafe_put_user(current->thread.cr2, &sc->cr2, Efault); return 0; Efault: return -EFAULT; } #define unsafe_put_sigcontext(sc, fp, regs, set, label) \ do { \ if (__unsafe_setup_sigcontext(sc, fp, regs, set->sig[0])) \ goto label; \ } while(0); #define unsafe_put_sigmask(set, frame, label) \ unsafe_put_user(*(__u64 *)(set), \ (__u64 __user *)&(frame)->uc.uc_sigmask, \ label) static unsigned long frame_uc_flags(struct pt_regs *regs) { unsigned long flags; if (boot_cpu_has(X86_FEATURE_XSAVE)) flags = UC_FP_XSTATE | UC_SIGCONTEXT_SS; else flags = UC_SIGCONTEXT_SS; if (likely(user_64bit_mode(regs))) flags |= UC_STRICT_RESTORE_SS; return flags; } int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) { sigset_t *set = sigmask_to_save(); struct rt_sigframe __user *frame; void __user *fp = NULL; unsigned long uc_flags; /* x86-64 should always use SA_RESTORER. */ if (!(ksig->ka.sa.sa_flags & SA_RESTORER)) return -EFAULT; frame = get_sigframe(ksig, regs, sizeof(struct rt_sigframe), &fp); uc_flags = frame_uc_flags(regs); if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; /* Create the ucontext. */ unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault); unsafe_put_user(0, &frame->uc.uc_link, Efault); unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); /* Set up to return from userspace. If provided, use a stub already in userspace. */ unsafe_put_user(ksig->ka.sa.sa_restorer, &frame->pretcode, Efault); unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); unsafe_put_sigmask(set, frame, Efault); user_access_end(); if (ksig->ka.sa.sa_flags & SA_SIGINFO) { if (copy_siginfo_to_user(&frame->info, &ksig->info)) return -EFAULT; } if (setup_signal_shadow_stack(ksig)) return -EFAULT; /* Set up registers for signal handler */ regs->di = ksig->sig; /* In case the signal handler was declared without prototypes */ regs->ax = 0; /* This also works for non SA_SIGINFO handlers because they expect the next argument after the signal number on the stack. */ regs->si = (unsigned long)&frame->info; regs->dx = (unsigned long)&frame->uc; regs->ip = (unsigned long) ksig->ka.sa.sa_handler; regs->sp = (unsigned long)frame; /* * Set up the CS and SS registers to run signal handlers in * 64-bit mode, even if the handler happens to be interrupting * 32-bit or 16-bit code. * * SS is subtle. In 64-bit mode, we don't need any particular * SS descriptor, but we do need SS to be valid. It's possible * that the old SS is entirely bogus -- this can happen if the * signal we're trying to deliver is #GP or #SS caused by a bad * SS value. We also have a compatibility issue here: DOSEMU * relies on the contents of the SS register indicating the * SS value at the time of the signal, even though that code in * DOSEMU predates sigreturn's ability to restore SS. (DOSEMU * avoids relying on sigreturn to restore SS; instead it uses * a trampoline.) So we do our best: if the old SS was valid, * we keep it. Otherwise we replace it. */ regs->cs = __USER_CS; if (unlikely(regs->ss != __USER_DS)) force_valid_ss(regs); return 0; Efault: user_access_end(); return -EFAULT; } /* * Do a signal return; undo the signal stack. */ SYSCALL_DEFINE0(rt_sigreturn) { struct pt_regs *regs = current_pt_regs(); struct rt_sigframe __user *frame; sigset_t set; unsigned long uc_flags; frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__get_user(*(__u64 *)&set, (__u64 __user *)&frame->uc.uc_sigmask)) goto badframe; if (__get_user(uc_flags, &frame->uc.uc_flags)) goto badframe; set_current_blocked(&set); if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; if (restore_signal_shadow_stack()) goto badframe; if (restore_altstack(&frame->uc.uc_stack)) goto badframe; return regs->ax; badframe: signal_fault(regs, frame, "rt_sigreturn"); return 0; } #ifdef CONFIG_X86_X32_ABI static int x32_copy_siginfo_to_user(struct compat_siginfo __user *to, const struct kernel_siginfo *from) { struct compat_siginfo new; copy_siginfo_to_external32(&new, from); if (from->si_signo == SIGCHLD) { new._sifields._sigchld_x32._utime = from->si_utime; new._sifields._sigchld_x32._stime = from->si_stime; } if (copy_to_user(to, &new, sizeof(struct compat_siginfo))) return -EFAULT; return 0; } int copy_siginfo_to_user32(struct compat_siginfo __user *to, const struct kernel_siginfo *from) { if (in_x32_syscall()) return x32_copy_siginfo_to_user(to, from); return __copy_siginfo_to_user32(to, from); } int x32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) { compat_sigset_t *set = (compat_sigset_t *) sigmask_to_save(); struct rt_sigframe_x32 __user *frame; unsigned long uc_flags; void __user *restorer; void __user *fp = NULL; if (!(ksig->ka.sa.sa_flags & SA_RESTORER)) return -EFAULT; frame = get_sigframe(ksig, regs, sizeof(*frame), &fp); uc_flags = frame_uc_flags(regs); if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; /* Create the ucontext. */ unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault); unsafe_put_user(0, &frame->uc.uc_link, Efault); unsafe_compat_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); unsafe_put_user(0, &frame->uc.uc__pad0, Efault); restorer = ksig->ka.sa.sa_restorer; unsafe_put_user(restorer, (unsigned long __user *)&frame->pretcode, Efault); unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); unsafe_put_sigmask(set, frame, Efault); user_access_end(); if (ksig->ka.sa.sa_flags & SA_SIGINFO) { if (x32_copy_siginfo_to_user(&frame->info, &ksig->info)) return -EFAULT; } /* Set up registers for signal handler */ regs->sp = (unsigned long) frame; regs->ip = (unsigned long) ksig->ka.sa.sa_handler; /* We use the x32 calling convention here... */ regs->di = ksig->sig; regs->si = (unsigned long) &frame->info; regs->dx = (unsigned long) &frame->uc; loadsegment(ds, __USER_DS); loadsegment(es, __USER_DS); regs->cs = __USER_CS; regs->ss = __USER_DS; return 0; Efault: user_access_end(); return -EFAULT; } COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn) { struct pt_regs *regs = current_pt_regs(); struct rt_sigframe_x32 __user *frame; sigset_t set; unsigned long uc_flags; frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__get_user(set.sig[0], (__u64 __user *)&frame->uc.uc_sigmask)) goto badframe; if (__get_user(uc_flags, &frame->uc.uc_flags)) goto badframe; set_current_blocked(&set); if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; if (compat_restore_altstack(&frame->uc.uc_stack)) goto badframe; return regs->ax; badframe: signal_fault(regs, frame, "x32 rt_sigreturn"); return 0; } #endif /* CONFIG_X86_X32_ABI */ #ifdef CONFIG_COMPAT void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact) { if (!act) return; if (in_ia32_syscall()) act->sa.sa_flags |= SA_IA32_ABI; if (in_x32_syscall()) act->sa.sa_flags |= SA_X32_ABI; } #endif /* CONFIG_COMPAT */ /* * If adding a new si_code, there is probably new data in * the siginfo. Make sure folks bumping the si_code * limits also have to look at this code. Make sure any * new fields are handled in copy_siginfo_to_user32()! */ static_assert(NSIGILL == 11); static_assert(NSIGFPE == 15); static_assert(NSIGSEGV == 10); static_assert(NSIGBUS == 5); static_assert(NSIGTRAP == 6); static_assert(NSIGCHLD == 6); static_assert(NSIGSYS == 2); /* This is part of the ABI and can never change in size: */ static_assert(sizeof(siginfo_t) == 128); /* This is a part of the ABI and can never change in alignment */ static_assert(__alignof__(siginfo_t) == 8); /* * The offsets of all the (unioned) si_fields are fixed * in the ABI, of course. Make sure none of them ever * move and are always at the beginning: */ static_assert(offsetof(siginfo_t, si_signo) == 0); static_assert(offsetof(siginfo_t, si_errno) == 4); static_assert(offsetof(siginfo_t, si_code) == 8); /* * Ensure that the size of each si_field never changes. * If it does, it is a sign that the * copy_siginfo_to_user32() code below needs to updated * along with the size in the CHECK_SI_SIZE(). * * We repeat this check for both the generic and compat * siginfos. * * Note: it is OK for these to grow as long as the whole * structure stays within the padding size (checked * above). */ #define CHECK_SI_OFFSET(name) \ static_assert(offsetof(siginfo_t, _sifields) == \ offsetof(siginfo_t, _sifields.name)) #define CHECK_SI_SIZE(name, size) \ static_assert(sizeof_field(siginfo_t, _sifields.name) == size) CHECK_SI_OFFSET(_kill); CHECK_SI_SIZE (_kill, 2*sizeof(int)); static_assert(offsetof(siginfo_t, si_pid) == 0x10); static_assert(offsetof(siginfo_t, si_uid) == 0x14); CHECK_SI_OFFSET(_timer); CHECK_SI_SIZE (_timer, 6*sizeof(int)); static_assert(offsetof(siginfo_t, si_tid) == 0x10); static_assert(offsetof(siginfo_t, si_overrun) == 0x14); static_assert(offsetof(siginfo_t, si_value) == 0x18); CHECK_SI_OFFSET(_rt); CHECK_SI_SIZE (_rt, 4*sizeof(int)); static_assert(offsetof(siginfo_t, si_pid) == 0x10); static_assert(offsetof(siginfo_t, si_uid) == 0x14); static_assert(offsetof(siginfo_t, si_value) == 0x18); CHECK_SI_OFFSET(_sigchld); CHECK_SI_SIZE (_sigchld, 8*sizeof(int)); static_assert(offsetof(siginfo_t, si_pid) == 0x10); static_assert(offsetof(siginfo_t, si_uid) == 0x14); static_assert(offsetof(siginfo_t, si_status) == 0x18); static_assert(offsetof(siginfo_t, si_utime) == 0x20); static_assert(offsetof(siginfo_t, si_stime) == 0x28); #ifdef CONFIG_X86_X32_ABI /* no _sigchld_x32 in the generic siginfo_t */ static_assert(sizeof_field(compat_siginfo_t, _sifields._sigchld_x32) == 7*sizeof(int)); static_assert(offsetof(compat_siginfo_t, _sifields) == offsetof(compat_siginfo_t, _sifields._sigchld_x32)); static_assert(offsetof(compat_siginfo_t, _sifields._sigchld_x32._utime) == 0x18); static_assert(offsetof(compat_siginfo_t, _sifields._sigchld_x32._stime) == 0x20); #endif CHECK_SI_OFFSET(_sigfault); CHECK_SI_SIZE (_sigfault, 8*sizeof(int)); static_assert(offsetof(siginfo_t, si_addr) == 0x10); static_assert(offsetof(siginfo_t, si_trapno) == 0x18); static_assert(offsetof(siginfo_t, si_addr_lsb) == 0x18); static_assert(offsetof(siginfo_t, si_lower) == 0x20); static_assert(offsetof(siginfo_t, si_upper) == 0x28); static_assert(offsetof(siginfo_t, si_pkey) == 0x20); static_assert(offsetof(siginfo_t, si_perf_data) == 0x18); static_assert(offsetof(siginfo_t, si_perf_type) == 0x20); static_assert(offsetof(siginfo_t, si_perf_flags) == 0x24); CHECK_SI_OFFSET(_sigpoll); CHECK_SI_SIZE (_sigpoll, 4*sizeof(int)); static_assert(offsetof(siginfo_t, si_band) == 0x10); static_assert(offsetof(siginfo_t, si_fd) == 0x18); CHECK_SI_OFFSET(_sigsys); CHECK_SI_SIZE (_sigsys, 4*sizeof(int)); static_assert(offsetof(siginfo_t, si_call_addr) == 0x10); static_assert(offsetof(siginfo_t, si_syscall) == 0x18); static_assert(offsetof(siginfo_t, si_arch) == 0x1C); /* any new si_fields should be added here */
1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 // SPDX-License-Identifier: GPL-2.0-or-later /* * SPCA501 chip based cameras initialization data * * V4L2 by Jean-Francois Moine <http://moinejf.free.fr> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define MODULE_NAME "spca501" #include "gspca.h" MODULE_AUTHOR("Michel Xhaard <mxhaard@users.sourceforge.net>"); MODULE_DESCRIPTION("GSPCA/SPCA501 USB Camera Driver"); MODULE_LICENSE("GPL"); /* specific webcam descriptor */ struct sd { struct gspca_dev gspca_dev; /* !! must be the first item */ unsigned short contrast; __u8 brightness; __u8 colors; __u8 blue_balance; __u8 red_balance; char subtype; #define Arowana300KCMOSCamera 0 #define IntelCreateAndShare 1 #define KodakDVC325 2 #define MystFromOriUnknownCamera 3 #define SmileIntlCamera 4 #define ThreeComHomeConnectLite 5 #define ViewQuestM318B 6 }; static const struct v4l2_pix_format vga_mode[] = { {160, 120, V4L2_PIX_FMT_SPCA501, V4L2_FIELD_NONE, .bytesperline = 160, .sizeimage = 160 * 120 * 3 / 2, .colorspace = V4L2_COLORSPACE_SRGB, .priv = 2}, {320, 240, V4L2_PIX_FMT_SPCA501, V4L2_FIELD_NONE, .bytesperline = 320, .sizeimage = 320 * 240 * 3 / 2, .colorspace = V4L2_COLORSPACE_SRGB, .priv = 1}, {640, 480, V4L2_PIX_FMT_SPCA501, V4L2_FIELD_NONE, .bytesperline = 640, .sizeimage = 640 * 480 * 3 / 2, .colorspace = V4L2_COLORSPACE_SRGB, .priv = 0}, }; #define SPCA50X_REG_USB 0x2 /* spca505 501 */ /* * Data to initialize a SPCA501. From a capture file provided by Bill Roehl * With SPCA501 chip description */ #define CCDSP_SET /* set CCDSP parameters */ #define TG_SET /* set time generator set */ #undef DSPWIN_SET /* set DSP windows parameters */ #undef ALTER_GAMA /* Set alternate set to YUV transform coeffs. */ #define SPCA501_SNAPBIT 0x80 #define SPCA501_SNAPCTRL 0x10 /* Frame packet header offsets for the spca501 */ #define SPCA501_OFFSET_GPIO 1 #define SPCA501_OFFSET_TYPE 2 #define SPCA501_OFFSET_TURN3A 3 #define SPCA501_OFFSET_FRAMSEQ 4 #define SPCA501_OFFSET_COMPRESS 5 #define SPCA501_OFFSET_QUANT 6 #define SPCA501_OFFSET_QUANT2 7 #define SPCA501_OFFSET_DATA 8 #define SPCA501_PROP_COMP_ENABLE(d) ((d) & 1) #define SPCA501_PROP_SNAP(d) ((d) & 0x40) #define SPCA501_PROP_SNAP_CTRL(d) ((d) & 0x10) #define SPCA501_PROP_COMP_THRESH(d) (((d) & 0x0e) >> 1) #define SPCA501_PROP_COMP_QUANT(d) (((d) & 0x70) >> 4) /* SPCA501 CCDSP control */ #define SPCA501_REG_CCDSP 0x01 /* SPCA501 control/status registers */ #define SPCA501_REG_CTLRL 0x02 /* registers for color correction and YUV transformation */ #define SPCA501_A11 0x08 #define SPCA501_A12 0x09 #define SPCA501_A13 0x0A #define SPCA501_A21 0x0B #define SPCA501_A22 0x0C #define SPCA501_A23 0x0D #define SPCA501_A31 0x0E #define SPCA501_A32 0x0F #define SPCA501_A33 0x10 /* Data for video camera initialization before capturing */ static const __u16 spca501_open_data[][3] = { /* bmRequest,value,index */ {0x2, 0x50, 0x00}, /* C/S enable soft reset */ {0x2, 0x40, 0x00}, /* C/S disable soft reset */ {0x2, 0x02, 0x05}, /* C/S general purpose I/O data */ {0x2, 0x03, 0x05}, /* C/S general purpose I/O data */ #ifdef CCDSP_SET {0x1, 0x38, 0x01}, /* CCDSP options */ {0x1, 0x05, 0x02}, /* CCDSP Optical black level for user settings */ {0x1, 0xC0, 0x03}, /* CCDSP Optical black settings */ {0x1, 0x67, 0x07}, {0x1, 0x63, 0x3f}, /* CCDSP CCD gamma enable */ {0x1, 0x03, 0x56}, /* Add gamma correction */ {0x1, 0xFF, 0x15}, /* CCDSP High luminance for white balance */ {0x1, 0x01, 0x16}, /* CCDSP Low luminance for white balance */ /* Color correction and RGB-to-YUV transformation coefficients changing */ #ifdef ALTER_GAMA {0x0, 0x00, 0x08}, /* A11 */ {0x0, 0x00, 0x09}, /* A12 */ {0x0, 0x90, 0x0A}, /* A13 */ {0x0, 0x12, 0x0B}, /* A21 */ {0x0, 0x00, 0x0C}, /* A22 */ {0x0, 0x00, 0x0D}, /* A23 */ {0x0, 0x00, 0x0E}, /* A31 */ {0x0, 0x02, 0x0F}, /* A32 */ {0x0, 0x00, 0x10}, /* A33 */ #else {0x1, 0x2a, 0x08}, /* A11 0x31 */ {0x1, 0xf8, 0x09}, /* A12 f8 */ {0x1, 0xf8, 0x0A}, /* A13 f8 */ {0x1, 0xf8, 0x0B}, /* A21 f8 */ {0x1, 0x14, 0x0C}, /* A22 0x14 */ {0x1, 0xf8, 0x0D}, /* A23 f8 */ {0x1, 0xf8, 0x0E}, /* A31 f8 */ {0x1, 0xf8, 0x0F}, /* A32 f8 */ {0x1, 0x20, 0x10}, /* A33 0x20 */ #endif {0x1, 0x00, 0x11}, /* R offset */ {0x1, 0x00, 0x12}, /* G offset */ {0x1, 0x00, 0x13}, /* B offset */ {0x1, 0x00, 0x14}, /* GB offset */ #endif #ifdef TG_SET /* Time generator manipulations */ {0x0, 0xfc, 0x0}, /* Set up high bits of shutter speed */ {0x0, 0x01, 0x1}, /* Set up low bits of shutter speed */ {0x0, 0xe4, 0x04}, /* DCLK*2 clock phase adjustment */ {0x0, 0x08, 0x05}, /* ADCK phase adjustment, inv. ext. VB */ {0x0, 0x03, 0x06}, /* FR phase adjustment */ {0x0, 0x01, 0x07}, /* FCDS phase adjustment */ {0x0, 0x39, 0x08}, /* FS phase adjustment */ {0x0, 0x88, 0x0a}, /* FH1 phase and delay adjustment */ {0x0, 0x03, 0x0f}, /* pixel identification */ {0x0, 0x00, 0x11}, /* clock source selection (default) */ /*VERY strange manipulations with * select DMCLP or OBPX to be ADCLP output (0x0C) * OPB always toggle or not (0x0D) but they allow * us to set up brightness */ {0x0, 0x01, 0x0c}, {0x0, 0xe0, 0x0d}, /* Done */ #endif #ifdef DSPWIN_SET {0x1, 0xa0, 0x01}, /* Setting image processing parameters */ {0x1, 0x1c, 0x17}, /* Changing Windows positions X1 */ {0x1, 0xe2, 0x19}, /* X2 */ {0x1, 0x1c, 0x1b}, /* X3 */ {0x1, 0xe2, 0x1d}, /* X4 */ {0x1, 0x5f, 0x1f}, /* X5 */ {0x1, 0x32, 0x20}, /* Y5 */ {0x1, 0x01, 0x10}, /* Changing A33 */ #endif {0x2, 0x204a, 0x07},/* Setting video compression & resolution 160x120 */ {0x2, 0x94, 0x06}, /* Setting video no compression */ {} }; /* The SPCAxxx docs from Sunplus document these values in tables, one table per register number. In the data below, dmRequest is the register number, index is the Addr, and value is a combination of Bit values. Bit Value (hex) 0 01 1 02 2 04 3 08 4 10 5 20 6 40 7 80 */ /* Data for chip initialization (set default values) */ static const __u16 spca501_init_data[][3] = { /* Set all the values to powerup defaults */ /* bmRequest,value,index */ {0x0, 0xAA, 0x00}, {0x0, 0x02, 0x01}, {0x0, 0x01, 0x02}, {0x0, 0x02, 0x03}, {0x0, 0xCE, 0x04}, {0x0, 0x00, 0x05}, {0x0, 0x00, 0x06}, {0x0, 0x00, 0x07}, {0x0, 0x00, 0x08}, {0x0, 0x00, 0x09}, {0x0, 0x90, 0x0A}, {0x0, 0x12, 0x0B}, {0x0, 0x00, 0x0C}, {0x0, 0x00, 0x0D}, {0x0, 0x00, 0x0E}, {0x0, 0x02, 0x0F}, {0x0, 0x00, 0x10}, {0x0, 0x00, 0x11}, {0x0, 0x00, 0x12}, {0x0, 0x00, 0x13}, {0x0, 0x00, 0x14}, {0x0, 0x00, 0x15}, {0x0, 0x00, 0x16}, {0x0, 0x00, 0x17}, {0x0, 0x00, 0x18}, {0x0, 0x00, 0x19}, {0x0, 0x00, 0x1A}, {0x0, 0x00, 0x1B}, {0x0, 0x00, 0x1C}, {0x0, 0x00, 0x1D}, {0x0, 0x00, 0x1E}, {0x0, 0x00, 0x1F}, {0x0, 0x00, 0x20}, {0x0, 0x00, 0x21}, {0x0, 0x00, 0x22}, {0x0, 0x00, 0x23}, {0x0, 0x00, 0x24}, {0x0, 0x00, 0x25}, {0x0, 0x00, 0x26}, {0x0, 0x00, 0x27}, {0x0, 0x00, 0x28}, {0x0, 0x00, 0x29}, {0x0, 0x00, 0x2A}, {0x0, 0x00, 0x2B}, {0x0, 0x00, 0x2C}, {0x0, 0x00, 0x2D}, {0x0, 0x00, 0x2E}, {0x0, 0x00, 0x2F}, {0x0, 0x00, 0x30}, {0x0, 0x00, 0x31}, {0x0, 0x00, 0x32}, {0x0, 0x00, 0x33}, {0x0, 0x00, 0x34}, {0x0, 0x00, 0x35}, {0x0, 0x00, 0x36}, {0x0, 0x00, 0x37}, {0x0, 0x00, 0x38}, {0x0, 0x00, 0x39}, {0x0, 0x00, 0x3A}, {0x0, 0x00, 0x3B}, {0x0, 0x00, 0x3C}, {0x0, 0x00, 0x3D}, {0x0, 0x00, 0x3E}, {0x0, 0x00, 0x3F}, {0x0, 0x00, 0x40}, {0x0, 0x00, 0x41}, {0x0, 0x00, 0x42}, {0x0, 0x00, 0x43}, {0x0, 0x00, 0x44}, {0x0, 0x00, 0x45}, {0x0, 0x00, 0x46}, {0x0, 0x00, 0x47}, {0x0, 0x00, 0x48}, {0x0, 0x00, 0x49}, {0x0, 0x00, 0x4A}, {0x0, 0x00, 0x4B}, {0x0, 0x00, 0x4C}, {0x0, 0x00, 0x4D}, {0x0, 0x00, 0x4E}, {0x0, 0x00, 0x4F}, {0x0, 0x00, 0x50}, {0x0, 0x00, 0x51}, {0x0, 0x00, 0x52}, {0x0, 0x00, 0x53}, {0x0, 0x00, 0x54}, {0x0, 0x00, 0x55}, {0x0, 0x00, 0x56}, {0x0, 0x00, 0x57}, {0x0, 0x00, 0x58}, {0x0, 0x00, 0x59}, {0x0, 0x00, 0x5A}, {0x0, 0x00, 0x5B}, {0x0, 0x00, 0x5C}, {0x0, 0x00, 0x5D}, {0x0, 0x00, 0x5E}, {0x0, 0x00, 0x5F}, {0x0, 0x00, 0x60}, {0x0, 0x00, 0x61}, {0x0, 0x00, 0x62}, {0x0, 0x00, 0x63}, {0x0, 0x00, 0x64}, {0x0, 0x00, 0x65}, {0x0, 0x00, 0x66}, {0x0, 0x00, 0x67}, {0x0, 0x00, 0x68}, {0x0, 0x00, 0x69}, {0x0, 0x00, 0x6A}, {0x0, 0x00, 0x6B}, {0x0, 0x00, 0x6C}, {0x0, 0x00, 0x6D}, {0x0, 0x00, 0x6E}, {0x0, 0x00, 0x6F}, {0x0, 0x00, 0x70}, {0x0, 0x00, 0x71}, {0x0, 0x00, 0x72}, {0x0, 0x00, 0x73}, {0x0, 0x00, 0x74}, {0x0, 0x00, 0x75}, {0x0, 0x00, 0x76}, {0x0, 0x00, 0x77}, {0x0, 0x00, 0x78}, {0x0, 0x00, 0x79}, {0x0, 0x00, 0x7A}, {0x0, 0x00, 0x7B}, {0x0, 0x00, 0x7C}, {0x0, 0x00, 0x7D}, {0x0, 0x00, 0x7E}, {0x0, 0x00, 0x7F}, {0x0, 0x00, 0x80}, {0x0, 0x00, 0x81}, {0x0, 0x00, 0x82}, {0x0, 0x00, 0x83}, {0x0, 0x00, 0x84}, {0x0, 0x00, 0x85}, {0x0, 0x00, 0x86}, {0x0, 0x00, 0x87}, {0x0, 0x00, 0x88}, {0x0, 0x00, 0x89}, {0x0, 0x00, 0x8A}, {0x0, 0x00, 0x8B}, {0x0, 0x00, 0x8C}, {0x0, 0x00, 0x8D}, {0x0, 0x00, 0x8E}, {0x0, 0x00, 0x8F}, {0x0, 0x00, 0x90}, {0x0, 0x00, 0x91}, {0x0, 0x00, 0x92}, {0x0, 0x00, 0x93}, {0x0, 0x00, 0x94}, {0x0, 0x00, 0x95}, {0x0, 0x00, 0x96}, {0x0, 0x00, 0x97}, {0x0, 0x00, 0x98}, {0x0, 0x00, 0x99}, {0x0, 0x00, 0x9A}, {0x0, 0x00, 0x9B}, {0x0, 0x00, 0x9C}, {0x0, 0x00, 0x9D}, {0x0, 0x00, 0x9E}, {0x0, 0x00, 0x9F}, {0x0, 0x00, 0xA0}, {0x0, 0x00, 0xA1}, {0x0, 0x00, 0xA2}, {0x0, 0x00, 0xA3}, {0x0, 0x00, 0xA4}, {0x0, 0x00, 0xA5}, {0x0, 0x00, 0xA6}, {0x0, 0x00, 0xA7}, {0x0, 0x00, 0xA8}, {0x0, 0x00, 0xA9}, {0x0, 0x00, 0xAA}, {0x0, 0x00, 0xAB}, {0x0, 0x00, 0xAC}, {0x0, 0x00, 0xAD}, {0x0, 0x00, 0xAE}, {0x0, 0x00, 0xAF}, {0x0, 0x00, 0xB0}, {0x0, 0x00, 0xB1}, {0x0, 0x00, 0xB2}, {0x0, 0x00, 0xB3}, {0x0, 0x00, 0xB4}, {0x0, 0x00, 0xB5}, {0x0, 0x00, 0xB6}, {0x0, 0x00, 0xB7}, {0x0, 0x00, 0xB8}, {0x0, 0x00, 0xB9}, {0x0, 0x00, 0xBA}, {0x0, 0x00, 0xBB}, {0x0, 0x00, 0xBC}, {0x0, 0x00, 0xBD}, {0x0, 0x00, 0xBE}, {0x0, 0x00, 0xBF}, {0x0, 0x00, 0xC0}, {0x0, 0x00, 0xC1}, {0x0, 0x00, 0xC2}, {0x0, 0x00, 0xC3}, {0x0, 0x00, 0xC4}, {0x0, 0x00, 0xC5}, {0x0, 0x00, 0xC6}, {0x0, 0x00, 0xC7}, {0x0, 0x00, 0xC8}, {0x0, 0x00, 0xC9}, {0x0, 0x00, 0xCA}, {0x0, 0x00, 0xCB}, {0x0, 0x00, 0xCC}, {0x1, 0xF4, 0x00}, {0x1, 0x38, 0x01}, {0x1, 0x40, 0x02}, {0x1, 0x0A, 0x03}, {0x1, 0x40, 0x04}, {0x1, 0x40, 0x05}, {0x1, 0x40, 0x06}, {0x1, 0x67, 0x07}, {0x1, 0x31, 0x08}, {0x1, 0x00, 0x09}, {0x1, 0x00, 0x0A}, {0x1, 0x00, 0x0B}, {0x1, 0x14, 0x0C}, {0x1, 0x00, 0x0D}, {0x1, 0x00, 0x0E}, {0x1, 0x00, 0x0F}, {0x1, 0x1E, 0x10}, {0x1, 0x00, 0x11}, {0x1, 0x00, 0x12}, {0x1, 0x00, 0x13}, {0x1, 0x00, 0x14}, {0x1, 0xFF, 0x15}, {0x1, 0x01, 0x16}, {0x1, 0x32, 0x17}, {0x1, 0x23, 0x18}, {0x1, 0xCE, 0x19}, {0x1, 0x23, 0x1A}, {0x1, 0x32, 0x1B}, {0x1, 0x8D, 0x1C}, {0x1, 0xCE, 0x1D}, {0x1, 0x8D, 0x1E}, {0x1, 0x00, 0x1F}, {0x1, 0x00, 0x20}, {0x1, 0xFF, 0x3E}, {0x1, 0x02, 0x3F}, {0x1, 0x00, 0x40}, {0x1, 0x00, 0x41}, {0x1, 0x00, 0x42}, {0x1, 0x00, 0x43}, {0x1, 0x00, 0x44}, {0x1, 0x00, 0x45}, {0x1, 0x00, 0x46}, {0x1, 0x00, 0x47}, {0x1, 0x00, 0x48}, {0x1, 0x00, 0x49}, {0x1, 0x00, 0x4A}, {0x1, 0x00, 0x4B}, {0x1, 0x00, 0x4C}, {0x1, 0x00, 0x4D}, {0x1, 0x00, 0x4E}, {0x1, 0x00, 0x4F}, {0x1, 0x00, 0x50}, {0x1, 0x00, 0x51}, {0x1, 0x00, 0x52}, {0x1, 0x00, 0x53}, {0x1, 0x00, 0x54}, {0x1, 0x00, 0x55}, {0x1, 0x00, 0x56}, {0x1, 0x00, 0x57}, {0x1, 0x00, 0x58}, {0x1, 0x00, 0x59}, {0x1, 0x00, 0x5A}, {0x2, 0x03, 0x00}, {0x2, 0x00, 0x01}, {0x2, 0x00, 0x05}, {0x2, 0x00, 0x06}, {0x2, 0x00, 0x07}, {0x2, 0x00, 0x10}, {0x2, 0x00, 0x11}, /* Strange - looks like the 501 driver doesn't do anything * at insert time except read the EEPROM */ {} }; /* Data for video camera init before capture. * Capture and decoding by Colin Peart. * This is for the 3com HomeConnect Lite which is spca501a based. */ static const __u16 spca501_3com_open_data[][3] = { /* bmRequest,value,index */ {0x2, 0x0050, 0x0000}, /* C/S Enable TG soft reset, timing mode=010 */ {0x2, 0x0043, 0x0000}, /* C/S Disable TG soft reset, timing mode=010 */ {0x2, 0x0002, 0x0005}, /* C/S GPIO */ {0x2, 0x0003, 0x0005}, /* C/S GPIO */ #ifdef CCDSP_SET {0x1, 0x0020, 0x0001}, /* CCDSP Options */ {0x1, 0x0020, 0x0002}, /* CCDSP Black Level */ {0x1, 0x006e, 0x0007}, /* CCDSP Gamma options */ {0x1, 0x0090, 0x0015}, /* CCDSP Luminance Low */ {0x1, 0x00ff, 0x0016}, /* CCDSP Luminance High */ {0x1, 0x0003, 0x003F}, /* CCDSP Gamma correction toggle */ #ifdef ALTER_GAMMA {0x1, 0x0010, 0x0008}, /* CCDSP YUV A11 */ {0x1, 0x0000, 0x0009}, /* CCDSP YUV A12 */ {0x1, 0x0000, 0x000a}, /* CCDSP YUV A13 */ {0x1, 0x0000, 0x000b}, /* CCDSP YUV A21 */ {0x1, 0x0010, 0x000c}, /* CCDSP YUV A22 */ {0x1, 0x0000, 0x000d}, /* CCDSP YUV A23 */ {0x1, 0x0000, 0x000e}, /* CCDSP YUV A31 */ {0x1, 0x0000, 0x000f}, /* CCDSP YUV A32 */ {0x1, 0x0010, 0x0010}, /* CCDSP YUV A33 */ {0x1, 0x0000, 0x0011}, /* CCDSP R Offset */ {0x1, 0x0000, 0x0012}, /* CCDSP G Offset */ {0x1, 0x0001, 0x0013}, /* CCDSP B Offset */ {0x1, 0x0001, 0x0014}, /* CCDSP BG Offset */ {0x1, 0x003f, 0x00C1}, /* CCDSP Gamma Correction Enable */ #endif #endif #ifdef TG_SET {0x0, 0x00fc, 0x0000}, /* TG Shutter Speed High Bits */ {0x0, 0x0000, 0x0001}, /* TG Shutter Speed Low Bits */ {0x0, 0x00e4, 0x0004}, /* TG DCLK*2 Adjust */ {0x0, 0x0008, 0x0005}, /* TG ADCK Adjust */ {0x0, 0x0003, 0x0006}, /* TG FR Phase Adjust */ {0x0, 0x0001, 0x0007}, /* TG FCDS Phase Adjust */ {0x0, 0x0039, 0x0008}, /* TG FS Phase Adjust */ {0x0, 0x0088, 0x000a}, /* TG MH1 */ {0x0, 0x0003, 0x000f}, /* TG Pixel ID */ /* Like below, unexplained toglleing */ {0x0, 0x0080, 0x000c}, {0x0, 0x0000, 0x000d}, {0x0, 0x0080, 0x000c}, {0x0, 0x0004, 0x000d}, {0x0, 0x0000, 0x000c}, {0x0, 0x0000, 0x000d}, {0x0, 0x0040, 0x000c}, {0x0, 0x0017, 0x000d}, {0x0, 0x00c0, 0x000c}, {0x0, 0x0000, 0x000d}, {0x0, 0x0080, 0x000c}, {0x0, 0x0006, 0x000d}, {0x0, 0x0080, 0x000c}, {0x0, 0x0004, 0x000d}, {0x0, 0x0002, 0x0003}, #endif #ifdef DSPWIN_SET {0x1, 0x001c, 0x0017}, /* CCDSP W1 Start X */ {0x1, 0x00e2, 0x0019}, /* CCDSP W2 Start X */ {0x1, 0x001c, 0x001b}, /* CCDSP W3 Start X */ {0x1, 0x00e2, 0x001d}, /* CCDSP W4 Start X */ {0x1, 0x00aa, 0x001f}, /* CCDSP W5 Start X */ {0x1, 0x0070, 0x0020}, /* CCDSP W5 Start Y */ #endif {0x0, 0x0001, 0x0010}, /* TG Start Clock */ /* {0x2, 0x006a, 0x0001}, * C/S Enable ISOSYNCH Packet Engine */ {0x2, 0x0068, 0x0001}, /* C/S Disable ISOSYNCH Packet Engine */ {0x2, 0x0000, 0x0005}, {0x2, 0x0043, 0x0000}, /* C/S Set Timing Mode, Disable TG soft reset */ {0x2, 0x0043, 0x0000}, /* C/S Set Timing Mode, Disable TG soft reset */ {0x2, 0x0002, 0x0005}, /* C/S GPIO */ {0x2, 0x0003, 0x0005}, /* C/S GPIO */ {0x2, 0x006a, 0x0001}, /* C/S Enable ISOSYNCH Packet Engine */ {} }; /* * Data used to initialize a SPCA501C with HV7131B sensor. * From a capture file taken with USBSnoop v 1.5 * I have a "SPCA501C pc camera chipset" manual by sunplus, but some * of the value meanings are obscure or simply "reserved". * to do list: * 1) Understand what every value means * 2) Understand why some values seem to appear more than once * 3) Write a small comment for each line of the following arrays. */ static const __u16 spca501c_arowana_open_data[][3] = { /* bmRequest,value,index */ {0x02, 0x0007, 0x0005}, {0x02, 0xa048, 0x0000}, {0x05, 0x0022, 0x0004}, {0x01, 0x0006, 0x0011}, {0x01, 0x00ff, 0x0012}, {0x01, 0x0014, 0x0013}, {0x01, 0x0000, 0x0014}, {0x01, 0x0042, 0x0051}, {0x01, 0x0040, 0x0052}, {0x01, 0x0051, 0x0053}, {0x01, 0x0040, 0x0054}, {0x01, 0x0000, 0x0055}, {0x00, 0x0025, 0x0000}, {0x00, 0x0026, 0x0000}, {0x00, 0x0001, 0x0000}, {0x00, 0x0027, 0x0000}, {0x00, 0x008a, 0x0000}, {} }; static const __u16 spca501c_arowana_init_data[][3] = { /* bmRequest,value,index */ {0x02, 0x0007, 0x0005}, {0x02, 0xa048, 0x0000}, {0x05, 0x0022, 0x0004}, {0x01, 0x0006, 0x0011}, {0x01, 0x00ff, 0x0012}, {0x01, 0x0014, 0x0013}, {0x01, 0x0000, 0x0014}, {0x01, 0x0042, 0x0051}, {0x01, 0x0040, 0x0052}, {0x01, 0x0051, 0x0053}, {0x01, 0x0040, 0x0054}, {0x01, 0x0000, 0x0055}, {0x00, 0x0025, 0x0000}, {0x00, 0x0026, 0x0000}, {0x00, 0x0001, 0x0000}, {0x00, 0x0027, 0x0000}, {0x00, 0x008a, 0x0000}, {0x02, 0x0000, 0x0005}, {0x02, 0x0007, 0x0005}, {0x02, 0x2000, 0x0000}, {0x05, 0x0022, 0x0004}, {0x05, 0x0015, 0x0001}, {0x05, 0x00ea, 0x0000}, {0x05, 0x0021, 0x0001}, {0x05, 0x00d2, 0x0000}, {0x05, 0x0023, 0x0001}, {0x05, 0x0003, 0x0000}, {0x05, 0x0030, 0x0001}, {0x05, 0x002b, 0x0000}, {0x05, 0x0031, 0x0001}, {0x05, 0x0023, 0x0000}, {0x05, 0x0032, 0x0001}, {0x05, 0x0023, 0x0000}, {0x05, 0x0033, 0x0001}, {0x05, 0x0023, 0x0000}, {0x05, 0x0034, 0x0001}, {0x05, 0x0002, 0x0000}, {0x05, 0x0050, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0051, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0052, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0054, 0x0001}, {0x05, 0x0001, 0x0000}, {0x00, 0x0000, 0x0001}, {0x00, 0x0000, 0x0002}, {0x00, 0x000c, 0x0003}, {0x00, 0x0000, 0x0004}, {0x00, 0x0090, 0x0005}, {0x00, 0x0000, 0x0006}, {0x00, 0x0040, 0x0007}, {0x00, 0x00c0, 0x0008}, {0x00, 0x004a, 0x0009}, {0x00, 0x0000, 0x000a}, {0x00, 0x0000, 0x000b}, {0x00, 0x0001, 0x000c}, {0x00, 0x0001, 0x000d}, {0x00, 0x0000, 0x000e}, {0x00, 0x0002, 0x000f}, {0x00, 0x0001, 0x0010}, {0x00, 0x0000, 0x0011}, {0x00, 0x0000, 0x0012}, {0x00, 0x0002, 0x0020}, {0x00, 0x0080, 0x0021}, {0x00, 0x0001, 0x0022}, {0x00, 0x00e0, 0x0023}, {0x00, 0x0000, 0x0024}, {0x00, 0x00d5, 0x0025}, {0x00, 0x0000, 0x0026}, {0x00, 0x000b, 0x0027}, {0x00, 0x0000, 0x0046}, {0x00, 0x0000, 0x0047}, {0x00, 0x0000, 0x0048}, {0x00, 0x0000, 0x0049}, {0x00, 0x0008, 0x004a}, {0xff, 0x0000, 0x00d0}, {0xff, 0x00d8, 0x00d1}, {0xff, 0x0000, 0x00d4}, {0xff, 0x0000, 0x00d5}, {0x01, 0x00a6, 0x0000}, {0x01, 0x0028, 0x0001}, {0x01, 0x0000, 0x0002}, {0x01, 0x000a, 0x0003}, {0x01, 0x0040, 0x0004}, {0x01, 0x0066, 0x0007}, {0x01, 0x0011, 0x0008}, {0x01, 0x0032, 0x0009}, {0x01, 0x00fd, 0x000a}, {0x01, 0x0038, 0x000b}, {0x01, 0x00d1, 0x000c}, {0x01, 0x00f7, 0x000d}, {0x01, 0x00ed, 0x000e}, {0x01, 0x00d8, 0x000f}, {0x01, 0x0038, 0x0010}, {0x01, 0x00ff, 0x0015}, {0x01, 0x0001, 0x0016}, {0x01, 0x0032, 0x0017}, {0x01, 0x0023, 0x0018}, {0x01, 0x00ce, 0x0019}, {0x01, 0x0023, 0x001a}, {0x01, 0x0032, 0x001b}, {0x01, 0x008d, 0x001c}, {0x01, 0x00ce, 0x001d}, {0x01, 0x008d, 0x001e}, {0x01, 0x0000, 0x001f}, {0x01, 0x0000, 0x0020}, {0x01, 0x00ff, 0x003e}, {0x01, 0x0003, 0x003f}, {0x01, 0x0000, 0x0040}, {0x01, 0x0035, 0x0041}, {0x01, 0x0053, 0x0042}, {0x01, 0x0069, 0x0043}, {0x01, 0x007c, 0x0044}, {0x01, 0x008c, 0x0045}, {0x01, 0x009a, 0x0046}, {0x01, 0x00a8, 0x0047}, {0x01, 0x00b4, 0x0048}, {0x01, 0x00bf, 0x0049}, {0x01, 0x00ca, 0x004a}, {0x01, 0x00d4, 0x004b}, {0x01, 0x00dd, 0x004c}, {0x01, 0x00e7, 0x004d}, {0x01, 0x00ef, 0x004e}, {0x01, 0x00f8, 0x004f}, {0x01, 0x00ff, 0x0050}, {0x01, 0x0001, 0x0056}, {0x01, 0x0060, 0x0057}, {0x01, 0x0040, 0x0058}, {0x01, 0x0011, 0x0059}, {0x01, 0x0001, 0x005a}, {0x02, 0x0007, 0x0005}, {0x02, 0xa048, 0x0000}, {0x02, 0x0007, 0x0005}, {0x02, 0x0015, 0x0006}, {0x02, 0x100a, 0x0007}, {0x02, 0xa048, 0x0000}, {0x02, 0xc002, 0x0001}, {0x02, 0x000f, 0x0005}, {0x02, 0xa048, 0x0000}, {0x05, 0x0022, 0x0004}, {0x05, 0x0025, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0026, 0x0001}, {0x05, 0x0001, 0x0000}, {0x05, 0x0027, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0001, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0021, 0x0001}, {0x05, 0x00d2, 0x0000}, {0x05, 0x0020, 0x0001}, {0x05, 0x0000, 0x0000}, {0x00, 0x0090, 0x0005}, {0x01, 0x00a6, 0x0000}, {0x02, 0x0007, 0x0005}, {0x02, 0x2000, 0x0000}, {0x05, 0x0022, 0x0004}, {0x05, 0x0015, 0x0001}, {0x05, 0x00ea, 0x0000}, {0x05, 0x0021, 0x0001}, {0x05, 0x00d2, 0x0000}, {0x05, 0x0023, 0x0001}, {0x05, 0x0003, 0x0000}, {0x05, 0x0030, 0x0001}, {0x05, 0x002b, 0x0000}, {0x05, 0x0031, 0x0001}, {0x05, 0x0023, 0x0000}, {0x05, 0x0032, 0x0001}, {0x05, 0x0023, 0x0000}, {0x05, 0x0033, 0x0001}, {0x05, 0x0023, 0x0000}, {0x05, 0x0034, 0x0001}, {0x05, 0x0002, 0x0000}, {0x05, 0x0050, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0051, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0052, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0054, 0x0001}, {0x05, 0x0001, 0x0000}, {0x00, 0x0000, 0x0001}, {0x00, 0x0000, 0x0002}, {0x00, 0x000c, 0x0003}, {0x00, 0x0000, 0x0004}, {0x00, 0x0090, 0x0005}, {0x00, 0x0000, 0x0006}, {0x00, 0x0040, 0x0007}, {0x00, 0x00c0, 0x0008}, {0x00, 0x004a, 0x0009}, {0x00, 0x0000, 0x000a}, {0x00, 0x0000, 0x000b}, {0x00, 0x0001, 0x000c}, {0x00, 0x0001, 0x000d}, {0x00, 0x0000, 0x000e}, {0x00, 0x0002, 0x000f}, {0x00, 0x0001, 0x0010}, {0x00, 0x0000, 0x0011}, {0x00, 0x0000, 0x0012}, {0x00, 0x0002, 0x0020}, {0x00, 0x0080, 0x0021}, {0x00, 0x0001, 0x0022}, {0x00, 0x00e0, 0x0023}, {0x00, 0x0000, 0x0024}, {0x00, 0x00d5, 0x0025}, {0x00, 0x0000, 0x0026}, {0x00, 0x000b, 0x0027}, {0x00, 0x0000, 0x0046}, {0x00, 0x0000, 0x0047}, {0x00, 0x0000, 0x0048}, {0x00, 0x0000, 0x0049}, {0x00, 0x0008, 0x004a}, {0xff, 0x0000, 0x00d0}, {0xff, 0x00d8, 0x00d1}, {0xff, 0x0000, 0x00d4}, {0xff, 0x0000, 0x00d5}, {0x01, 0x00a6, 0x0000}, {0x01, 0x0028, 0x0001}, {0x01, 0x0000, 0x0002}, {0x01, 0x000a, 0x0003}, {0x01, 0x0040, 0x0004}, {0x01, 0x0066, 0x0007}, {0x01, 0x0011, 0x0008}, {0x01, 0x0032, 0x0009}, {0x01, 0x00fd, 0x000a}, {0x01, 0x0038, 0x000b}, {0x01, 0x00d1, 0x000c}, {0x01, 0x00f7, 0x000d}, {0x01, 0x00ed, 0x000e}, {0x01, 0x00d8, 0x000f}, {0x01, 0x0038, 0x0010}, {0x01, 0x00ff, 0x0015}, {0x01, 0x0001, 0x0016}, {0x01, 0x0032, 0x0017}, {0x01, 0x0023, 0x0018}, {0x01, 0x00ce, 0x0019}, {0x01, 0x0023, 0x001a}, {0x01, 0x0032, 0x001b}, {0x01, 0x008d, 0x001c}, {0x01, 0x00ce, 0x001d}, {0x01, 0x008d, 0x001e}, {0x01, 0x0000, 0x001f}, {0x01, 0x0000, 0x0020}, {0x01, 0x00ff, 0x003e}, {0x01, 0x0003, 0x003f}, {0x01, 0x0000, 0x0040}, {0x01, 0x0035, 0x0041}, {0x01, 0x0053, 0x0042}, {0x01, 0x0069, 0x0043}, {0x01, 0x007c, 0x0044}, {0x01, 0x008c, 0x0045}, {0x01, 0x009a, 0x0046}, {0x01, 0x00a8, 0x0047}, {0x01, 0x00b4, 0x0048}, {0x01, 0x00bf, 0x0049}, {0x01, 0x00ca, 0x004a}, {0x01, 0x00d4, 0x004b}, {0x01, 0x00dd, 0x004c}, {0x01, 0x00e7, 0x004d}, {0x01, 0x00ef, 0x004e}, {0x01, 0x00f8, 0x004f}, {0x01, 0x00ff, 0x0050}, {0x01, 0x0001, 0x0056}, {0x01, 0x0060, 0x0057}, {0x01, 0x0040, 0x0058}, {0x01, 0x0011, 0x0059}, {0x01, 0x0001, 0x005a}, {0x02, 0x0007, 0x0005}, {0x02, 0xa048, 0x0000}, {0x02, 0x0007, 0x0005}, {0x02, 0x0015, 0x0006}, {0x02, 0x100a, 0x0007}, {0x02, 0xa048, 0x0000}, {0x02, 0xc002, 0x0001}, {0x02, 0x000f, 0x0005}, {0x02, 0xa048, 0x0000}, {0x05, 0x0022, 0x0004}, {0x05, 0x0025, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0026, 0x0001}, {0x05, 0x0001, 0x0000}, {0x05, 0x0027, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0001, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0021, 0x0001}, {0x05, 0x00d2, 0x0000}, {0x05, 0x0020, 0x0001}, {0x05, 0x0000, 0x0000}, {0x00, 0x0090, 0x0005}, {0x01, 0x00a6, 0x0000}, {0x01, 0x0003, 0x003f}, {0x01, 0x0001, 0x0056}, {0x01, 0x0011, 0x0008}, {0x01, 0x0032, 0x0009}, {0x01, 0xfffd, 0x000a}, {0x01, 0x0023, 0x000b}, {0x01, 0xffea, 0x000c}, {0x01, 0xfff4, 0x000d}, {0x01, 0xfffc, 0x000e}, {0x01, 0xffe3, 0x000f}, {0x01, 0x001f, 0x0010}, {0x01, 0x00a8, 0x0001}, {0x01, 0x0067, 0x0007}, {0x01, 0x0032, 0x0017}, {0x01, 0x0023, 0x0018}, {0x01, 0x00ce, 0x0019}, {0x01, 0x0023, 0x001a}, {0x01, 0x0032, 0x001b}, {0x01, 0x008d, 0x001c}, {0x01, 0x00ce, 0x001d}, {0x01, 0x008d, 0x001e}, {0x01, 0x00c8, 0x0015}, {0x01, 0x0032, 0x0016}, {0x01, 0x0000, 0x0011}, {0x01, 0x0000, 0x0012}, {0x01, 0x0000, 0x0013}, {0x01, 0x000a, 0x0003}, {0x02, 0xc002, 0x0001}, {0x02, 0x0007, 0x0005}, {0x02, 0xc000, 0x0001}, {0x02, 0x0000, 0x0005}, {0x02, 0x0007, 0x0005}, {0x02, 0x2000, 0x0000}, {0x05, 0x0022, 0x0004}, {0x05, 0x0015, 0x0001}, {0x05, 0x00ea, 0x0000}, {0x05, 0x0021, 0x0001}, {0x05, 0x00d2, 0x0000}, {0x05, 0x0023, 0x0001}, {0x05, 0x0003, 0x0000}, {0x05, 0x0030, 0x0001}, {0x05, 0x002b, 0x0000}, {0x05, 0x0031, 0x0001}, {0x05, 0x0023, 0x0000}, {0x05, 0x0032, 0x0001}, {0x05, 0x0023, 0x0000}, {0x05, 0x0033, 0x0001}, {0x05, 0x0023, 0x0000}, {0x05, 0x0034, 0x0001}, {0x05, 0x0002, 0x0000}, {0x05, 0x0050, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0051, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0052, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0054, 0x0001}, {0x05, 0x0001, 0x0000}, {0x00, 0x0000, 0x0001}, {0x00, 0x0000, 0x0002}, {0x00, 0x000c, 0x0003}, {0x00, 0x0000, 0x0004}, {0x00, 0x0090, 0x0005}, {0x00, 0x0000, 0x0006}, {0x00, 0x0040, 0x0007}, {0x00, 0x00c0, 0x0008}, {0x00, 0x004a, 0x0009}, {0x00, 0x0000, 0x000a}, {0x00, 0x0000, 0x000b}, {0x00, 0x0001, 0x000c}, {0x00, 0x0001, 0x000d}, {0x00, 0x0000, 0x000e}, {0x00, 0x0002, 0x000f}, {0x00, 0x0001, 0x0010}, {0x00, 0x0000, 0x0011}, {0x00, 0x0000, 0x0012}, {0x00, 0x0002, 0x0020}, {0x00, 0x0080, 0x0021}, {0x00, 0x0001, 0x0022}, {0x00, 0x00e0, 0x0023}, {0x00, 0x0000, 0x0024}, {0x00, 0x00d5, 0x0025}, {0x00, 0x0000, 0x0026}, {0x00, 0x000b, 0x0027}, {0x00, 0x0000, 0x0046}, {0x00, 0x0000, 0x0047}, {0x00, 0x0000, 0x0048}, {0x00, 0x0000, 0x0049}, {0x00, 0x0008, 0x004a}, {0xff, 0x0000, 0x00d0}, {0xff, 0x00d8, 0x00d1}, {0xff, 0x0000, 0x00d4}, {0xff, 0x0000, 0x00d5}, {0x01, 0x00a6, 0x0000}, {0x01, 0x0028, 0x0001}, {0x01, 0x0000, 0x0002}, {0x01, 0x000a, 0x0003}, {0x01, 0x0040, 0x0004}, {0x01, 0x0066, 0x0007}, {0x01, 0x0011, 0x0008}, {0x01, 0x0032, 0x0009}, {0x01, 0x00fd, 0x000a}, {0x01, 0x0038, 0x000b}, {0x01, 0x00d1, 0x000c}, {0x01, 0x00f7, 0x000d}, {0x01, 0x00ed, 0x000e}, {0x01, 0x00d8, 0x000f}, {0x01, 0x0038, 0x0010}, {0x01, 0x00ff, 0x0015}, {0x01, 0x0001, 0x0016}, {0x01, 0x0032, 0x0017}, {0x01, 0x0023, 0x0018}, {0x01, 0x00ce, 0x0019}, {0x01, 0x0023, 0x001a}, {0x01, 0x0032, 0x001b}, {0x01, 0x008d, 0x001c}, {0x01, 0x00ce, 0x001d}, {0x01, 0x008d, 0x001e}, {0x01, 0x0000, 0x001f}, {0x01, 0x0000, 0x0020}, {0x01, 0x00ff, 0x003e}, {0x01, 0x0003, 0x003f}, {0x01, 0x0000, 0x0040}, {0x01, 0x0035, 0x0041}, {0x01, 0x0053, 0x0042}, {0x01, 0x0069, 0x0043}, {0x01, 0x007c, 0x0044}, {0x01, 0x008c, 0x0045}, {0x01, 0x009a, 0x0046}, {0x01, 0x00a8, 0x0047}, {0x01, 0x00b4, 0x0048}, {0x01, 0x00bf, 0x0049}, {0x01, 0x00ca, 0x004a}, {0x01, 0x00d4, 0x004b}, {0x01, 0x00dd, 0x004c}, {0x01, 0x00e7, 0x004d}, {0x01, 0x00ef, 0x004e}, {0x01, 0x00f8, 0x004f}, {0x01, 0x00ff, 0x0050}, {0x01, 0x0001, 0x0056}, {0x01, 0x0060, 0x0057}, {0x01, 0x0040, 0x0058}, {0x01, 0x0011, 0x0059}, {0x01, 0x0001, 0x005a}, {0x02, 0x0007, 0x0005}, {0x02, 0xa048, 0x0000}, {0x02, 0x0007, 0x0005}, {0x02, 0x0015, 0x0006}, {0x02, 0x100a, 0x0007}, {0x02, 0xa048, 0x0000}, {0x02, 0xc002, 0x0001}, {0x02, 0x000f, 0x0005}, {0x02, 0xa048, 0x0000}, {0x05, 0x0022, 0x0004}, {0x05, 0x0025, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0026, 0x0001}, {0x05, 0x0001, 0x0000}, {0x05, 0x0027, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0001, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0021, 0x0001}, {0x05, 0x00d2, 0x0000}, {0x05, 0x0020, 0x0001}, {0x05, 0x0000, 0x0000}, {0x00, 0x0090, 0x0005}, {0x01, 0x00a6, 0x0000}, {0x02, 0x0007, 0x0005}, {0x02, 0x2000, 0x0000}, {0x05, 0x0022, 0x0004}, {0x05, 0x0015, 0x0001}, {0x05, 0x00ea, 0x0000}, {0x05, 0x0021, 0x0001}, {0x05, 0x00d2, 0x0000}, {0x05, 0x0023, 0x0001}, {0x05, 0x0003, 0x0000}, {0x05, 0x0030, 0x0001}, {0x05, 0x002b, 0x0000}, {0x05, 0x0031, 0x0001}, {0x05, 0x0023, 0x0000}, {0x05, 0x0032, 0x0001}, {0x05, 0x0023, 0x0000}, {0x05, 0x0033, 0x0001}, {0x05, 0x0023, 0x0000}, {0x05, 0x0034, 0x0001}, {0x05, 0x0002, 0x0000}, {0x05, 0x0050, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0051, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0052, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0054, 0x0001}, {0x05, 0x0001, 0x0000}, {0x00, 0x0000, 0x0001}, {0x00, 0x0000, 0x0002}, {0x00, 0x000c, 0x0003}, {0x00, 0x0000, 0x0004}, {0x00, 0x0090, 0x0005}, {0x00, 0x0000, 0x0006}, {0x00, 0x0040, 0x0007}, {0x00, 0x00c0, 0x0008}, {0x00, 0x004a, 0x0009}, {0x00, 0x0000, 0x000a}, {0x00, 0x0000, 0x000b}, {0x00, 0x0001, 0x000c}, {0x00, 0x0001, 0x000d}, {0x00, 0x0000, 0x000e}, {0x00, 0x0002, 0x000f}, {0x00, 0x0001, 0x0010}, {0x00, 0x0000, 0x0011}, {0x00, 0x0000, 0x0012}, {0x00, 0x0002, 0x0020}, {0x00, 0x0080, 0x0021}, {0x00, 0x0001, 0x0022}, {0x00, 0x00e0, 0x0023}, {0x00, 0x0000, 0x0024}, {0x00, 0x00d5, 0x0025}, {0x00, 0x0000, 0x0026}, {0x00, 0x000b, 0x0027}, {0x00, 0x0000, 0x0046}, {0x00, 0x0000, 0x0047}, {0x00, 0x0000, 0x0048}, {0x00, 0x0000, 0x0049}, {0x00, 0x0008, 0x004a}, {0xff, 0x0000, 0x00d0}, {0xff, 0x00d8, 0x00d1}, {0xff, 0x0000, 0x00d4}, {0xff, 0x0000, 0x00d5}, {0x01, 0x00a6, 0x0000}, {0x01, 0x0028, 0x0001}, {0x01, 0x0000, 0x0002}, {0x01, 0x000a, 0x0003}, {0x01, 0x0040, 0x0004}, {0x01, 0x0066, 0x0007}, {0x01, 0x0011, 0x0008}, {0x01, 0x0032, 0x0009}, {0x01, 0x00fd, 0x000a}, {0x01, 0x0038, 0x000b}, {0x01, 0x00d1, 0x000c}, {0x01, 0x00f7, 0x000d}, {0x01, 0x00ed, 0x000e}, {0x01, 0x00d8, 0x000f}, {0x01, 0x0038, 0x0010}, {0x01, 0x00ff, 0x0015}, {0x01, 0x0001, 0x0016}, {0x01, 0x0032, 0x0017}, {0x01, 0x0023, 0x0018}, {0x01, 0x00ce, 0x0019}, {0x01, 0x0023, 0x001a}, {0x01, 0x0032, 0x001b}, {0x01, 0x008d, 0x001c}, {0x01, 0x00ce, 0x001d}, {0x01, 0x008d, 0x001e}, {0x01, 0x0000, 0x001f}, {0x01, 0x0000, 0x0020}, {0x01, 0x00ff, 0x003e}, {0x01, 0x0003, 0x003f}, {0x01, 0x0000, 0x0040}, {0x01, 0x0035, 0x0041}, {0x01, 0x0053, 0x0042}, {0x01, 0x0069, 0x0043}, {0x01, 0x007c, 0x0044}, {0x01, 0x008c, 0x0045}, {0x01, 0x009a, 0x0046}, {0x01, 0x00a8, 0x0047}, {0x01, 0x00b4, 0x0048}, {0x01, 0x00bf, 0x0049}, {0x01, 0x00ca, 0x004a}, {0x01, 0x00d4, 0x004b}, {0x01, 0x00dd, 0x004c}, {0x01, 0x00e7, 0x004d}, {0x01, 0x00ef, 0x004e}, {0x01, 0x00f8, 0x004f}, {0x01, 0x00ff, 0x0050}, {0x01, 0x0001, 0x0056}, {0x01, 0x0060, 0x0057}, {0x01, 0x0040, 0x0058}, {0x01, 0x0011, 0x0059}, {0x01, 0x0001, 0x005a}, {0x02, 0x0007, 0x0005}, {0x02, 0xa048, 0x0000}, {0x02, 0x0007, 0x0005}, {0x02, 0x0015, 0x0006}, {0x02, 0x100a, 0x0007}, {0x02, 0xa048, 0x0000}, {0x02, 0xc002, 0x0001}, {0x02, 0x000f, 0x0005}, {0x02, 0xa048, 0x0000}, {0x05, 0x0022, 0x0004}, {0x05, 0x0025, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0026, 0x0001}, {0x05, 0x0001, 0x0000}, {0x05, 0x0027, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0001, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0021, 0x0001}, {0x05, 0x00d2, 0x0000}, {0x05, 0x0020, 0x0001}, {0x05, 0x0000, 0x0000}, {0x00, 0x0090, 0x0005}, {0x01, 0x00a6, 0x0000}, {0x05, 0x0026, 0x0001}, {0x05, 0x0001, 0x0000}, {0x05, 0x0027, 0x0001}, {0x05, 0x000f, 0x0000}, {0x01, 0x0003, 0x003f}, {0x01, 0x0001, 0x0056}, {0x01, 0x0011, 0x0008}, {0x01, 0x0032, 0x0009}, {0x01, 0xfffd, 0x000a}, {0x01, 0x0023, 0x000b}, {0x01, 0xffea, 0x000c}, {0x01, 0xfff4, 0x000d}, {0x01, 0xfffc, 0x000e}, {0x01, 0xffe3, 0x000f}, {0x01, 0x001f, 0x0010}, {0x01, 0x00a8, 0x0001}, {0x01, 0x0067, 0x0007}, {0x01, 0x0042, 0x0051}, {0x01, 0x0051, 0x0053}, {0x01, 0x000a, 0x0003}, {0x02, 0xc002, 0x0001}, {0x02, 0x0007, 0x0005}, {0x02, 0xc000, 0x0001}, {0x02, 0x0000, 0x0005}, {0x02, 0x0007, 0x0005}, {0x02, 0x2000, 0x0000}, {0x05, 0x0022, 0x0004}, {0x05, 0x0015, 0x0001}, {0x05, 0x00ea, 0x0000}, {0x05, 0x0021, 0x0001}, {0x05, 0x00d2, 0x0000}, {0x05, 0x0023, 0x0001}, {0x05, 0x0003, 0x0000}, {0x05, 0x0030, 0x0001}, {0x05, 0x002b, 0x0000}, {0x05, 0x0031, 0x0001}, {0x05, 0x0023, 0x0000}, {0x05, 0x0032, 0x0001}, {0x05, 0x0023, 0x0000}, {0x05, 0x0033, 0x0001}, {0x05, 0x0023, 0x0000}, {0x05, 0x0034, 0x0001}, {0x05, 0x0002, 0x0000}, {0x05, 0x0050, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0051, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0052, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0054, 0x0001}, {0x05, 0x0001, 0x0000}, {0x00, 0x0000, 0x0001}, {0x00, 0x0000, 0x0002}, {0x00, 0x000c, 0x0003}, {0x00, 0x0000, 0x0004}, {0x00, 0x0090, 0x0005}, {0x00, 0x0000, 0x0006}, {0x00, 0x0040, 0x0007}, {0x00, 0x00c0, 0x0008}, {0x00, 0x004a, 0x0009}, {0x00, 0x0000, 0x000a}, {0x00, 0x0000, 0x000b}, {0x00, 0x0001, 0x000c}, {0x00, 0x0001, 0x000d}, {0x00, 0x0000, 0x000e}, {0x00, 0x0002, 0x000f}, {0x00, 0x0001, 0x0010}, {0x00, 0x0000, 0x0011}, {0x00, 0x0000, 0x0012}, {0x00, 0x0002, 0x0020}, {0x00, 0x0080, 0x0021}, {0x00, 0x0001, 0x0022}, {0x00, 0x00e0, 0x0023}, {0x00, 0x0000, 0x0024}, {0x00, 0x00d5, 0x0025}, {0x00, 0x0000, 0x0026}, {0x00, 0x000b, 0x0027}, {0x00, 0x0000, 0x0046}, {0x00, 0x0000, 0x0047}, {0x00, 0x0000, 0x0048}, {0x00, 0x0000, 0x0049}, {0x00, 0x0008, 0x004a}, {0xff, 0x0000, 0x00d0}, {0xff, 0x00d8, 0x00d1}, {0xff, 0x0000, 0x00d4}, {0xff, 0x0000, 0x00d5}, {0x01, 0x00a6, 0x0000}, {0x01, 0x0028, 0x0001}, {0x01, 0x0000, 0x0002}, {0x01, 0x000a, 0x0003}, {0x01, 0x0040, 0x0004}, {0x01, 0x0066, 0x0007}, {0x01, 0x0011, 0x0008}, {0x01, 0x0032, 0x0009}, {0x01, 0x00fd, 0x000a}, {0x01, 0x0038, 0x000b}, {0x01, 0x00d1, 0x000c}, {0x01, 0x00f7, 0x000d}, {0x01, 0x00ed, 0x000e}, {0x01, 0x00d8, 0x000f}, {0x01, 0x0038, 0x0010}, {0x01, 0x00ff, 0x0015}, {0x01, 0x0001, 0x0016}, {0x01, 0x0032, 0x0017}, {0x01, 0x0023, 0x0018}, {0x01, 0x00ce, 0x0019}, {0x01, 0x0023, 0x001a}, {0x01, 0x0032, 0x001b}, {0x01, 0x008d, 0x001c}, {0x01, 0x00ce, 0x001d}, {0x01, 0x008d, 0x001e}, {0x01, 0x0000, 0x001f}, {0x01, 0x0000, 0x0020}, {0x01, 0x00ff, 0x003e}, {0x01, 0x0003, 0x003f}, {0x01, 0x0000, 0x0040}, {0x01, 0x0035, 0x0041}, {0x01, 0x0053, 0x0042}, {0x01, 0x0069, 0x0043}, {0x01, 0x007c, 0x0044}, {0x01, 0x008c, 0x0045}, {0x01, 0x009a, 0x0046}, {0x01, 0x00a8, 0x0047}, {0x01, 0x00b4, 0x0048}, {0x01, 0x00bf, 0x0049}, {0x01, 0x00ca, 0x004a}, {0x01, 0x00d4, 0x004b}, {0x01, 0x00dd, 0x004c}, {0x01, 0x00e7, 0x004d}, {0x01, 0x00ef, 0x004e}, {0x01, 0x00f8, 0x004f}, {0x01, 0x00ff, 0x0050}, {0x01, 0x0001, 0x0056}, {0x01, 0x0060, 0x0057}, {0x01, 0x0040, 0x0058}, {0x01, 0x0011, 0x0059}, {0x01, 0x0001, 0x005a}, {0x02, 0x0007, 0x0005}, {0x02, 0xa048, 0x0000}, {0x02, 0x0007, 0x0005}, {0x02, 0x0015, 0x0006}, {0x02, 0x100a, 0x0007}, {0x02, 0xa048, 0x0000}, {0x02, 0xc002, 0x0001}, {0x02, 0x000f, 0x0005}, {0x02, 0xa048, 0x0000}, {0x05, 0x0022, 0x0004}, {0x05, 0x0025, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0026, 0x0001}, {0x05, 0x0001, 0x0000}, {0x05, 0x0027, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0001, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0021, 0x0001}, {0x05, 0x00d2, 0x0000}, {0x05, 0x0020, 0x0001}, {0x05, 0x0000, 0x0000}, {0x00, 0x0090, 0x0005}, {0x01, 0x00a6, 0x0000}, {0x02, 0x0007, 0x0005}, {0x02, 0x2000, 0x0000}, {0x05, 0x0022, 0x0004}, {0x05, 0x0015, 0x0001}, {0x05, 0x00ea, 0x0000}, {0x05, 0x0021, 0x0001}, {0x05, 0x00d2, 0x0000}, {0x05, 0x0023, 0x0001}, {0x05, 0x0003, 0x0000}, {0x05, 0x0030, 0x0001}, {0x05, 0x002b, 0x0000}, {0x05, 0x0031, 0x0001}, {0x05, 0x0023, 0x0000}, {0x05, 0x0032, 0x0001}, {0x05, 0x0023, 0x0000}, {0x05, 0x0033, 0x0001}, {0x05, 0x0023, 0x0000}, {0x05, 0x0034, 0x0001}, {0x05, 0x0002, 0x0000}, {0x05, 0x0050, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0051, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0052, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0054, 0x0001}, {0x05, 0x0001, 0x0000}, {0x00, 0x0000, 0x0001}, {0x00, 0x0000, 0x0002}, {0x00, 0x000c, 0x0003}, {0x00, 0x0000, 0x0004}, {0x00, 0x0090, 0x0005}, {0x00, 0x0000, 0x0006}, {0x00, 0x0040, 0x0007}, {0x00, 0x00c0, 0x0008}, {0x00, 0x004a, 0x0009}, {0x00, 0x0000, 0x000a}, {0x00, 0x0000, 0x000b}, {0x00, 0x0001, 0x000c}, {0x00, 0x0001, 0x000d}, {0x00, 0x0000, 0x000e}, {0x00, 0x0002, 0x000f}, {0x00, 0x0001, 0x0010}, {0x00, 0x0000, 0x0011}, {0x00, 0x0000, 0x0012}, {0x00, 0x0002, 0x0020}, {0x00, 0x0080, 0x0021}, {0x00, 0x0001, 0x0022}, {0x00, 0x00e0, 0x0023}, {0x00, 0x0000, 0x0024}, {0x00, 0x00d5, 0x0025}, {0x00, 0x0000, 0x0026}, {0x00, 0x000b, 0x0027}, {0x00, 0x0000, 0x0046}, {0x00, 0x0000, 0x0047}, {0x00, 0x0000, 0x0048}, {0x00, 0x0000, 0x0049}, {0x00, 0x0008, 0x004a}, {0xff, 0x0000, 0x00d0}, {0xff, 0x00d8, 0x00d1}, {0xff, 0x0000, 0x00d4}, {0xff, 0x0000, 0x00d5}, {0x01, 0x00a6, 0x0000}, {0x01, 0x0028, 0x0001}, {0x01, 0x0000, 0x0002}, {0x01, 0x000a, 0x0003}, {0x01, 0x0040, 0x0004}, {0x01, 0x0066, 0x0007}, {0x01, 0x0011, 0x0008}, {0x01, 0x0032, 0x0009}, {0x01, 0x00fd, 0x000a}, {0x01, 0x0038, 0x000b}, {0x01, 0x00d1, 0x000c}, {0x01, 0x00f7, 0x000d}, {0x01, 0x00ed, 0x000e}, {0x01, 0x00d8, 0x000f}, {0x01, 0x0038, 0x0010}, {0x01, 0x00ff, 0x0015}, {0x01, 0x0001, 0x0016}, {0x01, 0x0032, 0x0017}, {0x01, 0x0023, 0x0018}, {0x01, 0x00ce, 0x0019}, {0x01, 0x0023, 0x001a}, {0x01, 0x0032, 0x001b}, {0x01, 0x008d, 0x001c}, {0x01, 0x00ce, 0x001d}, {0x01, 0x008d, 0x001e}, {0x01, 0x0000, 0x001f}, {0x01, 0x0000, 0x0020}, {0x01, 0x00ff, 0x003e}, {0x01, 0x0003, 0x003f}, {0x01, 0x0000, 0x0040}, {0x01, 0x0035, 0x0041}, {0x01, 0x0053, 0x0042}, {0x01, 0x0069, 0x0043}, {0x01, 0x007c, 0x0044}, {0x01, 0x008c, 0x0045}, {0x01, 0x009a, 0x0046}, {0x01, 0x00a8, 0x0047}, {0x01, 0x00b4, 0x0048}, {0x01, 0x00bf, 0x0049}, {0x01, 0x00ca, 0x004a}, {0x01, 0x00d4, 0x004b}, {0x01, 0x00dd, 0x004c}, {0x01, 0x00e7, 0x004d}, {0x01, 0x00ef, 0x004e}, {0x01, 0x00f8, 0x004f}, {0x01, 0x00ff, 0x0050}, {0x01, 0x0001, 0x0056}, {0x01, 0x0060, 0x0057}, {0x01, 0x0040, 0x0058}, {0x01, 0x0011, 0x0059}, {0x01, 0x0001, 0x005a}, {0x02, 0x0007, 0x0005}, {0x02, 0xa048, 0x0000}, {0x02, 0x0007, 0x0005}, {0x02, 0x0015, 0x0006}, {0x02, 0x100a, 0x0007}, {0x02, 0xa048, 0x0000}, {0x02, 0xc002, 0x0001}, {0x02, 0x000f, 0x0005}, {0x02, 0xa048, 0x0000}, {0x05, 0x0022, 0x0004}, {0x05, 0x0025, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0026, 0x0001}, {0x05, 0x0001, 0x0000}, {0x05, 0x0027, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0001, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0021, 0x0001}, {0x05, 0x00d2, 0x0000}, {0x05, 0x0020, 0x0001}, {0x05, 0x0000, 0x0000}, {0x00, 0x0090, 0x0005}, {0x01, 0x00a6, 0x0000}, {0x05, 0x0026, 0x0001}, {0x05, 0x0001, 0x0000}, {0x05, 0x0027, 0x0001}, {0x05, 0x001e, 0x0000}, {0x01, 0x0003, 0x003f}, {0x01, 0x0001, 0x0056}, {0x01, 0x0011, 0x0008}, {0x01, 0x0032, 0x0009}, {0x01, 0xfffd, 0x000a}, {0x01, 0x0023, 0x000b}, {0x01, 0xffea, 0x000c}, {0x01, 0xfff4, 0x000d}, {0x01, 0xfffc, 0x000e}, {0x01, 0xffe3, 0x000f}, {0x01, 0x001f, 0x0010}, {0x01, 0x00a8, 0x0001}, {0x01, 0x0067, 0x0007}, {0x01, 0x0042, 0x0051}, {0x01, 0x0051, 0x0053}, {0x01, 0x000a, 0x0003}, {0x02, 0xc002, 0x0001}, {0x02, 0x0007, 0x0005}, {0x01, 0x0042, 0x0051}, {0x01, 0x0051, 0x0053}, {0x05, 0x0026, 0x0001}, {0x05, 0x0001, 0x0000}, {0x05, 0x0027, 0x0001}, {0x05, 0x002d, 0x0000}, {0x01, 0x0003, 0x003f}, {0x01, 0x0001, 0x0056}, {0x02, 0xc000, 0x0001}, {0x02, 0x0000, 0x0005}, {} }; /* Unknown camera from Ori Usbid 0x0000:0x0000 */ /* Based on snoops from Ori Cohen */ static const __u16 spca501c_mysterious_open_data[][3] = { {0x02, 0x000f, 0x0005}, {0x02, 0xa048, 0x0000}, {0x05, 0x0022, 0x0004}, /* DSP Registers */ {0x01, 0x0016, 0x0011}, /* RGB offset */ {0x01, 0x0000, 0x0012}, {0x01, 0x0006, 0x0013}, {0x01, 0x0078, 0x0051}, {0x01, 0x0040, 0x0052}, {0x01, 0x0046, 0x0053}, {0x01, 0x0040, 0x0054}, {0x00, 0x0025, 0x0000}, /* {0x00, 0x0000, 0x0000 }, */ /* Part 2 */ /* TG Registers */ {0x00, 0x0026, 0x0000}, {0x00, 0x0001, 0x0000}, {0x00, 0x0027, 0x0000}, {0x00, 0x008a, 0x0000}, {0x02, 0x0007, 0x0005}, {0x02, 0x2000, 0x0000}, {0x05, 0x0022, 0x0004}, {0x05, 0x0015, 0x0001}, {0x05, 0x00ea, 0x0000}, {0x05, 0x0021, 0x0001}, {0x05, 0x00d2, 0x0000}, {0x05, 0x0023, 0x0001}, {0x05, 0x0003, 0x0000}, {0x05, 0x0030, 0x0001}, {0x05, 0x002b, 0x0000}, {0x05, 0x0031, 0x0001}, {0x05, 0x0023, 0x0000}, {0x05, 0x0032, 0x0001}, {0x05, 0x0023, 0x0000}, {0x05, 0x0033, 0x0001}, {0x05, 0x0023, 0x0000}, {0x05, 0x0034, 0x0001}, {0x05, 0x0002, 0x0000}, {0x05, 0x0050, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0051, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0052, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0054, 0x0001}, {0x05, 0x0001, 0x0000}, {} }; /* Based on snoops from Ori Cohen */ static const __u16 spca501c_mysterious_init_data[][3] = { /* Part 3 */ /* TG registers */ /* {0x00, 0x0000, 0x0000}, */ {0x00, 0x0000, 0x0001}, {0x00, 0x0000, 0x0002}, {0x00, 0x0006, 0x0003}, {0x00, 0x0000, 0x0004}, {0x00, 0x0090, 0x0005}, {0x00, 0x0000, 0x0006}, {0x00, 0x0040, 0x0007}, {0x00, 0x00c0, 0x0008}, {0x00, 0x004a, 0x0009}, {0x00, 0x0000, 0x000a}, {0x00, 0x0000, 0x000b}, {0x00, 0x0001, 0x000c}, {0x00, 0x0001, 0x000d}, {0x00, 0x0000, 0x000e}, {0x00, 0x0002, 0x000f}, {0x00, 0x0001, 0x0010}, {0x00, 0x0000, 0x0011}, {0x00, 0x0001, 0x0012}, {0x00, 0x0002, 0x0020}, {0x00, 0x0080, 0x0021}, /* 640 */ {0x00, 0x0001, 0x0022}, {0x00, 0x00e0, 0x0023}, /* 480 */ {0x00, 0x0000, 0x0024}, /* Offset H hight */ {0x00, 0x00d3, 0x0025}, /* low */ {0x00, 0x0000, 0x0026}, /* Offset V */ {0x00, 0x000d, 0x0027}, /* low */ {0x00, 0x0000, 0x0046}, {0x00, 0x0000, 0x0047}, {0x00, 0x0000, 0x0048}, {0x00, 0x0000, 0x0049}, {0x00, 0x0008, 0x004a}, /* DSP Registers */ {0x01, 0x00a6, 0x0000}, {0x01, 0x0028, 0x0001}, {0x01, 0x0000, 0x0002}, {0x01, 0x000a, 0x0003}, /* Level Calc bit7 ->1 Auto */ {0x01, 0x0040, 0x0004}, {0x01, 0x0066, 0x0007}, {0x01, 0x000f, 0x0008}, /* A11 Color correction coeff */ {0x01, 0x002d, 0x0009}, /* A12 */ {0x01, 0x0005, 0x000a}, /* A13 */ {0x01, 0x0023, 0x000b}, /* A21 */ {0x01, 0x00e0, 0x000c}, /* A22 */ {0x01, 0x00fd, 0x000d}, /* A23 */ {0x01, 0x00f4, 0x000e}, /* A31 */ {0x01, 0x00e4, 0x000f}, /* A32 */ {0x01, 0x0028, 0x0010}, /* A33 */ {0x01, 0x00ff, 0x0015}, /* Reserved */ {0x01, 0x0001, 0x0016}, /* Reserved */ {0x01, 0x0032, 0x0017}, /* Win1 Start begin */ {0x01, 0x0023, 0x0018}, {0x01, 0x00ce, 0x0019}, {0x01, 0x0023, 0x001a}, {0x01, 0x0032, 0x001b}, {0x01, 0x008d, 0x001c}, {0x01, 0x00ce, 0x001d}, {0x01, 0x008d, 0x001e}, {0x01, 0x0000, 0x001f}, {0x01, 0x0000, 0x0020}, /* Win1 Start end */ {0x01, 0x00ff, 0x003e}, /* Reserved begin */ {0x01, 0x0002, 0x003f}, {0x01, 0x0000, 0x0040}, {0x01, 0x0035, 0x0041}, {0x01, 0x0053, 0x0042}, {0x01, 0x0069, 0x0043}, {0x01, 0x007c, 0x0044}, {0x01, 0x008c, 0x0045}, {0x01, 0x009a, 0x0046}, {0x01, 0x00a8, 0x0047}, {0x01, 0x00b4, 0x0048}, {0x01, 0x00bf, 0x0049}, {0x01, 0x00ca, 0x004a}, {0x01, 0x00d4, 0x004b}, {0x01, 0x00dd, 0x004c}, {0x01, 0x00e7, 0x004d}, {0x01, 0x00ef, 0x004e}, {0x01, 0x00f8, 0x004f}, {0x01, 0x00ff, 0x0050}, {0x01, 0x0003, 0x0056}, /* Reserved end */ {0x01, 0x0060, 0x0057}, /* Edge Gain */ {0x01, 0x0040, 0x0058}, {0x01, 0x0011, 0x0059}, /* Edge Bandwidth */ {0x01, 0x0001, 0x005a}, {0x02, 0x0007, 0x0005}, {0x02, 0xa048, 0x0000}, {0x02, 0x0007, 0x0005}, {0x02, 0x0015, 0x0006}, {0x02, 0x200a, 0x0007}, {0x02, 0xa048, 0x0000}, {0x02, 0xc000, 0x0001}, {0x02, 0x000f, 0x0005}, {0x02, 0xa048, 0x0000}, {0x05, 0x0022, 0x0004}, {0x05, 0x0025, 0x0001}, {0x05, 0x0000, 0x0000}, /* Part 4 */ {0x05, 0x0026, 0x0001}, {0x05, 0x0001, 0x0000}, {0x05, 0x0027, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0001, 0x0001}, {0x05, 0x0000, 0x0000}, {0x05, 0x0021, 0x0001}, {0x05, 0x00d2, 0x0000}, {0x05, 0x0020, 0x0001}, {0x05, 0x0000, 0x0000}, {0x00, 0x0090, 0x0005}, {0x01, 0x00a6, 0x0000}, {0x02, 0x0000, 0x0005}, {0x05, 0x0026, 0x0001}, {0x05, 0x0001, 0x0000}, {0x05, 0x0027, 0x0001}, {0x05, 0x004e, 0x0000}, /* Part 5 */ {0x01, 0x0003, 0x003f}, {0x01, 0x0001, 0x0056}, {0x01, 0x000f, 0x0008}, {0x01, 0x002d, 0x0009}, {0x01, 0x0005, 0x000a}, {0x01, 0x0023, 0x000b}, {0x01, 0xffe0, 0x000c}, {0x01, 0xfffd, 0x000d}, {0x01, 0xfff4, 0x000e}, {0x01, 0xffe4, 0x000f}, {0x01, 0x0028, 0x0010}, {0x01, 0x00a8, 0x0001}, {0x01, 0x0066, 0x0007}, {0x01, 0x0032, 0x0017}, {0x01, 0x0023, 0x0018}, {0x01, 0x00ce, 0x0019}, {0x01, 0x0023, 0x001a}, {0x01, 0x0032, 0x001b}, {0x01, 0x008d, 0x001c}, {0x01, 0x00ce, 0x001d}, {0x01, 0x008d, 0x001e}, {0x01, 0x00c8, 0x0015}, /* c8 Poids fort Luma */ {0x01, 0x0032, 0x0016}, /* 32 */ {0x01, 0x0016, 0x0011}, /* R 00 */ {0x01, 0x0016, 0x0012}, /* G 00 */ {0x01, 0x0016, 0x0013}, /* B 00 */ {0x01, 0x000a, 0x0003}, {0x02, 0xc002, 0x0001}, {0x02, 0x0007, 0x0005}, {} }; static int reg_write(struct gspca_dev *gspca_dev, __u16 req, __u16 index, __u16 value) { int ret; struct usb_device *dev = gspca_dev->dev; ret = usb_control_msg(dev, usb_sndctrlpipe(dev, 0), req, USB_TYPE_VENDOR | USB_RECIP_DEVICE, value, index, NULL, 0, 500); gspca_dbg(gspca_dev, D_USBO, "reg write: 0x%02x 0x%02x 0x%02x\n", req, index, value); if (ret < 0) pr_err("reg write: error %d\n", ret); return ret; } static int write_vector(struct gspca_dev *gspca_dev, const __u16 data[][3]) { int ret, i = 0; while (data[i][0] != 0 || data[i][1] != 0 || data[i][2] != 0) { ret = reg_write(gspca_dev, data[i][0], data[i][2], data[i][1]); if (ret < 0) { gspca_err(gspca_dev, "Reg write failed for 0x%02x,0x%02x,0x%02x\n", data[i][0], data[i][1], data[i][2]); return ret; } i++; } return 0; } static void setbrightness(struct gspca_dev *gspca_dev, s32 val) { reg_write(gspca_dev, SPCA501_REG_CCDSP, 0x12, val); } static void setcontrast(struct gspca_dev *gspca_dev, s32 val) { reg_write(gspca_dev, 0x00, 0x00, (val >> 8) & 0xff); reg_write(gspca_dev, 0x00, 0x01, val & 0xff); } static void setcolors(struct gspca_dev *gspca_dev, s32 val) { reg_write(gspca_dev, SPCA501_REG_CCDSP, 0x0c, val); } static void setblue_balance(struct gspca_dev *gspca_dev, s32 val) { reg_write(gspca_dev, SPCA501_REG_CCDSP, 0x11, val); } static void setred_balance(struct gspca_dev *gspca_dev, s32 val) { reg_write(gspca_dev, SPCA501_REG_CCDSP, 0x13, val); } /* this function is called at probe time */ static int sd_config(struct gspca_dev *gspca_dev, const struct usb_device_id *id) { struct sd *sd = (struct sd *) gspca_dev; struct cam *cam; cam = &gspca_dev->cam; cam->cam_mode = vga_mode; cam->nmodes = ARRAY_SIZE(vga_mode); sd->subtype = id->driver_info; return 0; } /* this function is called at probe and resume time */ static int sd_init(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; switch (sd->subtype) { case Arowana300KCMOSCamera: case SmileIntlCamera: /* Arowana 300k CMOS Camera data */ if (write_vector(gspca_dev, spca501c_arowana_init_data)) goto error; break; case MystFromOriUnknownCamera: /* Unknown Ori CMOS Camera data */ if (write_vector(gspca_dev, spca501c_mysterious_open_data)) goto error; break; default: /* generic spca501 init data */ if (write_vector(gspca_dev, spca501_init_data)) goto error; break; } gspca_dbg(gspca_dev, D_STREAM, "Initializing SPCA501 finished\n"); return 0; error: return -EINVAL; } static int sd_start(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; int mode; switch (sd->subtype) { case ThreeComHomeConnectLite: /* Special handling for 3com data */ write_vector(gspca_dev, spca501_3com_open_data); break; case Arowana300KCMOSCamera: case SmileIntlCamera: /* Arowana 300k CMOS Camera data */ write_vector(gspca_dev, spca501c_arowana_open_data); break; case MystFromOriUnknownCamera: /* Unknown CMOS Camera data */ write_vector(gspca_dev, spca501c_mysterious_init_data); break; default: /* Generic 501 open data */ write_vector(gspca_dev, spca501_open_data); } /* memorize the wanted pixel format */ mode = gspca_dev->cam.cam_mode[(int) gspca_dev->curr_mode].priv; /* Enable ISO packet machine CTRL reg=2, * index=1 bitmask=0x2 (bit ordinal 1) */ reg_write(gspca_dev, SPCA50X_REG_USB, 0x6, 0x94); switch (mode) { case 0: /* 640x480 */ reg_write(gspca_dev, SPCA50X_REG_USB, 0x07, 0x004a); break; case 1: /* 320x240 */ reg_write(gspca_dev, SPCA50X_REG_USB, 0x07, 0x104a); break; default: /* case 2: * 160x120 */ reg_write(gspca_dev, SPCA50X_REG_USB, 0x07, 0x204a); break; } reg_write(gspca_dev, SPCA501_REG_CTLRL, 0x01, 0x02); return 0; } static void sd_stopN(struct gspca_dev *gspca_dev) { /* Disable ISO packet * machine CTRL reg=2, index=1 bitmask=0x0 (bit ordinal 1) */ reg_write(gspca_dev, SPCA501_REG_CTLRL, 0x01, 0x00); } /* called on streamoff with alt 0 and on disconnect */ static void sd_stop0(struct gspca_dev *gspca_dev) { if (!gspca_dev->present) return; reg_write(gspca_dev, SPCA501_REG_CTLRL, 0x05, 0x00); } static void sd_pkt_scan(struct gspca_dev *gspca_dev, u8 *data, /* isoc packet */ int len) /* iso packet length */ { switch (data[0]) { case 0: /* start of frame */ gspca_frame_add(gspca_dev, LAST_PACKET, NULL, 0); data += SPCA501_OFFSET_DATA; len -= SPCA501_OFFSET_DATA; gspca_frame_add(gspca_dev, FIRST_PACKET, data, len); return; case 0xff: /* drop */ /* gspca_dev->last_packet_type = DISCARD_PACKET; */ return; } data++; len--; gspca_frame_add(gspca_dev, INTER_PACKET, data, len); } static int sd_s_ctrl(struct v4l2_ctrl *ctrl) { struct gspca_dev *gspca_dev = container_of(ctrl->handler, struct gspca_dev, ctrl_handler); gspca_dev->usb_err = 0; if (!gspca_dev->streaming) return 0; switch (ctrl->id) { case V4L2_CID_BRIGHTNESS: setbrightness(gspca_dev, ctrl->val); break; case V4L2_CID_CONTRAST: setcontrast(gspca_dev, ctrl->val); break; case V4L2_CID_SATURATION: setcolors(gspca_dev, ctrl->val); break; case V4L2_CID_BLUE_BALANCE: setblue_balance(gspca_dev, ctrl->val); break; case V4L2_CID_RED_BALANCE: setred_balance(gspca_dev, ctrl->val); break; } return gspca_dev->usb_err; } static const struct v4l2_ctrl_ops sd_ctrl_ops = { .s_ctrl = sd_s_ctrl, }; static int sd_init_controls(struct gspca_dev *gspca_dev) { struct v4l2_ctrl_handler *hdl = &gspca_dev->ctrl_handler; gspca_dev->vdev.ctrl_handler = hdl; v4l2_ctrl_handler_init(hdl, 5); v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_BRIGHTNESS, 0, 127, 1, 0); v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_CONTRAST, 0, 64725, 1, 64725); v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_SATURATION, 0, 63, 1, 20); v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_BLUE_BALANCE, 0, 127, 1, 0); v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_RED_BALANCE, 0, 127, 1, 0); if (hdl->error) { pr_err("Could not initialize controls\n"); return hdl->error; } return 0; } /* sub-driver description */ static const struct sd_desc sd_desc = { .name = MODULE_NAME, .config = sd_config, .init = sd_init, .init_controls = sd_init_controls, .start = sd_start, .stopN = sd_stopN, .stop0 = sd_stop0, .pkt_scan = sd_pkt_scan, }; /* -- module initialisation -- */ static const struct usb_device_id device_table[] = { {USB_DEVICE(0x040a, 0x0002), .driver_info = KodakDVC325}, {USB_DEVICE(0x0497, 0xc001), .driver_info = SmileIntlCamera}, {USB_DEVICE(0x0506, 0x00df), .driver_info = ThreeComHomeConnectLite}, {USB_DEVICE(0x0733, 0x0401), .driver_info = IntelCreateAndShare}, {USB_DEVICE(0x0733, 0x0402), .driver_info = ViewQuestM318B}, {USB_DEVICE(0x1776, 0x501c), .driver_info = Arowana300KCMOSCamera}, {USB_DEVICE(0x0000, 0x0000), .driver_info = MystFromOriUnknownCamera}, {} }; MODULE_DEVICE_TABLE(usb, device_table); /* -- device connect -- */ static int sd_probe(struct usb_interface *intf, const struct usb_device_id *id) { return gspca_dev_probe(intf, id, &sd_desc, sizeof(struct sd), THIS_MODULE); } static struct usb_driver sd_driver = { .name = MODULE_NAME, .id_table = device_table, .probe = sd_probe, .disconnect = gspca_disconnect, #ifdef CONFIG_PM .suspend = gspca_suspend, .resume = gspca_resume, .reset_resume = gspca_resume, #endif }; module_usb_driver(sd_driver);
2971 2972 3459 3459 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_BACKING_DEV_DEFS_H #define __LINUX_BACKING_DEV_DEFS_H #include <linux/list.h> #include <linux/radix-tree.h> #include <linux/rbtree.h> #include <linux/spinlock.h> #include <linux/percpu_counter.h> #include <linux/percpu-refcount.h> #include <linux/flex_proportions.h> #include <linux/timer.h> #include <linux/workqueue.h> #include <linux/kref.h> #include <linux/refcount.h> struct page; struct device; struct dentry; /* * Bits in bdi_writeback.state */ enum wb_state { WB_registered, /* bdi_register() was done */ WB_writeback_running, /* Writeback is in progress */ WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */ WB_start_all, /* nr_pages == 0 (all) work pending */ }; enum wb_stat_item { WB_RECLAIMABLE, WB_WRITEBACK, WB_DIRTIED, WB_WRITTEN, NR_WB_STAT_ITEMS }; #define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) /* * why some writeback work was initiated */ enum wb_reason { WB_REASON_BACKGROUND, WB_REASON_VMSCAN, WB_REASON_SYNC, WB_REASON_PERIODIC, WB_REASON_LAPTOP_TIMER, WB_REASON_FS_FREE_SPACE, /* * There is no bdi forker thread any more and works are done * by emergency worker, however, this is TPs userland visible * and we'll be exposing exactly the same information, * so it has a mismatch name. */ WB_REASON_FORKER_THREAD, WB_REASON_FOREIGN_FLUSH, WB_REASON_MAX, }; struct wb_completion { atomic_t cnt; wait_queue_head_t *waitq; }; #define __WB_COMPLETION_INIT(_waitq) \ (struct wb_completion){ .cnt = ATOMIC_INIT(1), .waitq = (_waitq) } /* * If one wants to wait for one or more wb_writeback_works, each work's * ->done should be set to a wb_completion defined using the following * macro. Once all work items are issued with wb_queue_work(), the caller * can wait for the completion of all using wb_wait_for_completion(). Work * items which are waited upon aren't freed automatically on completion. */ #define WB_COMPLETION_INIT(bdi) __WB_COMPLETION_INIT(&(bdi)->wb_waitq) #define DEFINE_WB_COMPLETION(cmpl, bdi) \ struct wb_completion cmpl = WB_COMPLETION_INIT(bdi) /* * Each wb (bdi_writeback) can perform writeback operations, is measured * and throttled, independently. Without cgroup writeback, each bdi * (bdi_writeback) is served by its embedded bdi->wb. * * On the default hierarchy, blkcg implicitly enables memcg. This allows * using memcg's page ownership for attributing writeback IOs, and every * memcg - blkcg combination can be served by its own wb by assigning a * dedicated wb to each memcg, which enables isolation across different * cgroups and propagation of IO back pressure down from the IO layer upto * the tasks which are generating the dirty pages to be written back. * * A cgroup wb is indexed on its bdi by the ID of the associated memcg, * refcounted with the number of inodes attached to it, and pins the memcg * and the corresponding blkcg. As the corresponding blkcg for a memcg may * change as blkcg is disabled and enabled higher up in the hierarchy, a wb * is tested for blkcg after lookup and removed from index on mismatch so * that a new wb for the combination can be created. * * Each bdi_writeback that is not embedded into the backing_dev_info must hold * a reference to the parent backing_dev_info. See cgwb_create() for details. */ struct bdi_writeback { struct backing_dev_info *bdi; /* our parent bdi */ unsigned long state; /* Always use atomic bitops on this */ unsigned long last_old_flush; /* last old data flush */ struct list_head b_dirty; /* dirty inodes */ struct list_head b_io; /* parked for writeback */ struct list_head b_more_io; /* parked for more writeback */ struct list_head b_dirty_time; /* time stamps are dirty */ spinlock_t list_lock; /* protects the b_* lists */ atomic_t writeback_inodes; /* number of inodes under writeback */ struct percpu_counter stat[NR_WB_STAT_ITEMS]; unsigned long bw_time_stamp; /* last time write bw is updated */ unsigned long dirtied_stamp; unsigned long written_stamp; /* pages written at bw_time_stamp */ unsigned long write_bandwidth; /* the estimated write bandwidth */ unsigned long avg_write_bandwidth; /* further smoothed write bw, > 0 */ /* * The base dirty throttle rate, re-calculated on every 200ms. * All the bdi tasks' dirty rate will be curbed under it. * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit * in small steps and is much more smooth/stable than the latter. */ unsigned long dirty_ratelimit; unsigned long balanced_dirty_ratelimit; struct fprop_local_percpu completions; int dirty_exceeded; enum wb_reason start_all_reason; spinlock_t work_lock; /* protects work_list & dwork scheduling */ struct list_head work_list; struct delayed_work dwork; /* work item used for writeback */ struct delayed_work bw_dwork; /* work item used for bandwidth estimate */ struct list_head bdi_node; /* anchored at bdi->wb_list */ #ifdef CONFIG_CGROUP_WRITEBACK struct percpu_ref refcnt; /* used only for !root wb's */ struct fprop_local_percpu memcg_completions; struct cgroup_subsys_state *memcg_css; /* the associated memcg */ struct cgroup_subsys_state *blkcg_css; /* and blkcg */ struct list_head memcg_node; /* anchored at memcg->cgwb_list */ struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */ struct list_head b_attached; /* attached inodes, protected by list_lock */ struct list_head offline_node; /* anchored at offline_cgwbs */ union { struct work_struct release_work; struct rcu_head rcu; }; #endif }; struct backing_dev_info { u64 id; struct rb_node rb_node; /* keyed by ->id */ struct list_head bdi_list; unsigned long ra_pages; /* max readahead in PAGE_SIZE units */ unsigned long io_pages; /* max allowed IO size */ struct kref refcnt; /* Reference counter for the structure */ unsigned int capabilities; /* Device capabilities */ unsigned int min_ratio; unsigned int max_ratio, max_prop_frac; /* * Sum of avg_write_bw of wbs with dirty inodes. > 0 if there are * any dirty wbs, which is depended upon by bdi_has_dirty(). */ atomic_long_t tot_write_bandwidth; /* * Jiffies when last process was dirty throttled on this bdi. Used by * blk-wbt. */ unsigned long last_bdp_sleep; struct bdi_writeback wb; /* the root writeback info for this bdi */ struct list_head wb_list; /* list of all wbs */ #ifdef CONFIG_CGROUP_WRITEBACK struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */ struct mutex cgwb_release_mutex; /* protect shutdown of wb structs */ struct rw_semaphore wb_switch_rwsem; /* no cgwb switch while syncing */ #endif wait_queue_head_t wb_waitq; struct device *dev; char dev_name[64]; struct device *owner; struct timer_list laptop_mode_wb_timer; #ifdef CONFIG_DEBUG_FS struct dentry *debug_dir; #endif }; struct wb_lock_cookie { bool locked; unsigned long flags; }; #ifdef CONFIG_CGROUP_WRITEBACK /** * wb_tryget - try to increment a wb's refcount * @wb: bdi_writeback to get */ static inline bool wb_tryget(struct bdi_writeback *wb) { if (wb != &wb->bdi->wb) return percpu_ref_tryget(&wb->refcnt); return true; } /** * wb_get - increment a wb's refcount * @wb: bdi_writeback to get */ static inline void wb_get(struct bdi_writeback *wb) { if (wb != &wb->bdi->wb) percpu_ref_get(&wb->refcnt); } /** * wb_put - decrement a wb's refcount * @wb: bdi_writeback to put * @nr: number of references to put */ static inline void wb_put_many(struct bdi_writeback *wb, unsigned long nr) { if (WARN_ON_ONCE(!wb->bdi)) { /* * A driver bug might cause a file to be removed before bdi was * initialized. */ return; } if (wb != &wb->bdi->wb) percpu_ref_put_many(&wb->refcnt, nr); } /** * wb_put - decrement a wb's refcount * @wb: bdi_writeback to put */ static inline void wb_put(struct bdi_writeback *wb) { wb_put_many(wb, 1); } /** * wb_dying - is a wb dying? * @wb: bdi_writeback of interest * * Returns whether @wb is unlinked and being drained. */ static inline bool wb_dying(struct bdi_writeback *wb) { return percpu_ref_is_dying(&wb->refcnt); } #else /* CONFIG_CGROUP_WRITEBACK */ static inline bool wb_tryget(struct bdi_writeback *wb) { return true; } static inline void wb_get(struct bdi_writeback *wb) { } static inline void wb_put(struct bdi_writeback *wb) { } static inline void wb_put_many(struct bdi_writeback *wb, unsigned long nr) { } static inline bool wb_dying(struct bdi_writeback *wb) { return false; } #endif /* CONFIG_CGROUP_WRITEBACK */ #endif /* __LINUX_BACKING_DEV_DEFS_H */
32 32 32 9 9 9 32 32 32 32 32 32 9 9 9 8 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 // SPDX-License-Identifier: GPL-2.0 // Copyright (C) 2017 Thomas Gleixner <tglx@linutronix.de> #include <linux/spinlock.h> #include <linux/seq_file.h> #include <linux/bitmap.h> #include <linux/percpu.h> #include <linux/cpu.h> #include <linux/irq.h> #define IRQ_MATRIX_SIZE (BITS_TO_LONGS(IRQ_MATRIX_BITS)) struct cpumap { unsigned int available; unsigned int allocated; unsigned int managed; unsigned int managed_allocated; bool initialized; bool online; unsigned long alloc_map[IRQ_MATRIX_SIZE]; unsigned long managed_map[IRQ_MATRIX_SIZE]; }; struct irq_matrix { unsigned int matrix_bits; unsigned int alloc_start; unsigned int alloc_end; unsigned int alloc_size; unsigned int global_available; unsigned int global_reserved; unsigned int systembits_inalloc; unsigned int total_allocated; unsigned int online_maps; struct cpumap __percpu *maps; unsigned long scratch_map[IRQ_MATRIX_SIZE]; unsigned long system_map[IRQ_MATRIX_SIZE]; }; #define CREATE_TRACE_POINTS #include <trace/events/irq_matrix.h> /** * irq_alloc_matrix - Allocate a irq_matrix structure and initialize it * @matrix_bits: Number of matrix bits must be <= IRQ_MATRIX_BITS * @alloc_start: From which bit the allocation search starts * @alloc_end: At which bit the allocation search ends, i.e first * invalid bit */ __init struct irq_matrix *irq_alloc_matrix(unsigned int matrix_bits, unsigned int alloc_start, unsigned int alloc_end) { struct irq_matrix *m; if (matrix_bits > IRQ_MATRIX_BITS) return NULL; m = kzalloc(sizeof(*m), GFP_KERNEL); if (!m) return NULL; m->matrix_bits = matrix_bits; m->alloc_start = alloc_start; m->alloc_end = alloc_end; m->alloc_size = alloc_end - alloc_start; m->maps = alloc_percpu(*m->maps); if (!m->maps) { kfree(m); return NULL; } return m; } /** * irq_matrix_online - Bring the local CPU matrix online * @m: Matrix pointer */ void irq_matrix_online(struct irq_matrix *m) { struct cpumap *cm = this_cpu_ptr(m->maps); BUG_ON(cm->online); if (!cm->initialized) { cm->available = m->alloc_size; cm->available -= cm->managed + m->systembits_inalloc; cm->initialized = true; } m->global_available += cm->available; cm->online = true; m->online_maps++; trace_irq_matrix_online(m); } /** * irq_matrix_offline - Bring the local CPU matrix offline * @m: Matrix pointer */ void irq_matrix_offline(struct irq_matrix *m) { struct cpumap *cm = this_cpu_ptr(m->maps); /* Update the global available size */ m->global_available -= cm->available; cm->online = false; m->online_maps--; trace_irq_matrix_offline(m); } static unsigned int matrix_alloc_area(struct irq_matrix *m, struct cpumap *cm, unsigned int num, bool managed) { unsigned int area, start = m->alloc_start; unsigned int end = m->alloc_end; bitmap_or(m->scratch_map, cm->managed_map, m->system_map, end); bitmap_or(m->scratch_map, m->scratch_map, cm->alloc_map, end); area = bitmap_find_next_zero_area(m->scratch_map, end, start, num, 0); if (area >= end) return area; if (managed) bitmap_set(cm->managed_map, area, num); else bitmap_set(cm->alloc_map, area, num); return area; } /* Find the best CPU which has the lowest vector allocation count */ static unsigned int matrix_find_best_cpu(struct irq_matrix *m, const struct cpumask *msk) { unsigned int cpu, best_cpu, maxavl = 0; struct cpumap *cm; best_cpu = UINT_MAX; for_each_cpu(cpu, msk) { cm = per_cpu_ptr(m->maps, cpu); if (!cm->online || cm->available <= maxavl) continue; best_cpu = cpu; maxavl = cm->available; } return best_cpu; } /* Find the best CPU which has the lowest number of managed IRQs allocated */ static unsigned int matrix_find_best_cpu_managed(struct irq_matrix *m, const struct cpumask *msk) { unsigned int cpu, best_cpu, allocated = UINT_MAX; struct cpumap *cm; best_cpu = UINT_MAX; for_each_cpu(cpu, msk) { cm = per_cpu_ptr(m->maps, cpu); if (!cm->online || cm->managed_allocated > allocated) continue; best_cpu = cpu; allocated = cm->managed_allocated; } return best_cpu; } /** * irq_matrix_assign_system - Assign system wide entry in the matrix * @m: Matrix pointer * @bit: Which bit to reserve * @replace: Replace an already allocated vector with a system * vector at the same bit position. * * The BUG_ON()s below are on purpose. If this goes wrong in the * early boot process, then the chance to survive is about zero. * If this happens when the system is life, it's not much better. */ void irq_matrix_assign_system(struct irq_matrix *m, unsigned int bit, bool replace) { struct cpumap *cm = this_cpu_ptr(m->maps); BUG_ON(bit > m->matrix_bits); BUG_ON(m->online_maps > 1 || (m->online_maps && !replace)); set_bit(bit, m->system_map); if (replace) { BUG_ON(!test_and_clear_bit(bit, cm->alloc_map)); cm->allocated--; m->total_allocated--; } if (bit >= m->alloc_start && bit < m->alloc_end) m->systembits_inalloc++; trace_irq_matrix_assign_system(bit, m); } /** * irq_matrix_reserve_managed - Reserve a managed interrupt in a CPU map * @m: Matrix pointer * @msk: On which CPUs the bits should be reserved. * * Can be called for offline CPUs. Note, this will only reserve one bit * on all CPUs in @msk, but it's not guaranteed that the bits are at the * same offset on all CPUs */ int irq_matrix_reserve_managed(struct irq_matrix *m, const struct cpumask *msk) { unsigned int cpu, failed_cpu; for_each_cpu(cpu, msk) { struct cpumap *cm = per_cpu_ptr(m->maps, cpu); unsigned int bit; bit = matrix_alloc_area(m, cm, 1, true); if (bit >= m->alloc_end) goto cleanup; cm->managed++; if (cm->online) { cm->available--; m->global_available--; } trace_irq_matrix_reserve_managed(bit, cpu, m, cm); } return 0; cleanup: failed_cpu = cpu; for_each_cpu(cpu, msk) { if (cpu == failed_cpu) break; irq_matrix_remove_managed(m, cpumask_of(cpu)); } return -ENOSPC; } /** * irq_matrix_remove_managed - Remove managed interrupts in a CPU map * @m: Matrix pointer * @msk: On which CPUs the bits should be removed * * Can be called for offline CPUs * * This removes not allocated managed interrupts from the map. It does * not matter which one because the managed interrupts free their * allocation when they shut down. If not, the accounting is screwed, * but all what can be done at this point is warn about it. */ void irq_matrix_remove_managed(struct irq_matrix *m, const struct cpumask *msk) { unsigned int cpu; for_each_cpu(cpu, msk) { struct cpumap *cm = per_cpu_ptr(m->maps, cpu); unsigned int bit, end = m->alloc_end; if (WARN_ON_ONCE(!cm->managed)) continue; /* Get managed bit which are not allocated */ bitmap_andnot(m->scratch_map, cm->managed_map, cm->alloc_map, end); bit = find_first_bit(m->scratch_map, end); if (WARN_ON_ONCE(bit >= end)) continue; clear_bit(bit, cm->managed_map); cm->managed--; if (cm->online) { cm->available++; m->global_available++; } trace_irq_matrix_remove_managed(bit, cpu, m, cm); } } /** * irq_matrix_alloc_managed - Allocate a managed interrupt in a CPU map * @m: Matrix pointer * @msk: Which CPUs to search in * @mapped_cpu: Pointer to store the CPU for which the irq was allocated */ int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk, unsigned int *mapped_cpu) { unsigned int bit, cpu, end; struct cpumap *cm; if (cpumask_empty(msk)) return -EINVAL; cpu = matrix_find_best_cpu_managed(m, msk); if (cpu == UINT_MAX) return -ENOSPC; cm = per_cpu_ptr(m->maps, cpu); end = m->alloc_end; /* Get managed bit which are not allocated */ bitmap_andnot(m->scratch_map, cm->managed_map, cm->alloc_map, end); bit = find_first_bit(m->scratch_map, end); if (bit >= end) return -ENOSPC; set_bit(bit, cm->alloc_map); cm->allocated++; cm->managed_allocated++; m->total_allocated++; *mapped_cpu = cpu; trace_irq_matrix_alloc_managed(bit, cpu, m, cm); return bit; } /** * irq_matrix_assign - Assign a preallocated interrupt in the local CPU map * @m: Matrix pointer * @bit: Which bit to mark * * This should only be used to mark preallocated vectors */ void irq_matrix_assign(struct irq_matrix *m, unsigned int bit) { struct cpumap *cm = this_cpu_ptr(m->maps); if (WARN_ON_ONCE(bit < m->alloc_start || bit >= m->alloc_end)) return; if (WARN_ON_ONCE(test_and_set_bit(bit, cm->alloc_map))) return; cm->allocated++; m->total_allocated++; cm->available--; m->global_available--; trace_irq_matrix_assign(bit, smp_processor_id(), m, cm); } /** * irq_matrix_reserve - Reserve interrupts * @m: Matrix pointer * * This is merely a book keeping call. It increments the number of globally * reserved interrupt bits w/o actually allocating them. This allows to * setup interrupt descriptors w/o assigning low level resources to it. * The actual allocation happens when the interrupt gets activated. */ void irq_matrix_reserve(struct irq_matrix *m) { if (m->global_reserved == m->global_available) pr_warn("Interrupt reservation exceeds available resources\n"); m->global_reserved++; trace_irq_matrix_reserve(m); } /** * irq_matrix_remove_reserved - Remove interrupt reservation * @m: Matrix pointer * * This is merely a book keeping call. It decrements the number of globally * reserved interrupt bits. This is used to undo irq_matrix_reserve() when the * interrupt was never in use and a real vector allocated, which undid the * reservation. */ void irq_matrix_remove_reserved(struct irq_matrix *m) { m->global_reserved--; trace_irq_matrix_remove_reserved(m); } /** * irq_matrix_alloc - Allocate a regular interrupt in a CPU map * @m: Matrix pointer * @msk: Which CPUs to search in * @reserved: Allocate previously reserved interrupts * @mapped_cpu: Pointer to store the CPU for which the irq was allocated */ int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk, bool reserved, unsigned int *mapped_cpu) { unsigned int cpu, bit; struct cpumap *cm; /* * Not required in theory, but matrix_find_best_cpu() uses * for_each_cpu() which ignores the cpumask on UP . */ if (cpumask_empty(msk)) return -EINVAL; cpu = matrix_find_best_cpu(m, msk); if (cpu == UINT_MAX) return -ENOSPC; cm = per_cpu_ptr(m->maps, cpu); bit = matrix_alloc_area(m, cm, 1, false); if (bit >= m->alloc_end) return -ENOSPC; cm->allocated++; cm->available--; m->total_allocated++; m->global_available--; if (reserved) m->global_reserved--; *mapped_cpu = cpu; trace_irq_matrix_alloc(bit, cpu, m, cm); return bit; } /** * irq_matrix_free - Free allocated interrupt in the matrix * @m: Matrix pointer * @cpu: Which CPU map needs be updated * @bit: The bit to remove * @managed: If true, the interrupt is managed and not accounted * as available. */ void irq_matrix_free(struct irq_matrix *m, unsigned int cpu, unsigned int bit, bool managed) { struct cpumap *cm = per_cpu_ptr(m->maps, cpu); if (WARN_ON_ONCE(bit < m->alloc_start || bit >= m->alloc_end)) return; if (WARN_ON_ONCE(!test_and_clear_bit(bit, cm->alloc_map))) return; cm->allocated--; if(managed) cm->managed_allocated--; if (cm->online) m->total_allocated--; if (!managed) { cm->available++; if (cm->online) m->global_available++; } trace_irq_matrix_free(bit, cpu, m, cm); } /** * irq_matrix_available - Get the number of globally available irqs * @m: Pointer to the matrix to query * @cpudown: If true, the local CPU is about to go down, adjust * the number of available irqs accordingly */ unsigned int irq_matrix_available(struct irq_matrix *m, bool cpudown) { struct cpumap *cm = this_cpu_ptr(m->maps); if (!cpudown) return m->global_available; return m->global_available - cm->available; } /** * irq_matrix_reserved - Get the number of globally reserved irqs * @m: Pointer to the matrix to query */ unsigned int irq_matrix_reserved(struct irq_matrix *m) { return m->global_reserved; } /** * irq_matrix_allocated - Get the number of allocated non-managed irqs on the local CPU * @m: Pointer to the matrix to search * * This returns number of allocated non-managed interrupts. */ unsigned int irq_matrix_allocated(struct irq_matrix *m) { struct cpumap *cm = this_cpu_ptr(m->maps); return cm->allocated - cm->managed_allocated; } #ifdef CONFIG_GENERIC_IRQ_DEBUGFS /** * irq_matrix_debug_show - Show detailed allocation information * @sf: Pointer to the seq_file to print to * @m: Pointer to the matrix allocator * @ind: Indentation for the print format * * Note, this is a lockless snapshot. */ void irq_matrix_debug_show(struct seq_file *sf, struct irq_matrix *m, int ind) { unsigned int nsys = bitmap_weight(m->system_map, m->matrix_bits); int cpu; seq_printf(sf, "Online bitmaps: %6u\n", m->online_maps); seq_printf(sf, "Global available: %6u\n", m->global_available); seq_printf(sf, "Global reserved: %6u\n", m->global_reserved); seq_printf(sf, "Total allocated: %6u\n", m->total_allocated); seq_printf(sf, "System: %u: %*pbl\n", nsys, m->matrix_bits, m->system_map); seq_printf(sf, "%*s| CPU | avl | man | mac | act | vectors\n", ind, " "); cpus_read_lock(); for_each_online_cpu(cpu) { struct cpumap *cm = per_cpu_ptr(m->maps, cpu); seq_printf(sf, "%*s %4d %4u %4u %4u %4u %*pbl\n", ind, " ", cpu, cm->available, cm->managed, cm->managed_allocated, cm->allocated, m->matrix_bits, cm->alloc_map); } cpus_read_unlock(); } #endif
4 4 4 1 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 // SPDX-License-Identifier: GPL-2.0-only /* * SCSI RDMA (SRP) transport class * * Copyright (C) 2007 FUJITA Tomonori <tomof@acm.org> */ #include <linux/init.h> #include <linux/module.h> #include <linux/jiffies.h> #include <linux/err.h> #include <linux/slab.h> #include <linux/string.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_device.h> #include <scsi/scsi_host.h> #include <scsi/scsi_transport.h> #include <scsi/scsi_transport_srp.h> #include "scsi_priv.h" struct srp_host_attrs { atomic_t next_port_id; }; #define to_srp_host_attrs(host) ((struct srp_host_attrs *)(host)->shost_data) #define SRP_HOST_ATTRS 0 #define SRP_RPORT_ATTRS 8 struct srp_internal { struct scsi_transport_template t; struct srp_function_template *f; struct device_attribute *host_attrs[SRP_HOST_ATTRS + 1]; struct device_attribute *rport_attrs[SRP_RPORT_ATTRS + 1]; struct transport_container rport_attr_cont; }; static int scsi_is_srp_rport(const struct device *dev); #define to_srp_internal(tmpl) container_of(tmpl, struct srp_internal, t) #define dev_to_rport(d) container_of(d, struct srp_rport, dev) #define transport_class_to_srp_rport(dev) dev_to_rport((dev)->parent) static inline struct Scsi_Host *rport_to_shost(struct srp_rport *r) { return dev_to_shost(r->dev.parent); } static int find_child_rport(struct device *dev, void *data) { struct device **child = data; if (scsi_is_srp_rport(dev)) { WARN_ON_ONCE(*child); *child = dev; } return 0; } static inline struct srp_rport *shost_to_rport(struct Scsi_Host *shost) { struct device *child = NULL; WARN_ON_ONCE(device_for_each_child(&shost->shost_gendev, &child, find_child_rport) < 0); return child ? dev_to_rport(child) : NULL; } /** * srp_tmo_valid() - check timeout combination validity * @reconnect_delay: Reconnect delay in seconds. * @fast_io_fail_tmo: Fast I/O fail timeout in seconds. * @dev_loss_tmo: Device loss timeout in seconds. * * The combination of the timeout parameters must be such that SCSI commands * are finished in a reasonable time. Hence do not allow the fast I/O fail * timeout to exceed SCSI_DEVICE_BLOCK_MAX_TIMEOUT nor allow dev_loss_tmo to * exceed that limit if failing I/O fast has been disabled. Furthermore, these * parameters must be such that multipath can detect failed paths timely. * Hence do not allow all three parameters to be disabled simultaneously. */ int srp_tmo_valid(int reconnect_delay, int fast_io_fail_tmo, long dev_loss_tmo) { if (reconnect_delay < 0 && fast_io_fail_tmo < 0 && dev_loss_tmo < 0) return -EINVAL; if (reconnect_delay == 0) return -EINVAL; if (fast_io_fail_tmo > SCSI_DEVICE_BLOCK_MAX_TIMEOUT) return -EINVAL; if (fast_io_fail_tmo < 0 && dev_loss_tmo > SCSI_DEVICE_BLOCK_MAX_TIMEOUT) return -EINVAL; if (dev_loss_tmo >= LONG_MAX / HZ) return -EINVAL; if (fast_io_fail_tmo >= 0 && dev_loss_tmo >= 0 && fast_io_fail_tmo >= dev_loss_tmo) return -EINVAL; return 0; } EXPORT_SYMBOL_GPL(srp_tmo_valid); static int srp_host_setup(struct transport_container *tc, struct device *dev, struct device *cdev) { struct Scsi_Host *shost = dev_to_shost(dev); struct srp_host_attrs *srp_host = to_srp_host_attrs(shost); atomic_set(&srp_host->next_port_id, 0); return 0; } static DECLARE_TRANSPORT_CLASS(srp_host_class, "srp_host", srp_host_setup, NULL, NULL); static DECLARE_TRANSPORT_CLASS(srp_rport_class, "srp_remote_ports", NULL, NULL, NULL); static ssize_t show_srp_rport_id(struct device *dev, struct device_attribute *attr, char *buf) { struct srp_rport *rport = transport_class_to_srp_rport(dev); return sprintf(buf, "%16phC\n", rport->port_id); } static DEVICE_ATTR(port_id, S_IRUGO, show_srp_rport_id, NULL); static const struct { u32 value; char *name; } srp_rport_role_names[] = { {SRP_RPORT_ROLE_INITIATOR, "SRP Initiator"}, {SRP_RPORT_ROLE_TARGET, "SRP Target"}, }; static ssize_t show_srp_rport_roles(struct device *dev, struct device_attribute *attr, char *buf) { struct srp_rport *rport = transport_class_to_srp_rport(dev); int i; char *name = NULL; for (i = 0; i < ARRAY_SIZE(srp_rport_role_names); i++) if (srp_rport_role_names[i].value == rport->roles) { name = srp_rport_role_names[i].name; break; } return sprintf(buf, "%s\n", name ? : "unknown"); } static DEVICE_ATTR(roles, S_IRUGO, show_srp_rport_roles, NULL); static ssize_t store_srp_rport_delete(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct srp_rport *rport = transport_class_to_srp_rport(dev); struct Scsi_Host *shost = dev_to_shost(dev); struct srp_internal *i = to_srp_internal(shost->transportt); if (i->f->rport_delete) { i->f->rport_delete(rport); return count; } else { return -ENOSYS; } } static DEVICE_ATTR(delete, S_IWUSR, NULL, store_srp_rport_delete); static ssize_t show_srp_rport_state(struct device *dev, struct device_attribute *attr, char *buf) { static const char *const state_name[] = { [SRP_RPORT_RUNNING] = "running", [SRP_RPORT_BLOCKED] = "blocked", [SRP_RPORT_FAIL_FAST] = "fail-fast", [SRP_RPORT_LOST] = "lost", }; struct srp_rport *rport = transport_class_to_srp_rport(dev); enum srp_rport_state state = rport->state; return sprintf(buf, "%s\n", (unsigned)state < ARRAY_SIZE(state_name) ? state_name[state] : "???"); } static DEVICE_ATTR(state, S_IRUGO, show_srp_rport_state, NULL); static ssize_t srp_show_tmo(char *buf, int tmo) { return tmo >= 0 ? sprintf(buf, "%d\n", tmo) : sprintf(buf, "off\n"); } int srp_parse_tmo(int *tmo, const char *buf) { int res = 0; if (strncmp(buf, "off", 3) != 0) res = kstrtoint(buf, 0, tmo); else *tmo = -1; return res; } EXPORT_SYMBOL(srp_parse_tmo); static ssize_t show_reconnect_delay(struct device *dev, struct device_attribute *attr, char *buf) { struct srp_rport *rport = transport_class_to_srp_rport(dev); return srp_show_tmo(buf, rport->reconnect_delay); } static ssize_t store_reconnect_delay(struct device *dev, struct device_attribute *attr, const char *buf, const size_t count) { struct srp_rport *rport = transport_class_to_srp_rport(dev); int res, delay; res = srp_parse_tmo(&delay, buf); if (res) goto out; res = srp_tmo_valid(delay, rport->fast_io_fail_tmo, rport->dev_loss_tmo); if (res) goto out; if (rport->reconnect_delay <= 0 && delay > 0 && rport->state != SRP_RPORT_RUNNING) { queue_delayed_work(system_long_wq, &rport->reconnect_work, delay * HZ); } else if (delay <= 0) { cancel_delayed_work(&rport->reconnect_work); } rport->reconnect_delay = delay; res = count; out: return res; } static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR, show_reconnect_delay, store_reconnect_delay); static ssize_t show_failed_reconnects(struct device *dev, struct device_attribute *attr, char *buf) { struct srp_rport *rport = transport_class_to_srp_rport(dev); return sprintf(buf, "%d\n", rport->failed_reconnects); } static DEVICE_ATTR(failed_reconnects, S_IRUGO, show_failed_reconnects, NULL); static ssize_t show_srp_rport_fast_io_fail_tmo(struct device *dev, struct device_attribute *attr, char *buf) { struct srp_rport *rport = transport_class_to_srp_rport(dev); return srp_show_tmo(buf, rport->fast_io_fail_tmo); } static ssize_t store_srp_rport_fast_io_fail_tmo(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct srp_rport *rport = transport_class_to_srp_rport(dev); int res; int fast_io_fail_tmo; res = srp_parse_tmo(&fast_io_fail_tmo, buf); if (res) goto out; res = srp_tmo_valid(rport->reconnect_delay, fast_io_fail_tmo, rport->dev_loss_tmo); if (res) goto out; rport->fast_io_fail_tmo = fast_io_fail_tmo; res = count; out: return res; } static DEVICE_ATTR(fast_io_fail_tmo, S_IRUGO | S_IWUSR, show_srp_rport_fast_io_fail_tmo, store_srp_rport_fast_io_fail_tmo); static ssize_t show_srp_rport_dev_loss_tmo(struct device *dev, struct device_attribute *attr, char *buf) { struct srp_rport *rport = transport_class_to_srp_rport(dev); return srp_show_tmo(buf, rport->dev_loss_tmo); } static ssize_t store_srp_rport_dev_loss_tmo(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct srp_rport *rport = transport_class_to_srp_rport(dev); int res; int dev_loss_tmo; res = srp_parse_tmo(&dev_loss_tmo, buf); if (res) goto out; res = srp_tmo_valid(rport->reconnect_delay, rport->fast_io_fail_tmo, dev_loss_tmo); if (res) goto out; rport->dev_loss_tmo = dev_loss_tmo; res = count; out: return res; } static DEVICE_ATTR(dev_loss_tmo, S_IRUGO | S_IWUSR, show_srp_rport_dev_loss_tmo, store_srp_rport_dev_loss_tmo); static int srp_rport_set_state(struct srp_rport *rport, enum srp_rport_state new_state) { enum srp_rport_state old_state = rport->state; lockdep_assert_held(&rport->mutex); switch (new_state) { case SRP_RPORT_RUNNING: switch (old_state) { case SRP_RPORT_LOST: goto invalid; default: break; } break; case SRP_RPORT_BLOCKED: switch (old_state) { case SRP_RPORT_RUNNING: break; default: goto invalid; } break; case SRP_RPORT_FAIL_FAST: switch (old_state) { case SRP_RPORT_LOST: goto invalid; default: break; } break; case SRP_RPORT_LOST: break; } rport->state = new_state; return 0; invalid: return -EINVAL; } /** * srp_reconnect_work() - reconnect and schedule a new attempt if necessary * @work: Work structure used for scheduling this operation. */ static void srp_reconnect_work(struct work_struct *work) { struct srp_rport *rport = container_of(to_delayed_work(work), struct srp_rport, reconnect_work); struct Scsi_Host *shost = rport_to_shost(rport); int delay, res; res = srp_reconnect_rport(rport); if (res != 0) { shost_printk(KERN_ERR, shost, "reconnect attempt %d failed (%d)\n", ++rport->failed_reconnects, res); delay = rport->reconnect_delay * min(100, max(1, rport->failed_reconnects - 10)); if (delay > 0) queue_delayed_work(system_long_wq, &rport->reconnect_work, delay * HZ); } } /* * scsi_block_targets() must have been called before this function is * called to guarantee that no .queuecommand() calls are in progress. */ static void __rport_fail_io_fast(struct srp_rport *rport) { struct Scsi_Host *shost = rport_to_shost(rport); struct srp_internal *i; lockdep_assert_held(&rport->mutex); if (srp_rport_set_state(rport, SRP_RPORT_FAIL_FAST)) return; scsi_target_unblock(rport->dev.parent, SDEV_TRANSPORT_OFFLINE); /* Involve the LLD if possible to terminate all I/O on the rport. */ i = to_srp_internal(shost->transportt); if (i->f->terminate_rport_io) i->f->terminate_rport_io(rport); } /** * rport_fast_io_fail_timedout() - fast I/O failure timeout handler * @work: Work structure used for scheduling this operation. */ static void rport_fast_io_fail_timedout(struct work_struct *work) { struct srp_rport *rport = container_of(to_delayed_work(work), struct srp_rport, fast_io_fail_work); struct Scsi_Host *shost = rport_to_shost(rport); pr_info("fast_io_fail_tmo expired for SRP %s / %s.\n", dev_name(&rport->dev), dev_name(&shost->shost_gendev)); mutex_lock(&rport->mutex); if (rport->state == SRP_RPORT_BLOCKED) __rport_fail_io_fast(rport); mutex_unlock(&rport->mutex); } /** * rport_dev_loss_timedout() - device loss timeout handler * @work: Work structure used for scheduling this operation. */ static void rport_dev_loss_timedout(struct work_struct *work) { struct srp_rport *rport = container_of(to_delayed_work(work), struct srp_rport, dev_loss_work); struct Scsi_Host *shost = rport_to_shost(rport); struct srp_internal *i = to_srp_internal(shost->transportt); pr_info("dev_loss_tmo expired for SRP %s / %s.\n", dev_name(&rport->dev), dev_name(&shost->shost_gendev)); mutex_lock(&rport->mutex); WARN_ON(srp_rport_set_state(rport, SRP_RPORT_LOST) != 0); scsi_target_unblock(rport->dev.parent, SDEV_TRANSPORT_OFFLINE); mutex_unlock(&rport->mutex); i->f->rport_delete(rport); } static void __srp_start_tl_fail_timers(struct srp_rport *rport) { struct Scsi_Host *shost = rport_to_shost(rport); int delay, fast_io_fail_tmo, dev_loss_tmo; lockdep_assert_held(&rport->mutex); delay = rport->reconnect_delay; fast_io_fail_tmo = rport->fast_io_fail_tmo; dev_loss_tmo = rport->dev_loss_tmo; pr_debug("%s current state: %d\n", dev_name(&shost->shost_gendev), rport->state); if (rport->state == SRP_RPORT_LOST) return; if (delay > 0) queue_delayed_work(system_long_wq, &rport->reconnect_work, 1UL * delay * HZ); if ((fast_io_fail_tmo >= 0 || dev_loss_tmo >= 0) && srp_rport_set_state(rport, SRP_RPORT_BLOCKED) == 0) { pr_debug("%s new state: %d\n", dev_name(&shost->shost_gendev), rport->state); scsi_block_targets(shost, &shost->shost_gendev); if (fast_io_fail_tmo >= 0) queue_delayed_work(system_long_wq, &rport->fast_io_fail_work, 1UL * fast_io_fail_tmo * HZ); if (dev_loss_tmo >= 0) queue_delayed_work(system_long_wq, &rport->dev_loss_work, 1UL * dev_loss_tmo * HZ); } } /** * srp_start_tl_fail_timers() - start the transport layer failure timers * @rport: SRP target port. * * Start the transport layer fast I/O failure and device loss timers. Do not * modify a timer that was already started. */ void srp_start_tl_fail_timers(struct srp_rport *rport) { mutex_lock(&rport->mutex); __srp_start_tl_fail_timers(rport); mutex_unlock(&rport->mutex); } EXPORT_SYMBOL(srp_start_tl_fail_timers); /** * srp_reconnect_rport() - reconnect to an SRP target port * @rport: SRP target port. * * Blocks SCSI command queueing before invoking reconnect() such that * queuecommand() won't be invoked concurrently with reconnect() from outside * the SCSI EH. This is important since a reconnect() implementation may * reallocate resources needed by queuecommand(). * * Notes: * - This function neither waits until outstanding requests have finished nor * tries to abort these. It is the responsibility of the reconnect() * function to finish outstanding commands before reconnecting to the target * port. * - It is the responsibility of the caller to ensure that the resources * reallocated by the reconnect() function won't be used while this function * is in progress. One possible strategy is to invoke this function from * the context of the SCSI EH thread only. Another possible strategy is to * lock the rport mutex inside each SCSI LLD callback that can be invoked by * the SCSI EH (the scsi_host_template.eh_*() functions and also the * scsi_host_template.queuecommand() function). */ int srp_reconnect_rport(struct srp_rport *rport) { struct Scsi_Host *shost = rport_to_shost(rport); struct srp_internal *i = to_srp_internal(shost->transportt); struct scsi_device *sdev; int res; pr_debug("SCSI host %s\n", dev_name(&shost->shost_gendev)); res = mutex_lock_interruptible(&rport->mutex); if (res) goto out; if (rport->state != SRP_RPORT_FAIL_FAST && rport->state != SRP_RPORT_LOST) /* * sdev state must be SDEV_TRANSPORT_OFFLINE, transition * to SDEV_BLOCK is illegal. Calling scsi_target_unblock() * later is ok though, scsi_internal_device_unblock_nowait() * treats SDEV_TRANSPORT_OFFLINE like SDEV_BLOCK. */ scsi_block_targets(shost, &shost->shost_gendev); res = rport->state != SRP_RPORT_LOST ? i->f->reconnect(rport) : -ENODEV; pr_debug("%s (state %d): transport.reconnect() returned %d\n", dev_name(&shost->shost_gendev), rport->state, res); if (res == 0) { cancel_delayed_work(&rport->fast_io_fail_work); cancel_delayed_work(&rport->dev_loss_work); rport->failed_reconnects = 0; srp_rport_set_state(rport, SRP_RPORT_RUNNING); scsi_target_unblock(&shost->shost_gendev, SDEV_RUNNING); /* * If the SCSI error handler has offlined one or more devices, * invoking scsi_target_unblock() won't change the state of * these devices into running so do that explicitly. */ shost_for_each_device(sdev, shost) { mutex_lock(&sdev->state_mutex); if (sdev->sdev_state == SDEV_OFFLINE) sdev->sdev_state = SDEV_RUNNING; mutex_unlock(&sdev->state_mutex); } } else if (rport->state == SRP_RPORT_RUNNING) { /* * srp_reconnect_rport() has been invoked with fast_io_fail * and dev_loss off. Mark the port as failed and start the TL * failure timers if these had not yet been started. */ __rport_fail_io_fast(rport); __srp_start_tl_fail_timers(rport); } else if (rport->state != SRP_RPORT_BLOCKED) { scsi_target_unblock(&shost->shost_gendev, SDEV_TRANSPORT_OFFLINE); } mutex_unlock(&rport->mutex); out: return res; } EXPORT_SYMBOL(srp_reconnect_rport); /** * srp_timed_out() - SRP transport intercept of the SCSI timeout EH * @scmd: SCSI command. * * If a timeout occurs while an rport is in the blocked state, ask the SCSI * EH to continue waiting (SCSI_EH_RESET_TIMER). Otherwise let the SCSI core * handle the timeout (SCSI_EH_NOT_HANDLED). * * Note: This function is called from soft-IRQ context and with the request * queue lock held. */ enum scsi_timeout_action srp_timed_out(struct scsi_cmnd *scmd) { struct scsi_device *sdev = scmd->device; struct Scsi_Host *shost = sdev->host; struct srp_internal *i = to_srp_internal(shost->transportt); struct srp_rport *rport = shost_to_rport(shost); pr_debug("timeout for sdev %s\n", dev_name(&sdev->sdev_gendev)); return rport && rport->fast_io_fail_tmo < 0 && rport->dev_loss_tmo < 0 && i->f->reset_timer_if_blocked && scsi_device_blocked(sdev) ? SCSI_EH_RESET_TIMER : SCSI_EH_NOT_HANDLED; } EXPORT_SYMBOL(srp_timed_out); static void srp_rport_release(struct device *dev) { struct srp_rport *rport = dev_to_rport(dev); put_device(dev->parent); kfree(rport); } static int scsi_is_srp_rport(const struct device *dev) { return dev->release == srp_rport_release; } static int srp_rport_match(struct attribute_container *cont, struct device *dev) { struct Scsi_Host *shost; struct srp_internal *i; if (!scsi_is_srp_rport(dev)) return 0; shost = dev_to_shost(dev->parent); if (!shost->transportt) return 0; if (shost->transportt->host_attrs.ac.class != &srp_host_class.class) return 0; i = to_srp_internal(shost->transportt); return &i->rport_attr_cont.ac == cont; } static int srp_host_match(struct attribute_container *cont, struct device *dev) { struct Scsi_Host *shost; struct srp_internal *i; if (!scsi_is_host_device(dev)) return 0; shost = dev_to_shost(dev); if (!shost->transportt) return 0; if (shost->transportt->host_attrs.ac.class != &srp_host_class.class) return 0; i = to_srp_internal(shost->transportt); return &i->t.host_attrs.ac == cont; } /** * srp_rport_get() - increment rport reference count * @rport: SRP target port. */ void srp_rport_get(struct srp_rport *rport) { get_device(&rport->dev); } EXPORT_SYMBOL(srp_rport_get); /** * srp_rport_put() - decrement rport reference count * @rport: SRP target port. */ void srp_rport_put(struct srp_rport *rport) { put_device(&rport->dev); } EXPORT_SYMBOL(srp_rport_put); /** * srp_rport_add - add a SRP remote port to the device hierarchy * @shost: scsi host the remote port is connected to. * @ids: The port id for the remote port. * * Publishes a port to the rest of the system. */ struct srp_rport *srp_rport_add(struct Scsi_Host *shost, struct srp_rport_identifiers *ids) { struct srp_rport *rport; struct device *parent = &shost->shost_gendev; struct srp_internal *i = to_srp_internal(shost->transportt); int id, ret; rport = kzalloc(sizeof(*rport), GFP_KERNEL); if (!rport) return ERR_PTR(-ENOMEM); mutex_init(&rport->mutex); device_initialize(&rport->dev); rport->dev.parent = get_device(parent); rport->dev.release = srp_rport_release; memcpy(rport->port_id, ids->port_id, sizeof(rport->port_id)); rport->roles = ids->roles; if (i->f->reconnect) rport->reconnect_delay = i->f->reconnect_delay ? *i->f->reconnect_delay : 10; INIT_DELAYED_WORK(&rport->reconnect_work, srp_reconnect_work); rport->fast_io_fail_tmo = i->f->fast_io_fail_tmo ? *i->f->fast_io_fail_tmo : 15; rport->dev_loss_tmo = i->f->dev_loss_tmo ? *i->f->dev_loss_tmo : 60; INIT_DELAYED_WORK(&rport->fast_io_fail_work, rport_fast_io_fail_timedout); INIT_DELAYED_WORK(&rport->dev_loss_work, rport_dev_loss_timedout); id = atomic_inc_return(&to_srp_host_attrs(shost)->next_port_id); dev_set_name(&rport->dev, "port-%d:%d", shost->host_no, id); transport_setup_device(&rport->dev); ret = device_add(&rport->dev); if (ret) { transport_destroy_device(&rport->dev); put_device(&rport->dev); return ERR_PTR(ret); } transport_add_device(&rport->dev); transport_configure_device(&rport->dev); return rport; } EXPORT_SYMBOL_GPL(srp_rport_add); /** * srp_rport_del - remove a SRP remote port * @rport: SRP remote port to remove * * Removes the specified SRP remote port. */ void srp_rport_del(struct srp_rport *rport) { struct device *dev = &rport->dev; transport_remove_device(dev); device_del(dev); transport_destroy_device(dev); put_device(dev); } EXPORT_SYMBOL_GPL(srp_rport_del); static int do_srp_rport_del(struct device *dev, void *data) { if (scsi_is_srp_rport(dev)) srp_rport_del(dev_to_rport(dev)); return 0; } /** * srp_remove_host - tear down a Scsi_Host's SRP data structures * @shost: Scsi Host that is torn down * * Removes all SRP remote ports for a given Scsi_Host. * Must be called just before scsi_remove_host for SRP HBAs. */ void srp_remove_host(struct Scsi_Host *shost) { device_for_each_child(&shost->shost_gendev, NULL, do_srp_rport_del); } EXPORT_SYMBOL_GPL(srp_remove_host); /** * srp_stop_rport_timers - stop the transport layer recovery timers * @rport: SRP remote port for which to stop the timers. * * Must be called after srp_remove_host() and scsi_remove_host(). The caller * must hold a reference on the rport (rport->dev) and on the SCSI host * (rport->dev.parent). */ void srp_stop_rport_timers(struct srp_rport *rport) { mutex_lock(&rport->mutex); if (rport->state == SRP_RPORT_BLOCKED) __rport_fail_io_fast(rport); srp_rport_set_state(rport, SRP_RPORT_LOST); mutex_unlock(&rport->mutex); cancel_delayed_work_sync(&rport->reconnect_work); cancel_delayed_work_sync(&rport->fast_io_fail_work); cancel_delayed_work_sync(&rport->dev_loss_work); } EXPORT_SYMBOL_GPL(srp_stop_rport_timers); /** * srp_attach_transport - instantiate SRP transport template * @ft: SRP transport class function template */ struct scsi_transport_template * srp_attach_transport(struct srp_function_template *ft) { int count; struct srp_internal *i; i = kzalloc(sizeof(*i), GFP_KERNEL); if (!i) return NULL; i->t.host_size = sizeof(struct srp_host_attrs); i->t.host_attrs.ac.attrs = &i->host_attrs[0]; i->t.host_attrs.ac.class = &srp_host_class.class; i->t.host_attrs.ac.match = srp_host_match; i->host_attrs[0] = NULL; transport_container_register(&i->t.host_attrs); i->rport_attr_cont.ac.attrs = &i->rport_attrs[0]; i->rport_attr_cont.ac.class = &srp_rport_class.class; i->rport_attr_cont.ac.match = srp_rport_match; count = 0; i->rport_attrs[count++] = &dev_attr_port_id; i->rport_attrs[count++] = &dev_attr_roles; if (ft->has_rport_state) { i->rport_attrs[count++] = &dev_attr_state; i->rport_attrs[count++] = &dev_attr_fast_io_fail_tmo; i->rport_attrs[count++] = &dev_attr_dev_loss_tmo; } if (ft->reconnect) { i->rport_attrs[count++] = &dev_attr_reconnect_delay; i->rport_attrs[count++] = &dev_attr_failed_reconnects; } if (ft->rport_delete) i->rport_attrs[count++] = &dev_attr_delete; i->rport_attrs[count++] = NULL; BUG_ON(count > ARRAY_SIZE(i->rport_attrs)); transport_container_register(&i->rport_attr_cont); i->f = ft; return &i->t; } EXPORT_SYMBOL_GPL(srp_attach_transport); /** * srp_release_transport - release SRP transport template instance * @t: transport template instance */ void srp_release_transport(struct scsi_transport_template *t) { struct srp_internal *i = to_srp_internal(t); transport_container_unregister(&i->t.host_attrs); transport_container_unregister(&i->rport_attr_cont); kfree(i); } EXPORT_SYMBOL_GPL(srp_release_transport); static __init int srp_transport_init(void) { int ret; ret = transport_class_register(&srp_host_class); if (ret) return ret; ret = transport_class_register(&srp_rport_class); if (ret) goto unregister_host_class; return 0; unregister_host_class: transport_class_unregister(&srp_host_class); return ret; } static void __exit srp_transport_exit(void) { transport_class_unregister(&srp_host_class); transport_class_unregister(&srp_rport_class); } MODULE_AUTHOR("FUJITA Tomonori"); MODULE_DESCRIPTION("SRP Transport Attributes"); MODULE_LICENSE("GPL"); module_init(srp_transport_init); module_exit(srp_transport_exit);
222 23 239 3 18 18 16 2 157 135 134 137 8 37 144 145 37 20 7 82 1 1 135 135 135 3 133 135 137 5 121 13 4 26 119 4 116 147 147 39 110 148 147 148 113 36 92 92 91 92 29 64 91 7 2 1 1 5 2 7 1 1 2 1 2 2 9 2 7 2 9 2 263 246 17 4 260 260 260 262 1 107 25 25 295 23 16 10 9 1 10 273 287 2 332 3 3 11 1 1 84 324 26 26 5 13 139 125 5 91 90 90 91 3 88 3 102 99 90 3 268 80 294 11 157 6 138 400 1 2 376 1 21 332 396 2 2 2 394 107 268 374 76 298 254 11 39 108 68 176 394 391 6 164 244 2 146 146 244 396 92 92 92 102 102 3 99 3 4 1 2 2 4 2 2 2 1 4 3 2 1 1 5 5 4 5 1 4 23 16 6 1 9 7 7 4 5 4 13 3 139 3 382 244 139 139 407 410 1 3 383 16 7 2 2 1 17 1 1 5 2 6 5 2 5 2 5 2 7 1 6 6 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/act_api.c Packet action API. * * Author: Jamal Hadi Salim */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/kmod.h> #include <linux/err.h> #include <linux/module.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/sch_generic.h> #include <net/pkt_cls.h> #include <net/tc_act/tc_pedit.h> #include <net/act_api.h> #include <net/netlink.h> #include <net/flow_offload.h> #include <net/tc_wrapper.h> #ifdef CONFIG_INET DEFINE_STATIC_KEY_FALSE(tcf_frag_xmit_count); EXPORT_SYMBOL_GPL(tcf_frag_xmit_count); #endif int tcf_dev_queue_xmit(struct sk_buff *skb, int (*xmit)(struct sk_buff *skb)) { #ifdef CONFIG_INET if (static_branch_unlikely(&tcf_frag_xmit_count)) return sch_frag_xmit_hook(skb, xmit); #endif return xmit(skb); } EXPORT_SYMBOL_GPL(tcf_dev_queue_xmit); static void tcf_action_goto_chain_exec(const struct tc_action *a, struct tcf_result *res) { const struct tcf_chain *chain = rcu_dereference_bh(a->goto_chain); res->goto_tp = rcu_dereference_bh(chain->filter_chain); } static void tcf_free_cookie_rcu(struct rcu_head *p) { struct tc_cookie *cookie = container_of(p, struct tc_cookie, rcu); kfree(cookie->data); kfree(cookie); } static void tcf_set_action_cookie(struct tc_cookie __rcu **old_cookie, struct tc_cookie *new_cookie) { struct tc_cookie *old; old = xchg((__force struct tc_cookie **)old_cookie, new_cookie); if (old) call_rcu(&old->rcu, tcf_free_cookie_rcu); } int tcf_action_check_ctrlact(int action, struct tcf_proto *tp, struct tcf_chain **newchain, struct netlink_ext_ack *extack) { int opcode = TC_ACT_EXT_OPCODE(action), ret = -EINVAL; u32 chain_index; if (!opcode) ret = action > TC_ACT_VALUE_MAX ? -EINVAL : 0; else if (opcode <= TC_ACT_EXT_OPCODE_MAX || action == TC_ACT_UNSPEC) ret = 0; if (ret) { NL_SET_ERR_MSG(extack, "invalid control action"); goto end; } if (TC_ACT_EXT_CMP(action, TC_ACT_GOTO_CHAIN)) { chain_index = action & TC_ACT_EXT_VAL_MASK; if (!tp || !newchain) { ret = -EINVAL; NL_SET_ERR_MSG(extack, "can't goto NULL proto/chain"); goto end; } *newchain = tcf_chain_get_by_act(tp->chain->block, chain_index); if (!*newchain) { ret = -ENOMEM; NL_SET_ERR_MSG(extack, "can't allocate goto_chain"); } } end: return ret; } EXPORT_SYMBOL(tcf_action_check_ctrlact); struct tcf_chain *tcf_action_set_ctrlact(struct tc_action *a, int action, struct tcf_chain *goto_chain) { a->tcfa_action = action; goto_chain = rcu_replace_pointer(a->goto_chain, goto_chain, 1); return goto_chain; } EXPORT_SYMBOL(tcf_action_set_ctrlact); /* XXX: For standalone actions, we don't need a RCU grace period either, because * actions are always connected to filters and filters are already destroyed in * RCU callbacks, so after a RCU grace period actions are already disconnected * from filters. Readers later can not find us. */ static void free_tcf(struct tc_action *p) { struct tcf_chain *chain = rcu_dereference_protected(p->goto_chain, 1); free_percpu(p->cpu_bstats); free_percpu(p->cpu_bstats_hw); free_percpu(p->cpu_qstats); tcf_set_action_cookie(&p->user_cookie, NULL); if (chain) tcf_chain_put_by_act(chain); kfree(p); } static void offload_action_hw_count_set(struct tc_action *act, u32 hw_count) { act->in_hw_count = hw_count; } static void offload_action_hw_count_inc(struct tc_action *act, u32 hw_count) { act->in_hw_count += hw_count; } static void offload_action_hw_count_dec(struct tc_action *act, u32 hw_count) { act->in_hw_count = act->in_hw_count > hw_count ? act->in_hw_count - hw_count : 0; } static unsigned int tcf_offload_act_num_actions_single(struct tc_action *act) { if (is_tcf_pedit(act)) return tcf_pedit_nkeys(act); else return 1; } static bool tc_act_skip_hw(u32 flags) { return (flags & TCA_ACT_FLAGS_SKIP_HW) ? true : false; } static bool tc_act_skip_sw(u32 flags) { return (flags & TCA_ACT_FLAGS_SKIP_SW) ? true : false; } /* SKIP_HW and SKIP_SW are mutually exclusive flags. */ static bool tc_act_flags_valid(u32 flags) { flags &= TCA_ACT_FLAGS_SKIP_HW | TCA_ACT_FLAGS_SKIP_SW; return flags ^ (TCA_ACT_FLAGS_SKIP_HW | TCA_ACT_FLAGS_SKIP_SW); } static int offload_action_init(struct flow_offload_action *fl_action, struct tc_action *act, enum offload_act_command cmd, struct netlink_ext_ack *extack) { int err; fl_action->extack = extack; fl_action->command = cmd; fl_action->index = act->tcfa_index; fl_action->cookie = (unsigned long)act; if (act->ops->offload_act_setup) { spin_lock_bh(&act->tcfa_lock); err = act->ops->offload_act_setup(act, fl_action, NULL, false, extack); spin_unlock_bh(&act->tcfa_lock); return err; } return -EOPNOTSUPP; } static int tcf_action_offload_cmd_ex(struct flow_offload_action *fl_act, u32 *hw_count) { int err; err = flow_indr_dev_setup_offload(NULL, NULL, TC_SETUP_ACT, fl_act, NULL, NULL); if (err < 0) return err; if (hw_count) *hw_count = err; return 0; } static int tcf_action_offload_cmd_cb_ex(struct flow_offload_action *fl_act, u32 *hw_count, flow_indr_block_bind_cb_t *cb, void *cb_priv) { int err; err = cb(NULL, NULL, cb_priv, TC_SETUP_ACT, NULL, fl_act, NULL); if (err < 0) return err; if (hw_count) *hw_count = 1; return 0; } static int tcf_action_offload_cmd(struct flow_offload_action *fl_act, u32 *hw_count, flow_indr_block_bind_cb_t *cb, void *cb_priv) { return cb ? tcf_action_offload_cmd_cb_ex(fl_act, hw_count, cb, cb_priv) : tcf_action_offload_cmd_ex(fl_act, hw_count); } static int tcf_action_offload_add_ex(struct tc_action *action, struct netlink_ext_ack *extack, flow_indr_block_bind_cb_t *cb, void *cb_priv) { bool skip_sw = tc_act_skip_sw(action->tcfa_flags); struct tc_action *actions[TCA_ACT_MAX_PRIO] = { [0] = action, }; struct flow_offload_action *fl_action; u32 in_hw_count = 0; int num, err = 0; if (tc_act_skip_hw(action->tcfa_flags)) return 0; num = tcf_offload_act_num_actions_single(action); fl_action = offload_action_alloc(num); if (!fl_action) return -ENOMEM; err = offload_action_init(fl_action, action, FLOW_ACT_REPLACE, extack); if (err) goto fl_err; err = tc_setup_action(&fl_action->action, actions, 0, extack); if (err) { NL_SET_ERR_MSG_MOD(extack, "Failed to setup tc actions for offload"); goto fl_err; } err = tcf_action_offload_cmd(fl_action, &in_hw_count, cb, cb_priv); if (!err) cb ? offload_action_hw_count_inc(action, in_hw_count) : offload_action_hw_count_set(action, in_hw_count); if (skip_sw && !tc_act_in_hw(action)) err = -EINVAL; tc_cleanup_offload_action(&fl_action->action); fl_err: kfree(fl_action); return err; } /* offload the tc action after it is inserted */ static int tcf_action_offload_add(struct tc_action *action, struct netlink_ext_ack *extack) { return tcf_action_offload_add_ex(action, extack, NULL, NULL); } int tcf_action_update_hw_stats(struct tc_action *action) { struct flow_offload_action fl_act = {}; int err; err = offload_action_init(&fl_act, action, FLOW_ACT_STATS, NULL); if (err) return err; err = tcf_action_offload_cmd(&fl_act, NULL, NULL, NULL); if (!err) { preempt_disable(); tcf_action_stats_update(action, fl_act.stats.bytes, fl_act.stats.pkts, fl_act.stats.drops, fl_act.stats.lastused, true); preempt_enable(); action->used_hw_stats = fl_act.stats.used_hw_stats; action->used_hw_stats_valid = true; } else { return -EOPNOTSUPP; } return 0; } EXPORT_SYMBOL(tcf_action_update_hw_stats); static int tcf_action_offload_del_ex(struct tc_action *action, flow_indr_block_bind_cb_t *cb, void *cb_priv) { struct flow_offload_action fl_act = {}; u32 in_hw_count = 0; int err = 0; if (!tc_act_in_hw(action)) return 0; err = offload_action_init(&fl_act, action, FLOW_ACT_DESTROY, NULL); if (err) return err; err = tcf_action_offload_cmd(&fl_act, &in_hw_count, cb, cb_priv); if (err < 0) return err; if (!cb && action->in_hw_count != in_hw_count) return -EINVAL; /* do not need to update hw state when deleting action */ if (cb && in_hw_count) offload_action_hw_count_dec(action, in_hw_count); return 0; } static int tcf_action_offload_del(struct tc_action *action) { return tcf_action_offload_del_ex(action, NULL, NULL); } static void tcf_action_cleanup(struct tc_action *p) { tcf_action_offload_del(p); if (p->ops->cleanup) p->ops->cleanup(p); gen_kill_estimator(&p->tcfa_rate_est); free_tcf(p); } static int __tcf_action_put(struct tc_action *p, bool bind) { struct tcf_idrinfo *idrinfo = p->idrinfo; if (refcount_dec_and_mutex_lock(&p->tcfa_refcnt, &idrinfo->lock)) { if (bind) atomic_dec(&p->tcfa_bindcnt); idr_remove(&idrinfo->action_idr, p->tcfa_index); mutex_unlock(&idrinfo->lock); tcf_action_cleanup(p); return 1; } if (bind) atomic_dec(&p->tcfa_bindcnt); return 0; } static int __tcf_idr_release(struct tc_action *p, bool bind, bool strict) { int ret = 0; /* Release with strict==1 and bind==0 is only called through act API * interface (classifiers always bind). Only case when action with * positive reference count and zero bind count can exist is when it was * also created with act API (unbinding last classifier will destroy the * action if it was created by classifier). So only case when bind count * can be changed after initial check is when unbound action is * destroyed by act API while classifier binds to action with same id * concurrently. This result either creation of new action(same behavior * as before), or reusing existing action if concurrent process * increments reference count before action is deleted. Both scenarios * are acceptable. */ if (p) { if (!bind && strict && atomic_read(&p->tcfa_bindcnt) > 0) return -EPERM; if (__tcf_action_put(p, bind)) ret = ACT_P_DELETED; } return ret; } int tcf_idr_release(struct tc_action *a, bool bind) { const struct tc_action_ops *ops = a->ops; int ret; ret = __tcf_idr_release(a, bind, false); if (ret == ACT_P_DELETED) module_put(ops->owner); return ret; } EXPORT_SYMBOL(tcf_idr_release); static size_t tcf_action_shared_attrs_size(const struct tc_action *act) { struct tc_cookie *user_cookie; u32 cookie_len = 0; rcu_read_lock(); user_cookie = rcu_dereference(act->user_cookie); if (user_cookie) cookie_len = nla_total_size(user_cookie->len); rcu_read_unlock(); return nla_total_size(0) /* action number nested */ + nla_total_size(IFNAMSIZ) /* TCA_ACT_KIND */ + cookie_len /* TCA_ACT_COOKIE */ + nla_total_size(sizeof(struct nla_bitfield32)) /* TCA_ACT_HW_STATS */ + nla_total_size(0) /* TCA_ACT_STATS nested */ + nla_total_size(sizeof(struct nla_bitfield32)) /* TCA_ACT_FLAGS */ /* TCA_STATS_BASIC */ + nla_total_size_64bit(sizeof(struct gnet_stats_basic)) /* TCA_STATS_PKT64 */ + nla_total_size_64bit(sizeof(u64)) /* TCA_STATS_QUEUE */ + nla_total_size_64bit(sizeof(struct gnet_stats_queue)) + nla_total_size(0) /* TCA_ACT_OPTIONS nested */ + nla_total_size(sizeof(struct tcf_t)); /* TCA_GACT_TM */ } static size_t tcf_action_full_attrs_size(size_t sz) { return NLMSG_HDRLEN /* struct nlmsghdr */ + sizeof(struct tcamsg) + nla_total_size(0) /* TCA_ACT_TAB nested */ + sz; } static size_t tcf_action_fill_size(const struct tc_action *act) { size_t sz = tcf_action_shared_attrs_size(act); if (act->ops->get_fill_size) return act->ops->get_fill_size(act) + sz; return sz; } static int tcf_action_dump_terse(struct sk_buff *skb, struct tc_action *a, bool from_act) { unsigned char *b = skb_tail_pointer(skb); struct tc_cookie *cookie; if (nla_put_string(skb, TCA_ACT_KIND, a->ops->kind)) goto nla_put_failure; if (tcf_action_copy_stats(skb, a, 0)) goto nla_put_failure; if (from_act && nla_put_u32(skb, TCA_ACT_INDEX, a->tcfa_index)) goto nla_put_failure; rcu_read_lock(); cookie = rcu_dereference(a->user_cookie); if (cookie) { if (nla_put(skb, TCA_ACT_COOKIE, cookie->len, cookie->data)) { rcu_read_unlock(); goto nla_put_failure; } } rcu_read_unlock(); return 0; nla_put_failure: nlmsg_trim(skb, b); return -1; } static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, struct netlink_callback *cb) { int err = 0, index = -1, s_i = 0, n_i = 0; u32 act_flags = cb->args[2]; unsigned long jiffy_since = cb->args[3]; struct nlattr *nest; struct idr *idr = &idrinfo->action_idr; struct tc_action *p; unsigned long id = 1; unsigned long tmp; mutex_lock(&idrinfo->lock); s_i = cb->args[0]; idr_for_each_entry_ul(idr, p, tmp, id) { index++; if (index < s_i) continue; if (IS_ERR(p)) continue; if (jiffy_since && time_after(jiffy_since, (unsigned long)p->tcfa_tm.lastuse)) continue; tcf_action_update_hw_stats(p); nest = nla_nest_start_noflag(skb, n_i); if (!nest) { index--; goto nla_put_failure; } err = (act_flags & TCA_ACT_FLAG_TERSE_DUMP) ? tcf_action_dump_terse(skb, p, true) : tcf_action_dump_1(skb, p, 0, 0); if (err < 0) { index--; nlmsg_trim(skb, nest); goto done; } nla_nest_end(skb, nest); n_i++; if (!(act_flags & TCA_ACT_FLAG_LARGE_DUMP_ON) && n_i >= TCA_ACT_MAX_PRIO) goto done; } done: if (index >= 0) cb->args[0] = index + 1; mutex_unlock(&idrinfo->lock); if (n_i) { if (act_flags & TCA_ACT_FLAG_LARGE_DUMP_ON) cb->args[1] = n_i; } return n_i; nla_put_failure: nla_nest_cancel(skb, nest); goto done; } static int tcf_idr_release_unsafe(struct tc_action *p) { if (atomic_read(&p->tcfa_bindcnt) > 0) return -EPERM; if (refcount_dec_and_test(&p->tcfa_refcnt)) { idr_remove(&p->idrinfo->action_idr, p->tcfa_index); tcf_action_cleanup(p); return ACT_P_DELETED; } return 0; } static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, const struct tc_action_ops *ops, struct netlink_ext_ack *extack) { struct nlattr *nest; int n_i = 0; int ret = -EINVAL; struct idr *idr = &idrinfo->action_idr; struct tc_action *p; unsigned long id = 1; unsigned long tmp; nest = nla_nest_start_noflag(skb, 0); if (nest == NULL) goto nla_put_failure; if (nla_put_string(skb, TCA_ACT_KIND, ops->kind)) goto nla_put_failure; ret = 0; mutex_lock(&idrinfo->lock); idr_for_each_entry_ul(idr, p, tmp, id) { if (IS_ERR(p)) continue; ret = tcf_idr_release_unsafe(p); if (ret == ACT_P_DELETED) module_put(ops->owner); else if (ret < 0) break; n_i++; } mutex_unlock(&idrinfo->lock); if (ret < 0) { if (n_i) NL_SET_ERR_MSG(extack, "Unable to flush all TC actions"); else goto nla_put_failure; } ret = nla_put_u32(skb, TCA_FCNT, n_i); if (ret) goto nla_put_failure; nla_nest_end(skb, nest); return n_i; nla_put_failure: nla_nest_cancel(skb, nest); return ret; } int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, struct netlink_callback *cb, int type, const struct tc_action_ops *ops, struct netlink_ext_ack *extack) { struct tcf_idrinfo *idrinfo = tn->idrinfo; if (type == RTM_DELACTION) { return tcf_del_walker(idrinfo, skb, ops, extack); } else if (type == RTM_GETACTION) { return tcf_dump_walker(idrinfo, skb, cb); } else { WARN(1, "tcf_generic_walker: unknown command %d\n", type); NL_SET_ERR_MSG(extack, "tcf_generic_walker: unknown command"); return -EINVAL; } } EXPORT_SYMBOL(tcf_generic_walker); int tcf_idr_search(struct tc_action_net *tn, struct tc_action **a, u32 index) { struct tcf_idrinfo *idrinfo = tn->idrinfo; struct tc_action *p; mutex_lock(&idrinfo->lock); p = idr_find(&idrinfo->action_idr, index); if (IS_ERR(p)) p = NULL; else if (p) refcount_inc(&p->tcfa_refcnt); mutex_unlock(&idrinfo->lock); if (p) { *a = p; return true; } return false; } EXPORT_SYMBOL(tcf_idr_search); static int __tcf_generic_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, const struct tc_action_ops *ops, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ops->net_id); if (unlikely(ops->walk)) return ops->walk(net, skb, cb, type, ops, extack); return tcf_generic_walker(tn, skb, cb, type, ops, extack); } static int __tcf_idr_search(struct net *net, const struct tc_action_ops *ops, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, ops->net_id); if (unlikely(ops->lookup)) return ops->lookup(net, a, index); return tcf_idr_search(tn, a, index); } static int tcf_idr_delete_index(struct tcf_idrinfo *idrinfo, u32 index) { struct tc_action *p; int ret = 0; mutex_lock(&idrinfo->lock); p = idr_find(&idrinfo->action_idr, index); if (!p) { mutex_unlock(&idrinfo->lock); return -ENOENT; } if (!atomic_read(&p->tcfa_bindcnt)) { if (refcount_dec_and_test(&p->tcfa_refcnt)) { struct module *owner = p->ops->owner; WARN_ON(p != idr_remove(&idrinfo->action_idr, p->tcfa_index)); mutex_unlock(&idrinfo->lock); tcf_action_cleanup(p); module_put(owner); return 0; } ret = 0; } else { ret = -EPERM; } mutex_unlock(&idrinfo->lock); return ret; } int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, int bind, bool cpustats, u32 flags) { struct tc_action *p = kzalloc(ops->size, GFP_KERNEL); struct tcf_idrinfo *idrinfo = tn->idrinfo; int err = -ENOMEM; if (unlikely(!p)) return -ENOMEM; refcount_set(&p->tcfa_refcnt, 1); if (bind) atomic_set(&p->tcfa_bindcnt, 1); if (cpustats) { p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync); if (!p->cpu_bstats) goto err1; p->cpu_bstats_hw = netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync); if (!p->cpu_bstats_hw) goto err2; p->cpu_qstats = alloc_percpu(struct gnet_stats_queue); if (!p->cpu_qstats) goto err3; } gnet_stats_basic_sync_init(&p->tcfa_bstats); gnet_stats_basic_sync_init(&p->tcfa_bstats_hw); spin_lock_init(&p->tcfa_lock); p->tcfa_index = index; p->tcfa_tm.install = jiffies; p->tcfa_tm.lastuse = jiffies; p->tcfa_tm.firstuse = 0; p->tcfa_flags = flags; if (est) { err = gen_new_estimator(&p->tcfa_bstats, p->cpu_bstats, &p->tcfa_rate_est, &p->tcfa_lock, false, est); if (err) goto err4; } p->idrinfo = idrinfo; __module_get(ops->owner); p->ops = ops; *a = p; return 0; err4: free_percpu(p->cpu_qstats); err3: free_percpu(p->cpu_bstats_hw); err2: free_percpu(p->cpu_bstats); err1: kfree(p); return err; } EXPORT_SYMBOL(tcf_idr_create); int tcf_idr_create_from_flags(struct tc_action_net *tn, u32 index, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, int bind, u32 flags) { /* Set cpustats according to actions flags. */ return tcf_idr_create(tn, index, est, a, ops, bind, !(flags & TCA_ACT_FLAGS_NO_PERCPU_STATS), flags); } EXPORT_SYMBOL(tcf_idr_create_from_flags); /* Cleanup idr index that was allocated but not initialized. */ void tcf_idr_cleanup(struct tc_action_net *tn, u32 index) { struct tcf_idrinfo *idrinfo = tn->idrinfo; mutex_lock(&idrinfo->lock); /* Remove ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */ WARN_ON(!IS_ERR(idr_remove(&idrinfo->action_idr, index))); mutex_unlock(&idrinfo->lock); } EXPORT_SYMBOL(tcf_idr_cleanup); /* Check if action with specified index exists. If actions is found, increments * its reference and bind counters, and return 1. Otherwise insert temporary * error pointer (to prevent concurrent users from inserting actions with same * index) and return 0. * * May return -EAGAIN for binding actions in case of a parallel add/delete on * the requested index. */ int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index, struct tc_action **a, int bind) { struct tcf_idrinfo *idrinfo = tn->idrinfo; struct tc_action *p; int ret; u32 max; if (*index) { again: rcu_read_lock(); p = idr_find(&idrinfo->action_idr, *index); if (IS_ERR(p)) { /* This means that another process allocated * index but did not assign the pointer yet. */ rcu_read_unlock(); goto again; } if (!p) { /* Empty slot, try to allocate it */ max = *index; rcu_read_unlock(); goto new; } if (!refcount_inc_not_zero(&p->tcfa_refcnt)) { /* Action was deleted in parallel */ rcu_read_unlock(); return -EAGAIN; } if (bind) atomic_inc(&p->tcfa_bindcnt); *a = p; rcu_read_unlock(); return 1; } else { /* Find a slot */ *index = 1; max = UINT_MAX; } new: *a = NULL; mutex_lock(&idrinfo->lock); ret = idr_alloc_u32(&idrinfo->action_idr, ERR_PTR(-EBUSY), index, max, GFP_KERNEL); mutex_unlock(&idrinfo->lock); /* N binds raced for action allocation, * retry for all the ones that failed. */ if (ret == -ENOSPC && *index == max) ret = -EAGAIN; return ret; } EXPORT_SYMBOL(tcf_idr_check_alloc); void tcf_idrinfo_destroy(const struct tc_action_ops *ops, struct tcf_idrinfo *idrinfo) { struct idr *idr = &idrinfo->action_idr; struct tc_action *p; int ret; unsigned long id = 1; unsigned long tmp; idr_for_each_entry_ul(idr, p, tmp, id) { ret = __tcf_idr_release(p, false, true); if (ret == ACT_P_DELETED) module_put(ops->owner); else if (ret < 0) return; } idr_destroy(&idrinfo->action_idr); } EXPORT_SYMBOL(tcf_idrinfo_destroy); static LIST_HEAD(act_base); static DEFINE_RWLOCK(act_mod_lock); /* since act ops id is stored in pernet subsystem list, * then there is no way to walk through only all the action * subsystem, so we keep tc action pernet ops id for * reoffload to walk through. */ static LIST_HEAD(act_pernet_id_list); static DEFINE_MUTEX(act_id_mutex); struct tc_act_pernet_id { struct list_head list; unsigned int id; }; static int tcf_pernet_add_id_list(unsigned int id) { struct tc_act_pernet_id *id_ptr; int ret = 0; mutex_lock(&act_id_mutex); list_for_each_entry(id_ptr, &act_pernet_id_list, list) { if (id_ptr->id == id) { ret = -EEXIST; goto err_out; } } id_ptr = kzalloc(sizeof(*id_ptr), GFP_KERNEL); if (!id_ptr) { ret = -ENOMEM; goto err_out; } id_ptr->id = id; list_add_tail(&id_ptr->list, &act_pernet_id_list); err_out: mutex_unlock(&act_id_mutex); return ret; } static void tcf_pernet_del_id_list(unsigned int id) { struct tc_act_pernet_id *id_ptr; mutex_lock(&act_id_mutex); list_for_each_entry(id_ptr, &act_pernet_id_list, list) { if (id_ptr->id == id) { list_del(&id_ptr->list); kfree(id_ptr); break; } } mutex_unlock(&act_id_mutex); } int tcf_register_action(struct tc_action_ops *act, struct pernet_operations *ops) { struct tc_action_ops *a; int ret; if (!act->act || !act->dump || !act->init) return -EINVAL; /* We have to register pernet ops before making the action ops visible, * otherwise tcf_action_init_1() could get a partially initialized * netns. */ ret = register_pernet_subsys(ops); if (ret) return ret; if (ops->id) { ret = tcf_pernet_add_id_list(*ops->id); if (ret) goto err_id; } write_lock(&act_mod_lock); list_for_each_entry(a, &act_base, head) { if (act->id == a->id || (strcmp(act->kind, a->kind) == 0)) { ret = -EEXIST; goto err_out; } } list_add_tail(&act->head, &act_base); write_unlock(&act_mod_lock); return 0; err_out: write_unlock(&act_mod_lock); if (ops->id) tcf_pernet_del_id_list(*ops->id); err_id: unregister_pernet_subsys(ops); return ret; } EXPORT_SYMBOL(tcf_register_action); int tcf_unregister_action(struct tc_action_ops *act, struct pernet_operations *ops) { struct tc_action_ops *a; int err = -ENOENT; write_lock(&act_mod_lock); list_for_each_entry(a, &act_base, head) { if (a == act) { list_del(&act->head); err = 0; break; } } write_unlock(&act_mod_lock); if (!err) { unregister_pernet_subsys(ops); if (ops->id) tcf_pernet_del_id_list(*ops->id); } return err; } EXPORT_SYMBOL(tcf_unregister_action); /* lookup by name */ static struct tc_action_ops *tc_lookup_action_n(char *kind) { struct tc_action_ops *a, *res = NULL; if (kind) { read_lock(&act_mod_lock); list_for_each_entry(a, &act_base, head) { if (strcmp(kind, a->kind) == 0) { if (try_module_get(a->owner)) res = a; break; } } read_unlock(&act_mod_lock); } return res; } /* lookup by nlattr */ static struct tc_action_ops *tc_lookup_action(struct nlattr *kind) { struct tc_action_ops *a, *res = NULL; if (kind) { read_lock(&act_mod_lock); list_for_each_entry(a, &act_base, head) { if (nla_strcmp(kind, a->kind) == 0) { if (try_module_get(a->owner)) res = a; break; } } read_unlock(&act_mod_lock); } return res; } /*TCA_ACT_MAX_PRIO is 32, there count up to 32 */ #define TCA_ACT_MAX_PRIO_MASK 0x1FF int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions, int nr_actions, struct tcf_result *res) { u32 jmp_prgcnt = 0; u32 jmp_ttl = TCA_ACT_MAX_PRIO; /*matches actions per filter */ int i; int ret = TC_ACT_OK; if (skb_skip_tc_classify(skb)) return TC_ACT_OK; restart_act_graph: for (i = 0; i < nr_actions; i++) { const struct tc_action *a = actions[i]; int repeat_ttl; if (jmp_prgcnt > 0) { jmp_prgcnt -= 1; continue; } if (tc_act_skip_sw(a->tcfa_flags)) continue; repeat_ttl = 32; repeat: ret = tc_act(skb, a, res); if (unlikely(ret == TC_ACT_REPEAT)) { if (--repeat_ttl != 0) goto repeat; /* suspicious opcode, stop pipeline */ net_warn_ratelimited("TC_ACT_REPEAT abuse ?\n"); return TC_ACT_OK; } if (TC_ACT_EXT_CMP(ret, TC_ACT_JUMP)) { jmp_prgcnt = ret & TCA_ACT_MAX_PRIO_MASK; if (!jmp_prgcnt || (jmp_prgcnt > nr_actions)) { /* faulty opcode, stop pipeline */ return TC_ACT_OK; } else { jmp_ttl -= 1; if (jmp_ttl > 0) goto restart_act_graph; else /* faulty graph, stop pipeline */ return TC_ACT_OK; } } else if (TC_ACT_EXT_CMP(ret, TC_ACT_GOTO_CHAIN)) { if (unlikely(!rcu_access_pointer(a->goto_chain))) { tcf_set_drop_reason(skb, SKB_DROP_REASON_TC_CHAIN_NOTFOUND); return TC_ACT_SHOT; } tcf_action_goto_chain_exec(a, res); } if (ret != TC_ACT_PIPE) break; } return ret; } EXPORT_SYMBOL(tcf_action_exec); int tcf_action_destroy(struct tc_action *actions[], int bind) { const struct tc_action_ops *ops; struct tc_action *a; int ret = 0, i; tcf_act_for_each_action(i, a, actions) { actions[i] = NULL; ops = a->ops; ret = __tcf_idr_release(a, bind, true); if (ret == ACT_P_DELETED) module_put(ops->owner); else if (ret < 0) return ret; } return ret; } static int tcf_action_put(struct tc_action *p) { return __tcf_action_put(p, false); } static void tcf_action_put_many(struct tc_action *actions[]) { struct tc_action *a; int i; tcf_act_for_each_action(i, a, actions) { const struct tc_action_ops *ops = a->ops; if (tcf_action_put(a)) module_put(ops->owner); } } static void tca_put_bound_many(struct tc_action *actions[], int init_res[]) { struct tc_action *a; int i; tcf_act_for_each_action(i, a, actions) { const struct tc_action_ops *ops = a->ops; if (init_res[i] == ACT_P_CREATED) continue; if (tcf_action_put(a)) module_put(ops->owner); } } int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { return a->ops->dump(skb, a, bind, ref); } int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { int err = -EINVAL; unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; u32 flags; if (tcf_action_dump_terse(skb, a, false)) goto nla_put_failure; if (a->hw_stats != TCA_ACT_HW_STATS_ANY && nla_put_bitfield32(skb, TCA_ACT_HW_STATS, a->hw_stats, TCA_ACT_HW_STATS_ANY)) goto nla_put_failure; if (a->used_hw_stats_valid && nla_put_bitfield32(skb, TCA_ACT_USED_HW_STATS, a->used_hw_stats, TCA_ACT_HW_STATS_ANY)) goto nla_put_failure; flags = a->tcfa_flags & TCA_ACT_FLAGS_USER_MASK; if (flags && nla_put_bitfield32(skb, TCA_ACT_FLAGS, flags, flags)) goto nla_put_failure; if (nla_put_u32(skb, TCA_ACT_IN_HW_COUNT, a->in_hw_count)) goto nla_put_failure; nest = nla_nest_start_noflag(skb, TCA_ACT_OPTIONS); if (nest == NULL) goto nla_put_failure; err = tcf_action_dump_old(skb, a, bind, ref); if (err > 0) { nla_nest_end(skb, nest); return err; } nla_put_failure: nlmsg_trim(skb, b); return -1; } EXPORT_SYMBOL(tcf_action_dump_1); int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind, int ref, bool terse) { struct tc_action *a; int err = -EINVAL, i; struct nlattr *nest; tcf_act_for_each_action(i, a, actions) { nest = nla_nest_start_noflag(skb, i + 1); if (nest == NULL) goto nla_put_failure; err = terse ? tcf_action_dump_terse(skb, a, false) : tcf_action_dump_1(skb, a, bind, ref); if (err < 0) goto errout; nla_nest_end(skb, nest); } return 0; nla_put_failure: err = -EINVAL; errout: nla_nest_cancel(skb, nest); return err; } static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb) { struct tc_cookie *c = kzalloc(sizeof(*c), GFP_KERNEL); if (!c) return NULL; c->data = nla_memdup(tb[TCA_ACT_COOKIE], GFP_KERNEL); if (!c->data) { kfree(c); return NULL; } c->len = nla_len(tb[TCA_ACT_COOKIE]); return c; } static u8 tcf_action_hw_stats_get(struct nlattr *hw_stats_attr) { struct nla_bitfield32 hw_stats_bf; /* If the user did not pass the attr, that means he does * not care about the type. Return "any" in that case * which is setting on all supported types. */ if (!hw_stats_attr) return TCA_ACT_HW_STATS_ANY; hw_stats_bf = nla_get_bitfield32(hw_stats_attr); return hw_stats_bf.value; } static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = { [TCA_ACT_KIND] = { .type = NLA_STRING }, [TCA_ACT_INDEX] = { .type = NLA_U32 }, [TCA_ACT_COOKIE] = { .type = NLA_BINARY, .len = TC_COOKIE_MAX_SIZE }, [TCA_ACT_OPTIONS] = { .type = NLA_NESTED }, [TCA_ACT_FLAGS] = NLA_POLICY_BITFIELD32(TCA_ACT_FLAGS_NO_PERCPU_STATS | TCA_ACT_FLAGS_SKIP_HW | TCA_ACT_FLAGS_SKIP_SW), [TCA_ACT_HW_STATS] = NLA_POLICY_BITFIELD32(TCA_ACT_HW_STATS_ANY), }; void tcf_idr_insert_many(struct tc_action *actions[], int init_res[]) { struct tc_action *a; int i; tcf_act_for_each_action(i, a, actions) { struct tcf_idrinfo *idrinfo; if (init_res[i] == ACT_P_BOUND) continue; idrinfo = a->idrinfo; mutex_lock(&idrinfo->lock); /* Replace ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */ idr_replace(&idrinfo->action_idr, a, a->tcfa_index); mutex_unlock(&idrinfo->lock); } } struct tc_action_ops *tc_action_load_ops(struct nlattr *nla, u32 flags, struct netlink_ext_ack *extack) { bool police = flags & TCA_ACT_FLAGS_POLICE; struct nlattr *tb[TCA_ACT_MAX + 1]; struct tc_action_ops *a_o; char act_name[IFNAMSIZ]; struct nlattr *kind; int err; if (!police) { err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, tcf_action_policy, extack); if (err < 0) return ERR_PTR(err); err = -EINVAL; kind = tb[TCA_ACT_KIND]; if (!kind) { NL_SET_ERR_MSG(extack, "TC action kind must be specified"); return ERR_PTR(err); } if (nla_strscpy(act_name, kind, IFNAMSIZ) < 0) { NL_SET_ERR_MSG(extack, "TC action name too long"); return ERR_PTR(err); } } else { if (strscpy(act_name, "police", IFNAMSIZ) < 0) { NL_SET_ERR_MSG(extack, "TC action name too long"); return ERR_PTR(-EINVAL); } } a_o = tc_lookup_action_n(act_name); if (a_o == NULL) { #ifdef CONFIG_MODULES bool rtnl_held = !(flags & TCA_ACT_FLAGS_NO_RTNL); if (rtnl_held) rtnl_unlock(); request_module(NET_ACT_ALIAS_PREFIX "%s", act_name); if (rtnl_held) rtnl_lock(); a_o = tc_lookup_action_n(act_name); /* We dropped the RTNL semaphore in order to * perform the module load. So, even if we * succeeded in loading the module we have to * tell the caller to replay the request. We * indicate this using -EAGAIN. */ if (a_o != NULL) { module_put(a_o->owner); return ERR_PTR(-EAGAIN); } #endif NL_SET_ERR_MSG(extack, "Failed to load TC action module"); return ERR_PTR(-ENOENT); } return a_o; } struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, struct tc_action_ops *a_o, int *init_res, u32 flags, struct netlink_ext_ack *extack) { bool police = flags & TCA_ACT_FLAGS_POLICE; struct nla_bitfield32 userflags = { 0, 0 }; struct tc_cookie *user_cookie = NULL; u8 hw_stats = TCA_ACT_HW_STATS_ANY; struct nlattr *tb[TCA_ACT_MAX + 1]; struct tc_action *a; int err; /* backward compatibility for policer */ if (!police) { err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, tcf_action_policy, extack); if (err < 0) return ERR_PTR(err); if (tb[TCA_ACT_COOKIE]) { user_cookie = nla_memdup_cookie(tb); if (!user_cookie) { NL_SET_ERR_MSG(extack, "No memory to generate TC cookie"); err = -ENOMEM; goto err_out; } } hw_stats = tcf_action_hw_stats_get(tb[TCA_ACT_HW_STATS]); if (tb[TCA_ACT_FLAGS]) { userflags = nla_get_bitfield32(tb[TCA_ACT_FLAGS]); if (!tc_act_flags_valid(userflags.value)) { err = -EINVAL; goto err_out; } } err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, tp, userflags.value | flags, extack); } else { err = a_o->init(net, nla, est, &a, tp, userflags.value | flags, extack); } if (err < 0) goto err_out; *init_res = err; if (!police && tb[TCA_ACT_COOKIE]) tcf_set_action_cookie(&a->user_cookie, user_cookie); if (!police) a->hw_stats = hw_stats; return a; err_out: if (user_cookie) { kfree(user_cookie->data); kfree(user_cookie); } return ERR_PTR(err); } static bool tc_act_bind(u32 flags) { return !!(flags & TCA_ACT_FLAGS_BIND); } /* Returns numbers of initialized actions or negative error. */ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, struct tc_action *actions[], int init_res[], size_t *attr_size, u32 flags, u32 fl_flags, struct netlink_ext_ack *extack) { struct tc_action_ops *ops[TCA_ACT_MAX_PRIO] = {}; struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *act; size_t sz = 0; int err; int i; err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX_PRIO, nla, NULL, extack); if (err < 0) return err; for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { struct tc_action_ops *a_o; a_o = tc_action_load_ops(tb[i], flags, extack); if (IS_ERR(a_o)) { err = PTR_ERR(a_o); goto err_mod; } ops[i - 1] = a_o; } for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { act = tcf_action_init_1(net, tp, tb[i], est, ops[i - 1], &init_res[i - 1], flags, extack); if (IS_ERR(act)) { err = PTR_ERR(act); goto err; } sz += tcf_action_fill_size(act); /* Start from index 0 */ actions[i - 1] = act; if (tc_act_bind(flags)) { bool skip_sw = tc_skip_sw(fl_flags); bool skip_hw = tc_skip_hw(fl_flags); if (tc_act_bind(act->tcfa_flags)) continue; if (skip_sw != tc_act_skip_sw(act->tcfa_flags) || skip_hw != tc_act_skip_hw(act->tcfa_flags)) { NL_SET_ERR_MSG(extack, "Mismatch between action and filter offload flags"); err = -EINVAL; goto err; } } else { err = tcf_action_offload_add(act, extack); if (tc_act_skip_sw(act->tcfa_flags) && err) goto err; } } /* We have to commit them all together, because if any error happened in * between, we could not handle the failure gracefully. */ tcf_idr_insert_many(actions, init_res); *attr_size = tcf_action_full_attrs_size(sz); err = i - 1; goto err_mod; err: tcf_action_destroy(actions, flags & TCA_ACT_FLAGS_BIND); err_mod: for (i = 0; i < TCA_ACT_MAX_PRIO && ops[i]; i++) module_put(ops[i]->owner); return err; } void tcf_action_update_stats(struct tc_action *a, u64 bytes, u64 packets, u64 drops, bool hw) { if (a->cpu_bstats) { _bstats_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); this_cpu_ptr(a->cpu_qstats)->drops += drops; if (hw) _bstats_update(this_cpu_ptr(a->cpu_bstats_hw), bytes, packets); return; } _bstats_update(&a->tcfa_bstats, bytes, packets); a->tcfa_qstats.drops += drops; if (hw) _bstats_update(&a->tcfa_bstats_hw, bytes, packets); } EXPORT_SYMBOL(tcf_action_update_stats); int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p, int compat_mode) { int err = 0; struct gnet_dump d; if (p == NULL) goto errout; /* compat_mode being true specifies a call that is supposed * to add additional backward compatibility statistic TLVs. */ if (compat_mode) { if (p->type == TCA_OLD_COMPAT) err = gnet_stats_start_copy_compat(skb, 0, TCA_STATS, TCA_XSTATS, &p->tcfa_lock, &d, TCA_PAD); else return 0; } else err = gnet_stats_start_copy(skb, TCA_ACT_STATS, &p->tcfa_lock, &d, TCA_ACT_PAD); if (err < 0) goto errout; if (gnet_stats_copy_basic(&d, p->cpu_bstats, &p->tcfa_bstats, false) < 0 || gnet_stats_copy_basic_hw(&d, p->cpu_bstats_hw, &p->tcfa_bstats_hw, false) < 0 || gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 || gnet_stats_copy_queue(&d, p->cpu_qstats, &p->tcfa_qstats, p->tcfa_qstats.qlen) < 0) goto errout; if (gnet_stats_finish_copy(&d) < 0) goto errout; return 0; errout: return -1; } static int tca_get_fill(struct sk_buff *skb, struct tc_action *actions[], u32 portid, u32 seq, u16 flags, int event, int bind, int ref, struct netlink_ext_ack *extack) { struct tcamsg *t; struct nlmsghdr *nlh; unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*t), flags); if (!nlh) goto out_nlmsg_trim; t = nlmsg_data(nlh); t->tca_family = AF_UNSPEC; t->tca__pad1 = 0; t->tca__pad2 = 0; if (extack && extack->_msg && nla_put_string(skb, TCA_ROOT_EXT_WARN_MSG, extack->_msg)) goto out_nlmsg_trim; nest = nla_nest_start_noflag(skb, TCA_ACT_TAB); if (!nest) goto out_nlmsg_trim; if (tcf_action_dump(skb, actions, bind, ref, false) < 0) goto out_nlmsg_trim; nla_nest_end(skb, nest); nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; out_nlmsg_trim: nlmsg_trim(skb, b); return -1; } static int tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, struct tc_action *actions[], int event, struct netlink_ext_ack *extack) { struct sk_buff *skb; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event, 0, 1, NULL) <= 0) { NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action"); kfree_skb(skb); return -EINVAL; } return rtnl_unicast(skb, net, portid); } static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla, struct nlmsghdr *n, u32 portid, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_ACT_MAX + 1]; const struct tc_action_ops *ops; struct tc_action *a; int index; int err; err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, tcf_action_policy, extack); if (err < 0) goto err_out; err = -EINVAL; if (tb[TCA_ACT_INDEX] == NULL || nla_len(tb[TCA_ACT_INDEX]) < sizeof(index)) { NL_SET_ERR_MSG(extack, "Invalid TC action index value"); goto err_out; } index = nla_get_u32(tb[TCA_ACT_INDEX]); err = -EINVAL; ops = tc_lookup_action(tb[TCA_ACT_KIND]); if (!ops) { /* could happen in batch of actions */ NL_SET_ERR_MSG(extack, "Specified TC action kind not found"); goto err_out; } err = -ENOENT; if (__tcf_idr_search(net, ops, &a, index) == 0) { NL_SET_ERR_MSG(extack, "TC action with specified index not found"); goto err_mod; } module_put(ops->owner); return a; err_mod: module_put(ops->owner); err_out: return ERR_PTR(err); } static int tca_action_flush(struct net *net, struct nlattr *nla, struct nlmsghdr *n, u32 portid, struct netlink_ext_ack *extack) { struct sk_buff *skb; unsigned char *b; struct nlmsghdr *nlh; struct tcamsg *t; struct netlink_callback dcb; struct nlattr *nest; struct nlattr *tb[TCA_ACT_MAX + 1]; const struct tc_action_ops *ops; struct nlattr *kind; int err = -ENOMEM; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return err; b = skb_tail_pointer(skb); err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, tcf_action_policy, extack); if (err < 0) goto err_out; err = -EINVAL; kind = tb[TCA_ACT_KIND]; ops = tc_lookup_action(kind); if (!ops) { /*some idjot trying to flush unknown action */ NL_SET_ERR_MSG(extack, "Cannot flush unknown TC action"); goto err_out; } nlh = nlmsg_put(skb, portid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t), 0); if (!nlh) { NL_SET_ERR_MSG(extack, "Failed to create TC action flush notification"); goto out_module_put; } t = nlmsg_data(nlh); t->tca_family = AF_UNSPEC; t->tca__pad1 = 0; t->tca__pad2 = 0; nest = nla_nest_start_noflag(skb, TCA_ACT_TAB); if (!nest) { NL_SET_ERR_MSG(extack, "Failed to add new netlink message"); goto out_module_put; } err = __tcf_generic_walker(net, skb, &dcb, RTM_DELACTION, ops, extack); if (err <= 0) { nla_nest_cancel(skb, nest); goto out_module_put; } nla_nest_end(skb, nest); nlh->nlmsg_len = skb_tail_pointer(skb) - b; nlh->nlmsg_flags |= NLM_F_ROOT; module_put(ops->owner); err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); if (err < 0) NL_SET_ERR_MSG(extack, "Failed to send TC action flush notification"); return err; out_module_put: module_put(ops->owner); err_out: kfree_skb(skb); return err; } static int tcf_action_delete(struct net *net, struct tc_action *actions[]) { struct tc_action *a; int i; tcf_act_for_each_action(i, a, actions) { const struct tc_action_ops *ops = a->ops; /* Actions can be deleted concurrently so we must save their * type and id to search again after reference is released. */ struct tcf_idrinfo *idrinfo = a->idrinfo; u32 act_index = a->tcfa_index; actions[i] = NULL; if (tcf_action_put(a)) { /* last reference, action was deleted concurrently */ module_put(ops->owner); } else { int ret; /* now do the delete */ ret = tcf_idr_delete_index(idrinfo, act_index); if (ret < 0) return ret; } } return 0; } static struct sk_buff *tcf_reoffload_del_notify_msg(struct net *net, struct tc_action *action) { size_t attr_size = tcf_action_fill_size(action); struct tc_action *actions[TCA_ACT_MAX_PRIO] = { [0] = action, }; struct sk_buff *skb; skb = alloc_skb(max(attr_size, NLMSG_GOODSIZE), GFP_KERNEL); if (!skb) return ERR_PTR(-ENOBUFS); if (tca_get_fill(skb, actions, 0, 0, 0, RTM_DELACTION, 0, 1, NULL) <= 0) { kfree_skb(skb); return ERR_PTR(-EINVAL); } return skb; } static int tcf_reoffload_del_notify(struct net *net, struct tc_action *action) { const struct tc_action_ops *ops = action->ops; struct sk_buff *skb; int ret; if (!rtnl_notify_needed(net, 0, RTNLGRP_TC)) { skb = NULL; } else { skb = tcf_reoffload_del_notify_msg(net, action); if (IS_ERR(skb)) return PTR_ERR(skb); } ret = tcf_idr_release_unsafe(action); if (ret == ACT_P_DELETED) { module_put(ops->owner); ret = rtnetlink_maybe_send(skb, net, 0, RTNLGRP_TC, 0); } else { kfree_skb(skb); } return ret; } int tcf_action_reoffload_cb(flow_indr_block_bind_cb_t *cb, void *cb_priv, bool add) { struct tc_act_pernet_id *id_ptr; struct tcf_idrinfo *idrinfo; struct tc_action_net *tn; struct tc_action *p; unsigned int act_id; unsigned long tmp; unsigned long id; struct idr *idr; struct net *net; int ret; if (!cb) return -EINVAL; down_read(&net_rwsem); mutex_lock(&act_id_mutex); for_each_net(net) { list_for_each_entry(id_ptr, &act_pernet_id_list, list) { act_id = id_ptr->id; tn = net_generic(net, act_id); if (!tn) continue; idrinfo = tn->idrinfo; if (!idrinfo) continue; mutex_lock(&idrinfo->lock); idr = &idrinfo->action_idr; idr_for_each_entry_ul(idr, p, tmp, id) { if (IS_ERR(p) || tc_act_bind(p->tcfa_flags)) continue; if (add) { tcf_action_offload_add_ex(p, NULL, cb, cb_priv); continue; } /* cb unregister to update hw count */ ret = tcf_action_offload_del_ex(p, cb, cb_priv); if (ret < 0) continue; if (tc_act_skip_sw(p->tcfa_flags) && !tc_act_in_hw(p)) tcf_reoffload_del_notify(net, p); } mutex_unlock(&idrinfo->lock); } } mutex_unlock(&act_id_mutex); up_read(&net_rwsem); return 0; } static struct sk_buff *tcf_del_notify_msg(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], u32 portid, size_t attr_size, struct netlink_ext_ack *extack) { struct sk_buff *skb; skb = alloc_skb(max(attr_size, NLMSG_GOODSIZE), GFP_KERNEL); if (!skb) return ERR_PTR(-ENOBUFS); if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION, 0, 2, extack) <= 0) { NL_SET_ERR_MSG(extack, "Failed to fill netlink TC action attributes"); kfree_skb(skb); return ERR_PTR(-EINVAL); } return skb; } static int tcf_del_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], u32 portid, size_t attr_size, struct netlink_ext_ack *extack) { struct sk_buff *skb; int ret; if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) { skb = NULL; } else { skb = tcf_del_notify_msg(net, n, actions, portid, attr_size, extack); if (IS_ERR(skb)) return PTR_ERR(skb); } /* now do the delete */ ret = tcf_action_delete(net, actions); if (ret < 0) { NL_SET_ERR_MSG(extack, "Failed to delete TC action"); kfree_skb(skb); return ret; } return rtnetlink_maybe_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); } static int tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, u32 portid, int event, struct netlink_ext_ack *extack) { int i, ret; struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *act; size_t attr_size = 0; struct tc_action *actions[TCA_ACT_MAX_PRIO] = {}; ret = nla_parse_nested_deprecated(tb, TCA_ACT_MAX_PRIO, nla, NULL, extack); if (ret < 0) return ret; if (event == RTM_DELACTION && n->nlmsg_flags & NLM_F_ROOT) { if (tb[1]) return tca_action_flush(net, tb[1], n, portid, extack); NL_SET_ERR_MSG(extack, "Invalid netlink attributes while flushing TC action"); return -EINVAL; } for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { act = tcf_action_get_1(net, tb[i], n, portid, extack); if (IS_ERR(act)) { ret = PTR_ERR(act); goto err; } attr_size += tcf_action_fill_size(act); actions[i - 1] = act; } attr_size = tcf_action_full_attrs_size(attr_size); if (event == RTM_GETACTION) ret = tcf_get_notify(net, portid, n, actions, event, extack); else { /* delete */ ret = tcf_del_notify(net, n, actions, portid, attr_size, extack); if (ret) goto err; return 0; } err: tcf_action_put_many(actions); return ret; } static struct sk_buff *tcf_add_notify_msg(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], u32 portid, size_t attr_size, struct netlink_ext_ack *extack) { struct sk_buff *skb; skb = alloc_skb(max(attr_size, NLMSG_GOODSIZE), GFP_KERNEL); if (!skb) return ERR_PTR(-ENOBUFS); if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, n->nlmsg_flags, RTM_NEWACTION, 0, 0, extack) <= 0) { NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action"); kfree_skb(skb); return ERR_PTR(-EINVAL); } return skb; } static int tcf_add_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], u32 portid, size_t attr_size, struct netlink_ext_ack *extack) { struct sk_buff *skb; if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) { skb = NULL; } else { skb = tcf_add_notify_msg(net, n, actions, portid, attr_size, extack); if (IS_ERR(skb)) return PTR_ERR(skb); } return rtnetlink_maybe_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); } static int tcf_action_add(struct net *net, struct nlattr *nla, struct nlmsghdr *n, u32 portid, u32 flags, struct netlink_ext_ack *extack) { size_t attr_size = 0; int loop, ret; struct tc_action *actions[TCA_ACT_MAX_PRIO] = {}; int init_res[TCA_ACT_MAX_PRIO] = {}; for (loop = 0; loop < 10; loop++) { ret = tcf_action_init(net, NULL, nla, NULL, actions, init_res, &attr_size, flags, 0, extack); if (ret != -EAGAIN) break; } if (ret < 0) return ret; ret = tcf_add_notify(net, n, actions, portid, attr_size, extack); /* only put bound actions */ tca_put_bound_many(actions, init_res); return ret; } static const struct nla_policy tcaa_policy[TCA_ROOT_MAX + 1] = { [TCA_ROOT_FLAGS] = NLA_POLICY_BITFIELD32(TCA_ACT_FLAG_LARGE_DUMP_ON | TCA_ACT_FLAG_TERSE_DUMP), [TCA_ROOT_TIME_DELTA] = { .type = NLA_U32 }, }; static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_ROOT_MAX + 1]; u32 portid = NETLINK_CB(skb).portid; u32 flags = 0; int ret = 0; if ((n->nlmsg_type != RTM_GETACTION) && !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; ret = nlmsg_parse_deprecated(n, sizeof(struct tcamsg), tca, TCA_ROOT_MAX, NULL, extack); if (ret < 0) return ret; if (tca[TCA_ACT_TAB] == NULL) { NL_SET_ERR_MSG(extack, "Netlink action attributes missing"); return -EINVAL; } /* n->nlmsg_flags & NLM_F_CREATE */ switch (n->nlmsg_type) { case RTM_NEWACTION: /* we are going to assume all other flags * imply create only if it doesn't exist * Note that CREATE | EXCL implies that * but since we want avoid ambiguity (eg when flags * is zero) then just set this */ if (n->nlmsg_flags & NLM_F_REPLACE) flags = TCA_ACT_FLAGS_REPLACE; ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, flags, extack); break; case RTM_DELACTION: ret = tca_action_gd(net, tca[TCA_ACT_TAB], n, portid, RTM_DELACTION, extack); break; case RTM_GETACTION: ret = tca_action_gd(net, tca[TCA_ACT_TAB], n, portid, RTM_GETACTION, extack); break; default: BUG(); } return ret; } static struct nlattr *find_dump_kind(struct nlattr **nla) { struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1]; struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct nlattr *kind; tb1 = nla[TCA_ACT_TAB]; if (tb1 == NULL) return NULL; if (nla_parse_deprecated(tb, TCA_ACT_MAX_PRIO, nla_data(tb1), NLMSG_ALIGN(nla_len(tb1)), NULL, NULL) < 0) return NULL; if (tb[1] == NULL) return NULL; if (nla_parse_nested_deprecated(tb2, TCA_ACT_MAX, tb[1], tcf_action_policy, NULL) < 0) return NULL; kind = tb2[TCA_ACT_KIND]; return kind; } static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct nlmsghdr *nlh; unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; struct tc_action_ops *a_o; int ret = 0; struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh); struct nlattr *tb[TCA_ROOT_MAX + 1]; struct nlattr *count_attr = NULL; unsigned long jiffy_since = 0; struct nlattr *kind = NULL; struct nla_bitfield32 bf; u32 msecs_since = 0; u32 act_count = 0; ret = nlmsg_parse_deprecated(cb->nlh, sizeof(struct tcamsg), tb, TCA_ROOT_MAX, tcaa_policy, cb->extack); if (ret < 0) return ret; kind = find_dump_kind(tb); if (kind == NULL) { pr_info("tc_dump_action: action bad kind\n"); return 0; } a_o = tc_lookup_action(kind); if (a_o == NULL) return 0; cb->args[2] = 0; if (tb[TCA_ROOT_FLAGS]) { bf = nla_get_bitfield32(tb[TCA_ROOT_FLAGS]); cb->args[2] = bf.value; } if (tb[TCA_ROOT_TIME_DELTA]) { msecs_since = nla_get_u32(tb[TCA_ROOT_TIME_DELTA]); } nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, sizeof(*t), 0); if (!nlh) goto out_module_put; if (msecs_since) jiffy_since = jiffies - msecs_to_jiffies(msecs_since); t = nlmsg_data(nlh); t->tca_family = AF_UNSPEC; t->tca__pad1 = 0; t->tca__pad2 = 0; cb->args[3] = jiffy_since; count_attr = nla_reserve(skb, TCA_ROOT_COUNT, sizeof(u32)); if (!count_attr) goto out_module_put; nest = nla_nest_start_noflag(skb, TCA_ACT_TAB); if (nest == NULL) goto out_module_put; ret = __tcf_generic_walker(net, skb, cb, RTM_GETACTION, a_o, NULL); if (ret < 0) goto out_module_put; if (ret > 0) { nla_nest_end(skb, nest); ret = skb->len; act_count = cb->args[1]; memcpy(nla_data(count_attr), &act_count, sizeof(u32)); cb->args[1] = 0; } else nlmsg_trim(skb, b); nlh->nlmsg_len = skb_tail_pointer(skb) - b; if (NETLINK_CB(cb->skb).portid && ret) nlh->nlmsg_flags |= NLM_F_MULTI; module_put(a_o->owner); return skb->len; out_module_put: module_put(a_o->owner); nlmsg_trim(skb, b); return skb->len; } static int __init tc_action_init(void) { rtnl_register(PF_UNSPEC, RTM_NEWACTION, tc_ctl_action, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELACTION, tc_ctl_action, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETACTION, tc_ctl_action, tc_dump_action, 0); return 0; } subsys_initcall(tc_action_init);
1 11 2 11 11 11 10 3 62 58 4 4 1 7 162 162 162 1 1 1 1 1557 1558 13 13 162 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 // SPDX-License-Identifier: GPL-2.0-only /* Event cache for netfilter. */ /* * (C) 2005 Harald Welte <laforge@gnumonks.org> * (C) 2005 Patrick McHardy <kaber@trash.net> * (C) 2005-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2005 USAGI/WIDE Project <http://www.linux-ipv6.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/netfilter.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> #include <linux/stddef.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/slab.h> #include <linux/export.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_extend.h> static DEFINE_MUTEX(nf_ct_ecache_mutex); #define DYING_NULLS_VAL ((1 << 30) + 1) #define ECACHE_MAX_JIFFIES msecs_to_jiffies(10) #define ECACHE_RETRY_JIFFIES msecs_to_jiffies(10) enum retry_state { STATE_CONGESTED, STATE_RESTART, STATE_DONE, }; struct nf_conntrack_net_ecache *nf_conn_pernet_ecache(const struct net *net) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); return &cnet->ecache; } #if IS_MODULE(CONFIG_NF_CT_NETLINK) EXPORT_SYMBOL_GPL(nf_conn_pernet_ecache); #endif static enum retry_state ecache_work_evict_list(struct nf_conntrack_net *cnet) { unsigned long stop = jiffies + ECACHE_MAX_JIFFIES; struct hlist_nulls_head evicted_list; enum retry_state ret = STATE_DONE; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; unsigned int sent; INIT_HLIST_NULLS_HEAD(&evicted_list, DYING_NULLS_VAL); next: sent = 0; spin_lock_bh(&cnet->ecache.dying_lock); hlist_nulls_for_each_entry_safe(h, n, &cnet->ecache.dying_list, hnnode) { struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); /* The worker owns all entries, ct remains valid until nf_ct_put * in the loop below. */ if (nf_conntrack_event(IPCT_DESTROY, ct)) { ret = STATE_CONGESTED; break; } hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, &evicted_list); if (time_after(stop, jiffies)) { ret = STATE_RESTART; break; } if (sent++ > 16) { spin_unlock_bh(&cnet->ecache.dying_lock); cond_resched(); goto next; } } spin_unlock_bh(&cnet->ecache.dying_lock); hlist_nulls_for_each_entry_safe(h, n, &evicted_list, hnnode) { struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode); nf_ct_put(ct); cond_resched(); } return ret; } static void ecache_work(struct work_struct *work) { struct nf_conntrack_net *cnet = container_of(work, struct nf_conntrack_net, ecache.dwork.work); int ret, delay = -1; ret = ecache_work_evict_list(cnet); switch (ret) { case STATE_CONGESTED: delay = ECACHE_RETRY_JIFFIES; break; case STATE_RESTART: delay = 0; break; case STATE_DONE: break; } if (delay >= 0) schedule_delayed_work(&cnet->ecache.dwork, delay); } static int __nf_conntrack_eventmask_report(struct nf_conntrack_ecache *e, const u32 events, const u32 missed, const struct nf_ct_event *item) { struct net *net = nf_ct_net(item->ct); struct nf_ct_event_notifier *notify; u32 old, want; int ret; if (!((events | missed) & e->ctmask)) return 0; rcu_read_lock(); notify = rcu_dereference(net->ct.nf_conntrack_event_cb); if (!notify) { rcu_read_unlock(); return 0; } ret = notify->ct_event(events | missed, item); rcu_read_unlock(); if (likely(ret >= 0 && missed == 0)) return 0; do { old = READ_ONCE(e->missed); if (ret < 0) want = old | events; else want = old & ~missed; } while (cmpxchg(&e->missed, old, want) != old); return ret; } int nf_conntrack_eventmask_report(unsigned int events, struct nf_conn *ct, u32 portid, int report) { struct nf_conntrack_ecache *e; struct nf_ct_event item; unsigned int missed; int ret; if (!nf_ct_is_confirmed(ct)) return 0; e = nf_ct_ecache_find(ct); if (!e) return 0; memset(&item, 0, sizeof(item)); item.ct = ct; item.portid = e->portid ? e->portid : portid; item.report = report; /* This is a resent of a destroy event? If so, skip missed */ missed = e->portid ? 0 : e->missed; ret = __nf_conntrack_eventmask_report(e, events, missed, &item); if (unlikely(ret < 0 && (events & (1 << IPCT_DESTROY)))) { /* This is a destroy event that has been triggered by a process, * we store the PORTID to include it in the retransmission. */ if (e->portid == 0 && portid != 0) e->portid = portid; } return ret; } EXPORT_SYMBOL_GPL(nf_conntrack_eventmask_report); /* deliver cached events and clear cache entry - must be called with locally * disabled softirqs */ void nf_ct_deliver_cached_events(struct nf_conn *ct) { struct nf_conntrack_ecache *e; struct nf_ct_event item; unsigned int events; if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct)) return; e = nf_ct_ecache_find(ct); if (e == NULL) return; events = xchg(&e->cache, 0); item.ct = ct; item.portid = 0; item.report = 0; /* We make a copy of the missed event cache without taking * the lock, thus we may send missed events twice. However, * this does not harm and it happens very rarely. */ __nf_conntrack_eventmask_report(e, events, e->missed, &item); } EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events); void nf_ct_expect_event_report(enum ip_conntrack_expect_events event, struct nf_conntrack_expect *exp, u32 portid, int report) { struct net *net = nf_ct_exp_net(exp); struct nf_ct_event_notifier *notify; struct nf_conntrack_ecache *e; rcu_read_lock(); notify = rcu_dereference(net->ct.nf_conntrack_event_cb); if (!notify) goto out_unlock; e = nf_ct_ecache_find(exp->master); if (!e) goto out_unlock; if (e->expmask & (1 << event)) { struct nf_exp_event item = { .exp = exp, .portid = portid, .report = report }; notify->exp_event(1 << event, &item); } out_unlock: rcu_read_unlock(); } void nf_conntrack_register_notifier(struct net *net, const struct nf_ct_event_notifier *new) { struct nf_ct_event_notifier *notify; mutex_lock(&nf_ct_ecache_mutex); notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb, lockdep_is_held(&nf_ct_ecache_mutex)); WARN_ON_ONCE(notify); rcu_assign_pointer(net->ct.nf_conntrack_event_cb, new); mutex_unlock(&nf_ct_ecache_mutex); } EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier); void nf_conntrack_unregister_notifier(struct net *net) { mutex_lock(&nf_ct_ecache_mutex); RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, NULL); mutex_unlock(&nf_ct_ecache_mutex); /* synchronize_rcu() is called after netns pre_exit */ } EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier); void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); if (state == NFCT_ECACHE_DESTROY_FAIL && !delayed_work_pending(&cnet->ecache.dwork)) { schedule_delayed_work(&cnet->ecache.dwork, HZ); net->ct.ecache_dwork_pending = true; } else if (state == NFCT_ECACHE_DESTROY_SENT) { if (!hlist_nulls_empty(&cnet->ecache.dying_list)) mod_delayed_work(system_wq, &cnet->ecache.dwork, 0); else net->ct.ecache_dwork_pending = false; } } bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp) { struct net *net = nf_ct_net(ct); struct nf_conntrack_ecache *e; switch (net->ct.sysctl_events) { case 0: /* assignment via template / ruleset? ignore sysctl. */ if (ctmask || expmask) break; return true; case 2: /* autodetect: no event listener, don't allocate extension. */ if (!READ_ONCE(nf_ctnetlink_has_listener)) return true; fallthrough; case 1: /* always allocate an extension. */ if (!ctmask && !expmask) { ctmask = ~0; expmask = ~0; } break; default: WARN_ON_ONCE(1); return true; } e = nf_ct_ext_add(ct, NF_CT_EXT_ECACHE, gfp); if (e) { e->ctmask = ctmask; e->expmask = expmask; } return e != NULL; } EXPORT_SYMBOL_GPL(nf_ct_ecache_ext_add); #define NF_CT_EVENTS_DEFAULT 2 static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT; void nf_conntrack_ecache_pernet_init(struct net *net) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); net->ct.sysctl_events = nf_ct_events; INIT_DELAYED_WORK(&cnet->ecache.dwork, ecache_work); INIT_HLIST_NULLS_HEAD(&cnet->ecache.dying_list, DYING_NULLS_VAL); spin_lock_init(&cnet->ecache.dying_lock); BUILD_BUG_ON(__IPCT_MAX >= 16); /* e->ctmask is u16 */ } void nf_conntrack_ecache_pernet_fini(struct net *net) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); cancel_delayed_work_sync(&cnet->ecache.dwork); }
2322 2325 5 20 2325 2326 2324 20 20 745 386 362 1347 30 30 30 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 // SPDX-License-Identifier: GPL-2.0 /* * fs/sysfs/symlink.c - sysfs symlink implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * * Please see Documentation/filesystems/sysfs.rst for more information. */ #include <linux/fs.h> #include <linux/module.h> #include <linux/kobject.h> #include <linux/mutex.h> #include <linux/security.h> #include "sysfs.h" static int sysfs_do_create_link_sd(struct kernfs_node *parent, struct kobject *target_kobj, const char *name, int warn) { struct kernfs_node *kn, *target = NULL; if (WARN_ON(!name || !parent)) return -EINVAL; /* * We don't own @target_kobj and it may be removed at any time. * Synchronize using sysfs_symlink_target_lock. See * sysfs_remove_dir() for details. */ spin_lock(&sysfs_symlink_target_lock); if (target_kobj->sd) { target = target_kobj->sd; kernfs_get(target); } spin_unlock(&sysfs_symlink_target_lock); if (!target) return -ENOENT; kn = kernfs_create_link(parent, name, target); kernfs_put(target); if (!IS_ERR(kn)) return 0; if (warn && PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(parent, name); return PTR_ERR(kn); } /** * sysfs_create_link_sd - create symlink to a given object. * @kn: directory we're creating the link in. * @target: object we're pointing to. * @name: name of the symlink. */ int sysfs_create_link_sd(struct kernfs_node *kn, struct kobject *target, const char *name) { return sysfs_do_create_link_sd(kn, target, name, 1); } static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, const char *name, int warn) { struct kernfs_node *parent = NULL; if (!kobj) parent = sysfs_root_kn; else parent = kobj->sd; if (!parent) return -EFAULT; return sysfs_do_create_link_sd(parent, target, name, warn); } /** * sysfs_create_link - create symlink between two objects. * @kobj: object whose directory we're creating the link in. * @target: object we're pointing to. * @name: name of the symlink. */ int sysfs_create_link(struct kobject *kobj, struct kobject *target, const char *name) { return sysfs_do_create_link(kobj, target, name, 1); } EXPORT_SYMBOL_GPL(sysfs_create_link); /** * sysfs_create_link_nowarn - create symlink between two objects. * @kobj: object whose directory we're creating the link in. * @target: object we're pointing to. * @name: name of the symlink. * * This function does the same as sysfs_create_link(), but it * doesn't warn if the link already exists. */ int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, const char *name) { return sysfs_do_create_link(kobj, target, name, 0); } EXPORT_SYMBOL_GPL(sysfs_create_link_nowarn); /** * sysfs_delete_link - remove symlink in object's directory. * @kobj: object we're acting for. * @targ: object we're pointing to. * @name: name of the symlink to remove. * * Unlike sysfs_remove_link sysfs_delete_link has enough information * to successfully delete symlinks in tagged directories. */ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ, const char *name) { const void *ns = NULL; /* * We don't own @target and it may be removed at any time. * Synchronize using sysfs_symlink_target_lock. See * sysfs_remove_dir() for details. */ spin_lock(&sysfs_symlink_target_lock); if (targ->sd && kernfs_ns_enabled(kobj->sd)) ns = targ->sd->ns; spin_unlock(&sysfs_symlink_target_lock); kernfs_remove_by_name_ns(kobj->sd, name, ns); } /** * sysfs_remove_link - remove symlink in object's directory. * @kobj: object we're acting for. * @name: name of the symlink to remove. */ void sysfs_remove_link(struct kobject *kobj, const char *name) { struct kernfs_node *parent = NULL; if (!kobj) parent = sysfs_root_kn; else parent = kobj->sd; kernfs_remove_by_name(parent, name); } EXPORT_SYMBOL_GPL(sysfs_remove_link); /** * sysfs_rename_link_ns - rename symlink in object's directory. * @kobj: object we're acting for. * @targ: object we're pointing to. * @old: previous name of the symlink. * @new: new name of the symlink. * @new_ns: new namespace of the symlink. * * A helper function for the common rename symlink idiom. */ int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *targ, const char *old, const char *new, const void *new_ns) { struct kernfs_node *parent, *kn = NULL; const void *old_ns = NULL; int result; if (!kobj) parent = sysfs_root_kn; else parent = kobj->sd; if (targ->sd) old_ns = targ->sd->ns; result = -ENOENT; kn = kernfs_find_and_get_ns(parent, old, old_ns); if (!kn) goto out; result = -EINVAL; if (kernfs_type(kn) != KERNFS_LINK) goto out; if (kn->symlink.target_kn->priv != targ) goto out; result = kernfs_rename_ns(kn, parent, new, new_ns); out: kernfs_put(kn); return result; } EXPORT_SYMBOL_GPL(sysfs_rename_link_ns);
1107 1005 972 973 1105 1046 1046 1046 1047 1157 1107 982 283 1047 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2005,2006,2007,2008 IBM Corporation * * Authors: * Serge Hallyn <serue@us.ibm.com> * Reiner Sailer <sailer@watson.ibm.com> * Mimi Zohar <zohar@us.ibm.com> * * File: ima_queue.c * Implements queues that store template measurements and * maintains aggregate over the stored measurements * in the pre-configured TPM PCR (if available). * The measurement list is append-only. No entry is * ever removed or changed during the boot-cycle. */ #include <linux/rculist.h> #include <linux/slab.h> #include "ima.h" #define AUDIT_CAUSE_LEN_MAX 32 /* pre-allocated array of tpm_digest structures to extend a PCR */ static struct tpm_digest *digests; LIST_HEAD(ima_measurements); /* list of all measurements */ #ifdef CONFIG_IMA_KEXEC static unsigned long binary_runtime_size; #else static unsigned long binary_runtime_size = ULONG_MAX; #endif /* key: inode (before secure-hashing a file) */ struct ima_h_table ima_htable = { .len = ATOMIC_LONG_INIT(0), .violations = ATOMIC_LONG_INIT(0), .queue[0 ... IMA_MEASURE_HTABLE_SIZE - 1] = HLIST_HEAD_INIT }; /* mutex protects atomicity of extending measurement list * and extending the TPM PCR aggregate. Since tpm_extend can take * long (and the tpm driver uses a mutex), we can't use the spinlock. */ static DEFINE_MUTEX(ima_extend_list_mutex); /* lookup up the digest value in the hash table, and return the entry */ static struct ima_queue_entry *ima_lookup_digest_entry(u8 *digest_value, int pcr) { struct ima_queue_entry *qe, *ret = NULL; unsigned int key; int rc; key = ima_hash_key(digest_value); rcu_read_lock(); hlist_for_each_entry_rcu(qe, &ima_htable.queue[key], hnext) { rc = memcmp(qe->entry->digests[ima_hash_algo_idx].digest, digest_value, hash_digest_size[ima_hash_algo]); if ((rc == 0) && (qe->entry->pcr == pcr)) { ret = qe; break; } } rcu_read_unlock(); return ret; } /* * Calculate the memory required for serializing a single * binary_runtime_measurement list entry, which contains a * couple of variable length fields (e.g template name and data). */ static int get_binary_runtime_size(struct ima_template_entry *entry) { int size = 0; size += sizeof(u32); /* pcr */ size += TPM_DIGEST_SIZE; size += sizeof(int); /* template name size field */ size += strlen(entry->template_desc->name); size += sizeof(entry->template_data_len); size += entry->template_data_len; return size; } /* ima_add_template_entry helper function: * - Add template entry to the measurement list and hash table, for * all entries except those carried across kexec. * * (Called with ima_extend_list_mutex held.) */ static int ima_add_digest_entry(struct ima_template_entry *entry, bool update_htable) { struct ima_queue_entry *qe; unsigned int key; qe = kmalloc(sizeof(*qe), GFP_KERNEL); if (qe == NULL) { pr_err("OUT OF MEMORY ERROR creating queue entry\n"); return -ENOMEM; } qe->entry = entry; INIT_LIST_HEAD(&qe->later); list_add_tail_rcu(&qe->later, &ima_measurements); atomic_long_inc(&ima_htable.len); if (update_htable) { key = ima_hash_key(entry->digests[ima_hash_algo_idx].digest); hlist_add_head_rcu(&qe->hnext, &ima_htable.queue[key]); } if (binary_runtime_size != ULONG_MAX) { int size; size = get_binary_runtime_size(entry); binary_runtime_size = (binary_runtime_size < ULONG_MAX - size) ? binary_runtime_size + size : ULONG_MAX; } return 0; } /* * Return the amount of memory required for serializing the * entire binary_runtime_measurement list, including the ima_kexec_hdr * structure. */ unsigned long ima_get_binary_runtime_size(void) { if (binary_runtime_size >= (ULONG_MAX - sizeof(struct ima_kexec_hdr))) return ULONG_MAX; else return binary_runtime_size + sizeof(struct ima_kexec_hdr); } static int ima_pcr_extend(struct tpm_digest *digests_arg, int pcr) { int result = 0; if (!ima_tpm_chip) return result; result = tpm_pcr_extend(ima_tpm_chip, pcr, digests_arg); if (result != 0) pr_err("Error Communicating to TPM chip, result: %d\n", result); return result; } /* * Add template entry to the measurement list and hash table, and * extend the pcr. * * On systems which support carrying the IMA measurement list across * kexec, maintain the total memory size required for serializing the * binary_runtime_measurements. */ int ima_add_template_entry(struct ima_template_entry *entry, int violation, const char *op, struct inode *inode, const unsigned char *filename) { u8 *digest = entry->digests[ima_hash_algo_idx].digest; struct tpm_digest *digests_arg = entry->digests; const char *audit_cause = "hash_added"; char tpm_audit_cause[AUDIT_CAUSE_LEN_MAX]; int audit_info = 1; int result = 0, tpmresult = 0; mutex_lock(&ima_extend_list_mutex); if (!violation && !IS_ENABLED(CONFIG_IMA_DISABLE_HTABLE)) { if (ima_lookup_digest_entry(digest, entry->pcr)) { audit_cause = "hash_exists"; result = -EEXIST; goto out; } } result = ima_add_digest_entry(entry, !IS_ENABLED(CONFIG_IMA_DISABLE_HTABLE)); if (result < 0) { audit_cause = "ENOMEM"; audit_info = 0; goto out; } if (violation) /* invalidate pcr */ digests_arg = digests; tpmresult = ima_pcr_extend(digests_arg, entry->pcr); if (tpmresult != 0) { snprintf(tpm_audit_cause, AUDIT_CAUSE_LEN_MAX, "TPM_error(%d)", tpmresult); audit_cause = tpm_audit_cause; audit_info = 0; } out: mutex_unlock(&ima_extend_list_mutex); integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, filename, op, audit_cause, result, audit_info); return result; } int ima_restore_measurement_entry(struct ima_template_entry *entry) { int result = 0; mutex_lock(&ima_extend_list_mutex); result = ima_add_digest_entry(entry, 0); mutex_unlock(&ima_extend_list_mutex); return result; } int __init ima_init_digests(void) { u16 digest_size; u16 crypto_id; int i; if (!ima_tpm_chip) return 0; digests = kcalloc(ima_tpm_chip->nr_allocated_banks, sizeof(*digests), GFP_NOFS); if (!digests) return -ENOMEM; for (i = 0; i < ima_tpm_chip->nr_allocated_banks; i++) { digests[i].alg_id = ima_tpm_chip->allocated_banks[i].alg_id; digest_size = ima_tpm_chip->allocated_banks[i].digest_size; crypto_id = ima_tpm_chip->allocated_banks[i].crypto_id; /* for unmapped TPM algorithms digest is still a padded SHA1 */ if (crypto_id == HASH_ALGO__LAST) digest_size = SHA1_DIGEST_SIZE; memset(digests[i].digest, 0xff, digest_size); } return 0; }
92 160 160 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2021 Intel Corporation. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <asm/sgx.h> #include "cpuid.h" #include "kvm_cache_regs.h" #include "nested.h" #include "sgx.h" #include "vmx.h" #include "x86.h" bool __read_mostly enable_sgx = 1; module_param_named(sgx, enable_sgx, bool, 0444); /* Initial value of guest's virtual SGX_LEPUBKEYHASHn MSRs */ static u64 sgx_pubkey_hash[4] __ro_after_init; /* * ENCLS's memory operands use a fixed segment (DS) and a fixed * address size based on the mode. Related prefixes are ignored. */ static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset, int size, int alignment, gva_t *gva) { struct kvm_segment s; bool fault; /* Skip vmcs.GUEST_DS retrieval for 64-bit mode to avoid VMREADs. */ *gva = offset; if (!is_64_bit_mode(vcpu)) { vmx_get_segment(vcpu, &s, VCPU_SREG_DS); *gva += s.base; } if (!IS_ALIGNED(*gva, alignment)) { fault = true; } else if (likely(is_64_bit_mode(vcpu))) { *gva = vmx_get_untagged_addr(vcpu, *gva, 0); fault = is_noncanonical_address(*gva, vcpu); } else { *gva &= 0xffffffff; fault = (s.unusable) || (s.type != 2 && s.type != 3) || (*gva > s.limit) || ((s.base != 0 || s.limit != 0xffffffff) && (((u64)*gva + size - 1) > s.limit + 1)); } if (fault) kvm_inject_gp(vcpu, 0); return fault ? -EINVAL : 0; } static void sgx_handle_emulation_failure(struct kvm_vcpu *vcpu, u64 addr, unsigned int size) { uint64_t data[2] = { addr, size }; __kvm_prepare_emulation_failure_exit(vcpu, data, ARRAY_SIZE(data)); } static int sgx_read_hva(struct kvm_vcpu *vcpu, unsigned long hva, void *data, unsigned int size) { if (__copy_from_user(data, (void __user *)hva, size)) { sgx_handle_emulation_failure(vcpu, hva, size); return -EFAULT; } return 0; } static int sgx_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t gva, bool write, gpa_t *gpa) { struct x86_exception ex; if (write) *gpa = kvm_mmu_gva_to_gpa_write(vcpu, gva, &ex); else *gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, &ex); if (*gpa == INVALID_GPA) { kvm_inject_emulated_page_fault(vcpu, &ex); return -EFAULT; } return 0; } static int sgx_gpa_to_hva(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long *hva) { *hva = kvm_vcpu_gfn_to_hva(vcpu, PFN_DOWN(gpa)); if (kvm_is_error_hva(*hva)) { sgx_handle_emulation_failure(vcpu, gpa, 1); return -EFAULT; } *hva |= gpa & ~PAGE_MASK; return 0; } static int sgx_inject_fault(struct kvm_vcpu *vcpu, gva_t gva, int trapnr) { struct x86_exception ex; /* * A non-EPCM #PF indicates a bad userspace HVA. This *should* check * for PFEC.SGX and not assume any #PF on SGX2 originated in the EPC, * but the error code isn't (yet) plumbed through the ENCLS helpers. */ if (trapnr == PF_VECTOR && !boot_cpu_has(X86_FEATURE_SGX2)) { kvm_prepare_emulation_failure_exit(vcpu); return 0; } /* * If the guest thinks it's running on SGX2 hardware, inject an SGX * #PF if the fault matches an EPCM fault signature (#GP on SGX1, * #PF on SGX2). The assumption is that EPCM faults are much more * likely than a bad userspace address. */ if ((trapnr == PF_VECTOR || !boot_cpu_has(X86_FEATURE_SGX2)) && guest_cpuid_has(vcpu, X86_FEATURE_SGX2)) { memset(&ex, 0, sizeof(ex)); ex.vector = PF_VECTOR; ex.error_code = PFERR_PRESENT_MASK | PFERR_WRITE_MASK | PFERR_SGX_MASK; ex.address = gva; ex.error_code_valid = true; ex.nested_page_fault = false; kvm_inject_emulated_page_fault(vcpu, &ex); } else { kvm_inject_gp(vcpu, 0); } return 1; } static int __handle_encls_ecreate(struct kvm_vcpu *vcpu, struct sgx_pageinfo *pageinfo, unsigned long secs_hva, gva_t secs_gva) { struct sgx_secs *contents = (struct sgx_secs *)pageinfo->contents; struct kvm_cpuid_entry2 *sgx_12_0, *sgx_12_1; u64 attributes, xfrm, size; u32 miscselect; u8 max_size_log2; int trapnr, ret; sgx_12_0 = kvm_find_cpuid_entry_index(vcpu, 0x12, 0); sgx_12_1 = kvm_find_cpuid_entry_index(vcpu, 0x12, 1); if (!sgx_12_0 || !sgx_12_1) { kvm_prepare_emulation_failure_exit(vcpu); return 0; } miscselect = contents->miscselect; attributes = contents->attributes; xfrm = contents->xfrm; size = contents->size; /* Enforce restriction of access to the PROVISIONKEY. */ if (!vcpu->kvm->arch.sgx_provisioning_allowed && (attributes & SGX_ATTR_PROVISIONKEY)) { if (sgx_12_1->eax & SGX_ATTR_PROVISIONKEY) pr_warn_once("SGX PROVISIONKEY advertised but not allowed\n"); kvm_inject_gp(vcpu, 0); return 1; } /* * Enforce CPUID restrictions on MISCSELECT, ATTRIBUTES and XFRM. Note * that the allowed XFRM (XFeature Request Mask) isn't strictly bound * by the supported XCR0. FP+SSE *must* be set in XFRM, even if XSAVE * is unsupported, i.e. even if XCR0 itself is completely unsupported. */ if ((u32)miscselect & ~sgx_12_0->ebx || (u32)attributes & ~sgx_12_1->eax || (u32)(attributes >> 32) & ~sgx_12_1->ebx || (u32)xfrm & ~sgx_12_1->ecx || (u32)(xfrm >> 32) & ~sgx_12_1->edx || xfrm & ~(vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE) || (xfrm & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { kvm_inject_gp(vcpu, 0); return 1; } /* Enforce CPUID restriction on max enclave size. */ max_size_log2 = (attributes & SGX_ATTR_MODE64BIT) ? sgx_12_0->edx >> 8 : sgx_12_0->edx; if (size >= BIT_ULL(max_size_log2)) { kvm_inject_gp(vcpu, 0); return 1; } /* * sgx_virt_ecreate() returns: * 1) 0: ECREATE was successful * 2) -EFAULT: ECREATE was run but faulted, and trapnr was set to the * exception number. * 3) -EINVAL: access_ok() on @secs_hva failed. This should never * happen as KVM checks host addresses at memslot creation. * sgx_virt_ecreate() has already warned in this case. */ ret = sgx_virt_ecreate(pageinfo, (void __user *)secs_hva, &trapnr); if (!ret) return kvm_skip_emulated_instruction(vcpu); if (ret == -EFAULT) return sgx_inject_fault(vcpu, secs_gva, trapnr); return ret; } static int handle_encls_ecreate(struct kvm_vcpu *vcpu) { gva_t pageinfo_gva, secs_gva; gva_t metadata_gva, contents_gva; gpa_t metadata_gpa, contents_gpa, secs_gpa; unsigned long metadata_hva, contents_hva, secs_hva; struct sgx_pageinfo pageinfo; struct sgx_secs *contents; struct x86_exception ex; int r; if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 32, 32, &pageinfo_gva) || sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, &secs_gva)) return 1; /* * Copy the PAGEINFO to local memory, its pointers need to be * translated, i.e. we need to do a deep copy/translate. */ r = kvm_read_guest_virt(vcpu, pageinfo_gva, &pageinfo, sizeof(pageinfo), &ex); if (r == X86EMUL_PROPAGATE_FAULT) { kvm_inject_emulated_page_fault(vcpu, &ex); return 1; } else if (r != X86EMUL_CONTINUE) { sgx_handle_emulation_failure(vcpu, pageinfo_gva, sizeof(pageinfo)); return 0; } if (sgx_get_encls_gva(vcpu, pageinfo.metadata, 64, 64, &metadata_gva) || sgx_get_encls_gva(vcpu, pageinfo.contents, 4096, 4096, &contents_gva)) return 1; /* * Translate the SECINFO, SOURCE and SECS pointers from GVA to GPA. * Resume the guest on failure to inject a #PF. */ if (sgx_gva_to_gpa(vcpu, metadata_gva, false, &metadata_gpa) || sgx_gva_to_gpa(vcpu, contents_gva, false, &contents_gpa) || sgx_gva_to_gpa(vcpu, secs_gva, true, &secs_gpa)) return 1; /* * ...and then to HVA. The order of accesses isn't architectural, i.e. * KVM doesn't have to fully process one address at a time. Exit to * userspace if a GPA is invalid. */ if (sgx_gpa_to_hva(vcpu, metadata_gpa, &metadata_hva) || sgx_gpa_to_hva(vcpu, contents_gpa, &contents_hva) || sgx_gpa_to_hva(vcpu, secs_gpa, &secs_hva)) return 0; /* * Copy contents into kernel memory to prevent TOCTOU attack. E.g. the * guest could do ECREATE w/ SECS.SGX_ATTR_PROVISIONKEY=0, and * simultaneously set SGX_ATTR_PROVISIONKEY to bypass the check to * enforce restriction of access to the PROVISIONKEY. */ contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL_ACCOUNT); if (!contents) return -ENOMEM; /* Exit to userspace if copying from a host userspace address fails. */ if (sgx_read_hva(vcpu, contents_hva, (void *)contents, PAGE_SIZE)) { free_page((unsigned long)contents); return 0; } pageinfo.metadata = metadata_hva; pageinfo.contents = (u64)contents; r = __handle_encls_ecreate(vcpu, &pageinfo, secs_hva, secs_gva); free_page((unsigned long)contents); return r; } static int handle_encls_einit(struct kvm_vcpu *vcpu) { unsigned long sig_hva, secs_hva, token_hva, rflags; struct vcpu_vmx *vmx = to_vmx(vcpu); gva_t sig_gva, secs_gva, token_gva; gpa_t sig_gpa, secs_gpa, token_gpa; int ret, trapnr; if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 1808, 4096, &sig_gva) || sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, &secs_gva) || sgx_get_encls_gva(vcpu, kvm_rdx_read(vcpu), 304, 512, &token_gva)) return 1; /* * Translate the SIGSTRUCT, SECS and TOKEN pointers from GVA to GPA. * Resume the guest on failure to inject a #PF. */ if (sgx_gva_to_gpa(vcpu, sig_gva, false, &sig_gpa) || sgx_gva_to_gpa(vcpu, secs_gva, true, &secs_gpa) || sgx_gva_to_gpa(vcpu, token_gva, false, &token_gpa)) return 1; /* * ...and then to HVA. The order of accesses isn't architectural, i.e. * KVM doesn't have to fully process one address at a time. Exit to * userspace if a GPA is invalid. Note, all structures are aligned and * cannot split pages. */ if (sgx_gpa_to_hva(vcpu, sig_gpa, &sig_hva) || sgx_gpa_to_hva(vcpu, secs_gpa, &secs_hva) || sgx_gpa_to_hva(vcpu, token_gpa, &token_hva)) return 0; ret = sgx_virt_einit((void __user *)sig_hva, (void __user *)token_hva, (void __user *)secs_hva, vmx->msr_ia32_sgxlepubkeyhash, &trapnr); if (ret == -EFAULT) return sgx_inject_fault(vcpu, secs_gva, trapnr); /* * sgx_virt_einit() returns -EINVAL when access_ok() fails on @sig_hva, * @token_hva or @secs_hva. This should never happen as KVM checks host * addresses at memslot creation. sgx_virt_einit() has already warned * in this case, so just return. */ if (ret < 0) return ret; rflags = vmx_get_rflags(vcpu) & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_SF | X86_EFLAGS_OF); if (ret) rflags |= X86_EFLAGS_ZF; else rflags &= ~X86_EFLAGS_ZF; vmx_set_rflags(vcpu, rflags); kvm_rax_write(vcpu, ret); return kvm_skip_emulated_instruction(vcpu); } static inline bool encls_leaf_enabled_in_guest(struct kvm_vcpu *vcpu, u32 leaf) { /* * ENCLS generates a #UD if SGX1 isn't supported, i.e. this point will * be reached if and only if the SGX1 leafs are enabled. */ if (leaf >= ECREATE && leaf <= ETRACK) return true; if (leaf >= EAUG && leaf <= EMODT) return guest_cpuid_has(vcpu, X86_FEATURE_SGX2); return false; } static inline bool sgx_enabled_in_guest_bios(struct kvm_vcpu *vcpu) { const u64 bits = FEAT_CTL_SGX_ENABLED | FEAT_CTL_LOCKED; return (to_vmx(vcpu)->msr_ia32_feature_control & bits) == bits; } int handle_encls(struct kvm_vcpu *vcpu) { u32 leaf = (u32)kvm_rax_read(vcpu); if (!enable_sgx || !guest_cpuid_has(vcpu, X86_FEATURE_SGX) || !guest_cpuid_has(vcpu, X86_FEATURE_SGX1)) { kvm_queue_exception(vcpu, UD_VECTOR); } else if (!encls_leaf_enabled_in_guest(vcpu, leaf) || !sgx_enabled_in_guest_bios(vcpu) || !is_paging(vcpu)) { kvm_inject_gp(vcpu, 0); } else { if (leaf == ECREATE) return handle_encls_ecreate(vcpu); if (leaf == EINIT) return handle_encls_einit(vcpu); WARN_ONCE(1, "unexpected exit on ENCLS[%u]", leaf); vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; vcpu->run->hw.hardware_exit_reason = EXIT_REASON_ENCLS; return 0; } return 1; } void setup_default_sgx_lepubkeyhash(void) { /* * Use Intel's default value for Skylake hardware if Launch Control is * not supported, i.e. Intel's hash is hardcoded into silicon, or if * Launch Control is supported and enabled, i.e. mimic the reset value * and let the guest write the MSRs at will. If Launch Control is * supported but disabled, then use the current MSR values as the hash * MSRs exist but are read-only (locked and not writable). */ if (!enable_sgx || boot_cpu_has(X86_FEATURE_SGX_LC) || rdmsrl_safe(MSR_IA32_SGXLEPUBKEYHASH0, &sgx_pubkey_hash[0])) { sgx_pubkey_hash[0] = 0xa6053e051270b7acULL; sgx_pubkey_hash[1] = 0x6cfbe8ba8b3b413dULL; sgx_pubkey_hash[2] = 0xc4916d99f2b3735dULL; sgx_pubkey_hash[3] = 0xd4f8c05909f9bb3bULL; } else { /* MSR_IA32_SGXLEPUBKEYHASH0 is read above */ rdmsrl(MSR_IA32_SGXLEPUBKEYHASH1, sgx_pubkey_hash[1]); rdmsrl(MSR_IA32_SGXLEPUBKEYHASH2, sgx_pubkey_hash[2]); rdmsrl(MSR_IA32_SGXLEPUBKEYHASH3, sgx_pubkey_hash[3]); } } void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); memcpy(vmx->msr_ia32_sgxlepubkeyhash, sgx_pubkey_hash, sizeof(sgx_pubkey_hash)); } /* * ECREATE must be intercepted to enforce MISCSELECT, ATTRIBUTES and XFRM * restrictions if the guest's allowed-1 settings diverge from hardware. */ static bool sgx_intercept_encls_ecreate(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *guest_cpuid; u32 eax, ebx, ecx, edx; if (!vcpu->kvm->arch.sgx_provisioning_allowed) return true; guest_cpuid = kvm_find_cpuid_entry_index(vcpu, 0x12, 0); if (!guest_cpuid) return true; cpuid_count(0x12, 0, &eax, &ebx, &ecx, &edx); if (guest_cpuid->ebx != ebx || guest_cpuid->edx != edx) return true; guest_cpuid = kvm_find_cpuid_entry_index(vcpu, 0x12, 1); if (!guest_cpuid) return true; cpuid_count(0x12, 1, &eax, &ebx, &ecx, &edx); if (guest_cpuid->eax != eax || guest_cpuid->ebx != ebx || guest_cpuid->ecx != ecx || guest_cpuid->edx != edx) return true; return false; } void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { /* * There is no software enable bit for SGX that is virtualized by * hardware, e.g. there's no CR4.SGXE, so when SGX is disabled in the * guest (either by the host or by the guest's BIOS) but enabled in the * host, trap all ENCLS leafs and inject #UD/#GP as needed to emulate * the expected system behavior for ENCLS. */ u64 bitmap = -1ull; /* Nothing to do if hardware doesn't support SGX */ if (!cpu_has_vmx_encls_vmexit()) return; if (guest_cpuid_has(vcpu, X86_FEATURE_SGX) && sgx_enabled_in_guest_bios(vcpu)) { if (guest_cpuid_has(vcpu, X86_FEATURE_SGX1)) { bitmap &= ~GENMASK_ULL(ETRACK, ECREATE); if (sgx_intercept_encls_ecreate(vcpu)) bitmap |= (1 << ECREATE); } if (guest_cpuid_has(vcpu, X86_FEATURE_SGX2)) bitmap &= ~GENMASK_ULL(EMODT, EAUG); /* * Trap and execute EINIT if launch control is enabled in the * host using the guest's values for launch control MSRs, even * if the guest's values are fixed to hardware default values. * The MSRs are not loaded/saved on VM-Enter/VM-Exit as writing * the MSRs is extraordinarily expensive. */ if (boot_cpu_has(X86_FEATURE_SGX_LC)) bitmap |= (1 << EINIT); if (!vmcs12 && is_guest_mode(vcpu)) vmcs12 = get_vmcs12(vcpu); if (vmcs12 && nested_cpu_has_encls_exit(vmcs12)) bitmap |= vmcs12->encls_exiting_bitmap; } vmcs_write64(ENCLS_EXITING_BITMAP, bitmap); }
6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 // SPDX-License-Identifier: GPL-2.0 #include <linux/cred.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/quotaops.h> #include <linux/sched.h> #include <linux/slab.h> #include <net/netlink.h> #include <net/genetlink.h> static const struct genl_multicast_group quota_mcgrps[] = { { .name = "events", }, }; /* Netlink family structure for quota */ static struct genl_family quota_genl_family __ro_after_init = { .module = THIS_MODULE, .hdrsize = 0, .name = "VFS_DQUOT", .version = 1, .maxattr = QUOTA_NL_A_MAX, .mcgrps = quota_mcgrps, .n_mcgrps = ARRAY_SIZE(quota_mcgrps), }; /** * quota_send_warning - Send warning to userspace about exceeded quota * @qid: The kernel internal quota identifier. * @dev: The device on which the fs is mounted (sb->s_dev) * @warntype: The type of the warning: QUOTA_NL_... * * This can be used by filesystems (including those which don't use * dquot) to send a message to userspace relating to quota limits. * */ void quota_send_warning(struct kqid qid, dev_t dev, const char warntype) { static atomic_t seq; struct sk_buff *skb; void *msg_head; int ret; int msg_size = 4 * nla_total_size(sizeof(u32)) + 2 * nla_total_size_64bit(sizeof(u64)); /* We have to allocate using GFP_NOFS as we are called from a * filesystem performing write and thus further recursion into * the fs to free some data could cause deadlocks. */ skb = genlmsg_new(msg_size, GFP_NOFS); if (!skb) { printk(KERN_ERR "VFS: Not enough memory to send quota warning.\n"); return; } msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq), &quota_genl_family, 0, QUOTA_NL_C_WARNING); if (!msg_head) { printk(KERN_ERR "VFS: Cannot store netlink header in quota warning.\n"); goto err_out; } ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, qid.type); if (ret) goto attr_err_out; ret = nla_put_u64_64bit(skb, QUOTA_NL_A_EXCESS_ID, from_kqid_munged(&init_user_ns, qid), QUOTA_NL_A_PAD); if (ret) goto attr_err_out; ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype); if (ret) goto attr_err_out; ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR, MAJOR(dev)); if (ret) goto attr_err_out; ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev)); if (ret) goto attr_err_out; ret = nla_put_u64_64bit(skb, QUOTA_NL_A_CAUSED_ID, from_kuid_munged(&init_user_ns, current_uid()), QUOTA_NL_A_PAD); if (ret) goto attr_err_out; genlmsg_end(skb, msg_head); genlmsg_multicast(&quota_genl_family, skb, 0, 0, GFP_NOFS); return; attr_err_out: printk(KERN_ERR "VFS: Not enough space to compose quota message!\n"); err_out: kfree_skb(skb); } EXPORT_SYMBOL(quota_send_warning); static int __init quota_init(void) { if (genl_register_family(&quota_genl_family) != 0) printk(KERN_ERR "VFS: Failed to create quota netlink interface.\n"); return 0; }; fs_initcall(quota_init);
12 1 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2011 Instituto Nokia de Tecnologia * * Authors: * Aloisio Almeida Jr <aloisio.almeida@openbossa.org> * Lauro Ramos Venancio <lauro.venancio@openbossa.org> */ #include <linux/nfc.h> #include <linux/module.h> #include "nfc.h" static DEFINE_RWLOCK(proto_tab_lock); static const struct nfc_protocol *proto_tab[NFC_SOCKPROTO_MAX]; static int nfc_sock_create(struct net *net, struct socket *sock, int proto, int kern) { int rc = -EPROTONOSUPPORT; if (net != &init_net) return -EAFNOSUPPORT; if (proto < 0 || proto >= NFC_SOCKPROTO_MAX) return -EINVAL; read_lock(&proto_tab_lock); if (proto_tab[proto] && try_module_get(proto_tab[proto]->owner)) { rc = proto_tab[proto]->create(net, sock, proto_tab[proto], kern); module_put(proto_tab[proto]->owner); } read_unlock(&proto_tab_lock); return rc; } static const struct net_proto_family nfc_sock_family_ops = { .owner = THIS_MODULE, .family = PF_NFC, .create = nfc_sock_create, }; int nfc_proto_register(const struct nfc_protocol *nfc_proto) { int rc; if (nfc_proto->id < 0 || nfc_proto->id >= NFC_SOCKPROTO_MAX) return -EINVAL; rc = proto_register(nfc_proto->proto, 0); if (rc) return rc; write_lock(&proto_tab_lock); if (proto_tab[nfc_proto->id]) rc = -EBUSY; else proto_tab[nfc_proto->id] = nfc_proto; write_unlock(&proto_tab_lock); if (rc) proto_unregister(nfc_proto->proto); return rc; } EXPORT_SYMBOL(nfc_proto_register); void nfc_proto_unregister(const struct nfc_protocol *nfc_proto) { write_lock(&proto_tab_lock); proto_tab[nfc_proto->id] = NULL; write_unlock(&proto_tab_lock); proto_unregister(nfc_proto->proto); } EXPORT_SYMBOL(nfc_proto_unregister); int __init af_nfc_init(void) { return sock_register(&nfc_sock_family_ops); } void __exit af_nfc_exit(void) { sock_unregister(PF_NFC); }
2361 663 10 10 160 120 120 120 120 47 47 47 42 47 543 542 543 543 543 2169 1894 541 542 541 2162 1864 530 261 1063 1048 543 545 544 544 120 120 120 120 120 120 120 10 10 10 541 546 2437 2440 120 120 120 120 31 33 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 // SPDX-License-Identifier: GPL-2.0 /* * bus.c - bus driver management * * Copyright (c) 2002-3 Patrick Mochel * Copyright (c) 2002-3 Open Source Development Labs * Copyright (c) 2007 Greg Kroah-Hartman <gregkh@suse.de> * Copyright (c) 2007 Novell Inc. * Copyright (c) 2023 Greg Kroah-Hartman <gregkh@linuxfoundation.org> */ #include <linux/async.h> #include <linux/device/bus.h> #include <linux/device.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/string.h> #include <linux/mutex.h> #include <linux/sysfs.h> #include "base.h" #include "power/power.h" /* /sys/devices/system */ static struct kset *system_kset; /* /sys/bus */ static struct kset *bus_kset; #define to_bus_attr(_attr) container_of(_attr, struct bus_attribute, attr) /* * sysfs bindings for drivers */ #define to_drv_attr(_attr) container_of(_attr, struct driver_attribute, attr) #define DRIVER_ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) \ struct driver_attribute driver_attr_##_name = \ __ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) static int __must_check bus_rescan_devices_helper(struct device *dev, void *data); /** * bus_to_subsys - Turn a struct bus_type into a struct subsys_private * * @bus: pointer to the struct bus_type to look up * * The driver core internals needs to work on the subsys_private structure, not * the external struct bus_type pointer. This function walks the list of * registered busses in the system and finds the matching one and returns the * internal struct subsys_private that relates to that bus. * * Note, the reference count of the return value is INCREMENTED if it is not * NULL. A call to subsys_put() must be done when finished with the pointer in * order for it to be properly freed. */ static struct subsys_private *bus_to_subsys(const struct bus_type *bus) { struct subsys_private *sp = NULL; struct kobject *kobj; if (!bus || !bus_kset) return NULL; spin_lock(&bus_kset->list_lock); if (list_empty(&bus_kset->list)) goto done; list_for_each_entry(kobj, &bus_kset->list, entry) { struct kset *kset = container_of(kobj, struct kset, kobj); sp = container_of_const(kset, struct subsys_private, subsys); if (sp->bus == bus) goto done; } sp = NULL; done: sp = subsys_get(sp); spin_unlock(&bus_kset->list_lock); return sp; } static const struct bus_type *bus_get(const struct bus_type *bus) { struct subsys_private *sp = bus_to_subsys(bus); if (sp) return bus; return NULL; } static void bus_put(const struct bus_type *bus) { struct subsys_private *sp = bus_to_subsys(bus); /* two puts are required as the call to bus_to_subsys incremented it again */ subsys_put(sp); subsys_put(sp); } static ssize_t drv_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct driver_attribute *drv_attr = to_drv_attr(attr); struct driver_private *drv_priv = to_driver(kobj); ssize_t ret = -EIO; if (drv_attr->show) ret = drv_attr->show(drv_priv->driver, buf); return ret; } static ssize_t drv_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct driver_attribute *drv_attr = to_drv_attr(attr); struct driver_private *drv_priv = to_driver(kobj); ssize_t ret = -EIO; if (drv_attr->store) ret = drv_attr->store(drv_priv->driver, buf, count); return ret; } static const struct sysfs_ops driver_sysfs_ops = { .show = drv_attr_show, .store = drv_attr_store, }; static void driver_release(struct kobject *kobj) { struct driver_private *drv_priv = to_driver(kobj); pr_debug("driver: '%s': %s\n", kobject_name(kobj), __func__); kfree(drv_priv); } static const struct kobj_type driver_ktype = { .sysfs_ops = &driver_sysfs_ops, .release = driver_release, }; /* * sysfs bindings for buses */ static ssize_t bus_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct bus_attribute *bus_attr = to_bus_attr(attr); struct subsys_private *subsys_priv = to_subsys_private(kobj); ssize_t ret = 0; if (bus_attr->show) ret = bus_attr->show(subsys_priv->bus, buf); return ret; } static ssize_t bus_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct bus_attribute *bus_attr = to_bus_attr(attr); struct subsys_private *subsys_priv = to_subsys_private(kobj); ssize_t ret = 0; if (bus_attr->store) ret = bus_attr->store(subsys_priv->bus, buf, count); return ret; } static const struct sysfs_ops bus_sysfs_ops = { .show = bus_attr_show, .store = bus_attr_store, }; int bus_create_file(const struct bus_type *bus, struct bus_attribute *attr) { struct subsys_private *sp = bus_to_subsys(bus); int error; if (!sp) return -EINVAL; error = sysfs_create_file(&sp->subsys.kobj, &attr->attr); subsys_put(sp); return error; } EXPORT_SYMBOL_GPL(bus_create_file); void bus_remove_file(const struct bus_type *bus, struct bus_attribute *attr) { struct subsys_private *sp = bus_to_subsys(bus); if (!sp) return; sysfs_remove_file(&sp->subsys.kobj, &attr->attr); subsys_put(sp); } EXPORT_SYMBOL_GPL(bus_remove_file); static void bus_release(struct kobject *kobj) { struct subsys_private *priv = to_subsys_private(kobj); lockdep_unregister_key(&priv->lock_key); kfree(priv); } static const struct kobj_type bus_ktype = { .sysfs_ops = &bus_sysfs_ops, .release = bus_release, }; static int bus_uevent_filter(const struct kobject *kobj) { const struct kobj_type *ktype = get_ktype(kobj); if (ktype == &bus_ktype) return 1; return 0; } static const struct kset_uevent_ops bus_uevent_ops = { .filter = bus_uevent_filter, }; /* Manually detach a device from its associated driver. */ static ssize_t unbind_store(struct device_driver *drv, const char *buf, size_t count) { const struct bus_type *bus = bus_get(drv->bus); struct device *dev; int err = -ENODEV; dev = bus_find_device_by_name(bus, NULL, buf); if (dev && dev->driver == drv) { device_driver_detach(dev); err = count; } put_device(dev); bus_put(bus); return err; } static DRIVER_ATTR_IGNORE_LOCKDEP(unbind, 0200, NULL, unbind_store); /* * Manually attach a device to a driver. * Note: the driver must want to bind to the device, * it is not possible to override the driver's id table. */ static ssize_t bind_store(struct device_driver *drv, const char *buf, size_t count) { const struct bus_type *bus = bus_get(drv->bus); struct device *dev; int err = -ENODEV; dev = bus_find_device_by_name(bus, NULL, buf); if (dev && driver_match_device(drv, dev)) { err = device_driver_attach(drv, dev); if (!err) { /* success */ err = count; } } put_device(dev); bus_put(bus); return err; } static DRIVER_ATTR_IGNORE_LOCKDEP(bind, 0200, NULL, bind_store); static ssize_t drivers_autoprobe_show(const struct bus_type *bus, char *buf) { struct subsys_private *sp = bus_to_subsys(bus); int ret; if (!sp) return -EINVAL; ret = sysfs_emit(buf, "%d\n", sp->drivers_autoprobe); subsys_put(sp); return ret; } static ssize_t drivers_autoprobe_store(const struct bus_type *bus, const char *buf, size_t count) { struct subsys_private *sp = bus_to_subsys(bus); if (!sp) return -EINVAL; if (buf[0] == '0') sp->drivers_autoprobe = 0; else sp->drivers_autoprobe = 1; subsys_put(sp); return count; } static ssize_t drivers_probe_store(const struct bus_type *bus, const char *buf, size_t count) { struct device *dev; int err = -EINVAL; dev = bus_find_device_by_name(bus, NULL, buf); if (!dev) return -ENODEV; if (bus_rescan_devices_helper(dev, NULL) == 0) err = count; put_device(dev); return err; } static struct device *next_device(struct klist_iter *i) { struct klist_node *n = klist_next(i); struct device *dev = NULL; struct device_private *dev_prv; if (n) { dev_prv = to_device_private_bus(n); dev = dev_prv->device; } return dev; } /** * bus_for_each_dev - device iterator. * @bus: bus type. * @start: device to start iterating from. * @data: data for the callback. * @fn: function to be called for each device. * * Iterate over @bus's list of devices, and call @fn for each, * passing it @data. If @start is not NULL, we use that device to * begin iterating from. * * We check the return of @fn each time. If it returns anything * other than 0, we break out and return that value. * * NOTE: The device that returns a non-zero value is not retained * in any way, nor is its refcount incremented. If the caller needs * to retain this data, it should do so, and increment the reference * count in the supplied callback. */ int bus_for_each_dev(const struct bus_type *bus, struct device *start, void *data, int (*fn)(struct device *, void *)) { struct subsys_private *sp = bus_to_subsys(bus); struct klist_iter i; struct device *dev; int error = 0; if (!sp) return -EINVAL; klist_iter_init_node(&sp->klist_devices, &i, (start ? &start->p->knode_bus : NULL)); while (!error && (dev = next_device(&i))) error = fn(dev, data); klist_iter_exit(&i); subsys_put(sp); return error; } EXPORT_SYMBOL_GPL(bus_for_each_dev); /** * bus_find_device - device iterator for locating a particular device. * @bus: bus type * @start: Device to begin with * @data: Data to pass to match function * @match: Callback function to check device * * This is similar to the bus_for_each_dev() function above, but it * returns a reference to a device that is 'found' for later use, as * determined by the @match callback. * * The callback should return 0 if the device doesn't match and non-zero * if it does. If the callback returns non-zero, this function will * return to the caller and not iterate over any more devices. */ struct device *bus_find_device(const struct bus_type *bus, struct device *start, const void *data, int (*match)(struct device *dev, const void *data)) { struct subsys_private *sp = bus_to_subsys(bus); struct klist_iter i; struct device *dev; if (!sp) return NULL; klist_iter_init_node(&sp->klist_devices, &i, (start ? &start->p->knode_bus : NULL)); while ((dev = next_device(&i))) if (match(dev, data) && get_device(dev)) break; klist_iter_exit(&i); subsys_put(sp); return dev; } EXPORT_SYMBOL_GPL(bus_find_device); static struct device_driver *next_driver(struct klist_iter *i) { struct klist_node *n = klist_next(i); struct driver_private *drv_priv; if (n) { drv_priv = container_of(n, struct driver_private, knode_bus); return drv_priv->driver; } return NULL; } /** * bus_for_each_drv - driver iterator * @bus: bus we're dealing with. * @start: driver to start iterating on. * @data: data to pass to the callback. * @fn: function to call for each driver. * * This is nearly identical to the device iterator above. * We iterate over each driver that belongs to @bus, and call * @fn for each. If @fn returns anything but 0, we break out * and return it. If @start is not NULL, we use it as the head * of the list. * * NOTE: we don't return the driver that returns a non-zero * value, nor do we leave the reference count incremented for that * driver. If the caller needs to know that info, it must set it * in the callback. It must also be sure to increment the refcount * so it doesn't disappear before returning to the caller. */ int bus_for_each_drv(const struct bus_type *bus, struct device_driver *start, void *data, int (*fn)(struct device_driver *, void *)) { struct subsys_private *sp = bus_to_subsys(bus); struct klist_iter i; struct device_driver *drv; int error = 0; if (!sp) return -EINVAL; klist_iter_init_node(&sp->klist_drivers, &i, start ? &start->p->knode_bus : NULL); while ((drv = next_driver(&i)) && !error) error = fn(drv, data); klist_iter_exit(&i); subsys_put(sp); return error; } EXPORT_SYMBOL_GPL(bus_for_each_drv); /** * bus_add_device - add device to bus * @dev: device being added * * - Add device's bus attributes. * - Create links to device's bus. * - Add the device to its bus's list of devices. */ int bus_add_device(struct device *dev) { struct subsys_private *sp = bus_to_subsys(dev->bus); int error; if (!sp) { /* * This is a normal operation for many devices that do not * have a bus assigned to them, just say that all went * well. */ return 0; } /* * Reference in sp is now incremented and will be dropped when * the device is removed from the bus */ pr_debug("bus: '%s': add device %s\n", sp->bus->name, dev_name(dev)); error = device_add_groups(dev, sp->bus->dev_groups); if (error) goto out_put; error = sysfs_create_link(&sp->devices_kset->kobj, &dev->kobj, dev_name(dev)); if (error) goto out_groups; error = sysfs_create_link(&dev->kobj, &sp->subsys.kobj, "subsystem"); if (error) goto out_subsys; klist_add_tail(&dev->p->knode_bus, &sp->klist_devices); return 0; out_subsys: sysfs_remove_link(&sp->devices_kset->kobj, dev_name(dev)); out_groups: device_remove_groups(dev, sp->bus->dev_groups); out_put: subsys_put(sp); return error; } /** * bus_probe_device - probe drivers for a new device * @dev: device to probe * * - Automatically probe for a driver if the bus allows it. */ void bus_probe_device(struct device *dev) { struct subsys_private *sp = bus_to_subsys(dev->bus); struct subsys_interface *sif; if (!sp) return; if (sp->drivers_autoprobe) device_initial_probe(dev); mutex_lock(&sp->mutex); list_for_each_entry(sif, &sp->interfaces, node) if (sif->add_dev) sif->add_dev(dev, sif); mutex_unlock(&sp->mutex); subsys_put(sp); } /** * bus_remove_device - remove device from bus * @dev: device to be removed * * - Remove device from all interfaces. * - Remove symlink from bus' directory. * - Delete device from bus's list. * - Detach from its driver. * - Drop reference taken in bus_add_device(). */ void bus_remove_device(struct device *dev) { struct subsys_private *sp = bus_to_subsys(dev->bus); struct subsys_interface *sif; if (!sp) return; mutex_lock(&sp->mutex); list_for_each_entry(sif, &sp->interfaces, node) if (sif->remove_dev) sif->remove_dev(dev, sif); mutex_unlock(&sp->mutex); sysfs_remove_link(&dev->kobj, "subsystem"); sysfs_remove_link(&sp->devices_kset->kobj, dev_name(dev)); device_remove_groups(dev, dev->bus->dev_groups); if (klist_node_attached(&dev->p->knode_bus)) klist_del(&dev->p->knode_bus); pr_debug("bus: '%s': remove device %s\n", dev->bus->name, dev_name(dev)); device_release_driver(dev); /* * Decrement the reference count twice, once for the bus_to_subsys() * call in the start of this function, and the second one from the * reference increment in bus_add_device() */ subsys_put(sp); subsys_put(sp); } static int __must_check add_bind_files(struct device_driver *drv) { int ret; ret = driver_create_file(drv, &driver_attr_unbind); if (ret == 0) { ret = driver_create_file(drv, &driver_attr_bind); if (ret) driver_remove_file(drv, &driver_attr_unbind); } return ret; } static void remove_bind_files(struct device_driver *drv) { driver_remove_file(drv, &driver_attr_bind); driver_remove_file(drv, &driver_attr_unbind); } static BUS_ATTR_WO(drivers_probe); static BUS_ATTR_RW(drivers_autoprobe); static int add_probe_files(const struct bus_type *bus) { int retval; retval = bus_create_file(bus, &bus_attr_drivers_probe); if (retval) goto out; retval = bus_create_file(bus, &bus_attr_drivers_autoprobe); if (retval) bus_remove_file(bus, &bus_attr_drivers_probe); out: return retval; } static void remove_probe_files(const struct bus_type *bus) { bus_remove_file(bus, &bus_attr_drivers_autoprobe); bus_remove_file(bus, &bus_attr_drivers_probe); } static ssize_t uevent_store(struct device_driver *drv, const char *buf, size_t count) { int rc; rc = kobject_synth_uevent(&drv->p->kobj, buf, count); return rc ? rc : count; } static DRIVER_ATTR_WO(uevent); /** * bus_add_driver - Add a driver to the bus. * @drv: driver. */ int bus_add_driver(struct device_driver *drv) { struct subsys_private *sp = bus_to_subsys(drv->bus); struct driver_private *priv; int error = 0; if (!sp) return -EINVAL; /* * Reference in sp is now incremented and will be dropped when * the driver is removed from the bus */ pr_debug("bus: '%s': add driver %s\n", sp->bus->name, drv->name); priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) { error = -ENOMEM; goto out_put_bus; } klist_init(&priv->klist_devices, NULL, NULL); priv->driver = drv; drv->p = priv; priv->kobj.kset = sp->drivers_kset; error = kobject_init_and_add(&priv->kobj, &driver_ktype, NULL, "%s", drv->name); if (error) goto out_unregister; klist_add_tail(&priv->knode_bus, &sp->klist_drivers); if (sp->drivers_autoprobe) { error = driver_attach(drv); if (error) goto out_del_list; } module_add_driver(drv->owner, drv); error = driver_create_file(drv, &driver_attr_uevent); if (error) { printk(KERN_ERR "%s: uevent attr (%s) failed\n", __func__, drv->name); } error = driver_add_groups(drv, sp->bus->drv_groups); if (error) { /* How the hell do we get out of this pickle? Give up */ printk(KERN_ERR "%s: driver_add_groups(%s) failed\n", __func__, drv->name); } if (!drv->suppress_bind_attrs) { error = add_bind_files(drv); if (error) { /* Ditto */ printk(KERN_ERR "%s: add_bind_files(%s) failed\n", __func__, drv->name); } } return 0; out_del_list: klist_del(&priv->knode_bus); out_unregister: kobject_put(&priv->kobj); /* drv->p is freed in driver_release() */ drv->p = NULL; out_put_bus: subsys_put(sp); return error; } /** * bus_remove_driver - delete driver from bus's knowledge. * @drv: driver. * * Detach the driver from the devices it controls, and remove * it from its bus's list of drivers. Finally, we drop the reference * to the bus we took in bus_add_driver(). */ void bus_remove_driver(struct device_driver *drv) { struct subsys_private *sp = bus_to_subsys(drv->bus); if (!sp) return; pr_debug("bus: '%s': remove driver %s\n", sp->bus->name, drv->name); if (!drv->suppress_bind_attrs) remove_bind_files(drv); driver_remove_groups(drv, sp->bus->drv_groups); driver_remove_file(drv, &driver_attr_uevent); klist_remove(&drv->p->knode_bus); driver_detach(drv); module_remove_driver(drv); kobject_put(&drv->p->kobj); /* * Decrement the reference count twice, once for the bus_to_subsys() * call in the start of this function, and the second one from the * reference increment in bus_add_driver() */ subsys_put(sp); subsys_put(sp); } /* Helper for bus_rescan_devices's iter */ static int __must_check bus_rescan_devices_helper(struct device *dev, void *data) { int ret = 0; if (!dev->driver) { if (dev->parent && dev->bus->need_parent_lock) device_lock(dev->parent); ret = device_attach(dev); if (dev->parent && dev->bus->need_parent_lock) device_unlock(dev->parent); } return ret < 0 ? ret : 0; } /** * bus_rescan_devices - rescan devices on the bus for possible drivers * @bus: the bus to scan. * * This function will look for devices on the bus with no driver * attached and rescan it against existing drivers to see if it matches * any by calling device_attach() for the unbound devices. */ int bus_rescan_devices(const struct bus_type *bus) { return bus_for_each_dev(bus, NULL, NULL, bus_rescan_devices_helper); } EXPORT_SYMBOL_GPL(bus_rescan_devices); /** * device_reprobe - remove driver for a device and probe for a new driver * @dev: the device to reprobe * * This function detaches the attached driver (if any) for the given * device and restarts the driver probing process. It is intended * to use if probing criteria changed during a devices lifetime and * driver attachment should change accordingly. */ int device_reprobe(struct device *dev) { if (dev->driver) device_driver_detach(dev); return bus_rescan_devices_helper(dev, NULL); } EXPORT_SYMBOL_GPL(device_reprobe); static void klist_devices_get(struct klist_node *n) { struct device_private *dev_prv = to_device_private_bus(n); struct device *dev = dev_prv->device; get_device(dev); } static void klist_devices_put(struct klist_node *n) { struct device_private *dev_prv = to_device_private_bus(n); struct device *dev = dev_prv->device; put_device(dev); } static ssize_t bus_uevent_store(const struct bus_type *bus, const char *buf, size_t count) { struct subsys_private *sp = bus_to_subsys(bus); int ret; if (!sp) return -EINVAL; ret = kobject_synth_uevent(&sp->subsys.kobj, buf, count); subsys_put(sp); if (ret) return ret; return count; } /* * "open code" the old BUS_ATTR() macro here. We want to use BUS_ATTR_WO() * here, but can not use it as earlier in the file we have * DEVICE_ATTR_WO(uevent), which would cause a clash with the with the store * function name. */ static struct bus_attribute bus_attr_uevent = __ATTR(uevent, 0200, NULL, bus_uevent_store); /** * bus_register - register a driver-core subsystem * @bus: bus to register * * Once we have that, we register the bus with the kobject * infrastructure, then register the children subsystems it has: * the devices and drivers that belong to the subsystem. */ int bus_register(const struct bus_type *bus) { int retval; struct subsys_private *priv; struct kobject *bus_kobj; struct lock_class_key *key; priv = kzalloc(sizeof(struct subsys_private), GFP_KERNEL); if (!priv) return -ENOMEM; priv->bus = bus; BLOCKING_INIT_NOTIFIER_HEAD(&priv->bus_notifier); bus_kobj = &priv->subsys.kobj; retval = kobject_set_name(bus_kobj, "%s", bus->name); if (retval) goto out; bus_kobj->kset = bus_kset; bus_kobj->ktype = &bus_ktype; priv->drivers_autoprobe = 1; retval = kset_register(&priv->subsys); if (retval) goto out; retval = bus_create_file(bus, &bus_attr_uevent); if (retval) goto bus_uevent_fail; priv->devices_kset = kset_create_and_add("devices", NULL, bus_kobj); if (!priv->devices_kset) { retval = -ENOMEM; goto bus_devices_fail; } priv->drivers_kset = kset_create_and_add("drivers", NULL, bus_kobj); if (!priv->drivers_kset) { retval = -ENOMEM; goto bus_drivers_fail; } INIT_LIST_HEAD(&priv->interfaces); key = &priv->lock_key; lockdep_register_key(key); __mutex_init(&priv->mutex, "subsys mutex", key); klist_init(&priv->klist_devices, klist_devices_get, klist_devices_put); klist_init(&priv->klist_drivers, NULL, NULL); retval = add_probe_files(bus); if (retval) goto bus_probe_files_fail; retval = sysfs_create_groups(bus_kobj, bus->bus_groups); if (retval) goto bus_groups_fail; pr_debug("bus: '%s': registered\n", bus->name); return 0; bus_groups_fail: remove_probe_files(bus); bus_probe_files_fail: kset_unregister(priv->drivers_kset); bus_drivers_fail: kset_unregister(priv->devices_kset); bus_devices_fail: bus_remove_file(bus, &bus_attr_uevent); bus_uevent_fail: kset_unregister(&priv->subsys); out: kfree(priv); return retval; } EXPORT_SYMBOL_GPL(bus_register); /** * bus_unregister - remove a bus from the system * @bus: bus. * * Unregister the child subsystems and the bus itself. * Finally, we call bus_put() to release the refcount */ void bus_unregister(const struct bus_type *bus) { struct subsys_private *sp = bus_to_subsys(bus); struct kobject *bus_kobj; if (!sp) return; pr_debug("bus: '%s': unregistering\n", bus->name); if (sp->dev_root) device_unregister(sp->dev_root); bus_kobj = &sp->subsys.kobj; sysfs_remove_groups(bus_kobj, bus->bus_groups); remove_probe_files(bus); bus_remove_file(bus, &bus_attr_uevent); kset_unregister(sp->drivers_kset); kset_unregister(sp->devices_kset); kset_unregister(&sp->subsys); subsys_put(sp); } EXPORT_SYMBOL_GPL(bus_unregister); int bus_register_notifier(const struct bus_type *bus, struct notifier_block *nb) { struct subsys_private *sp = bus_to_subsys(bus); int retval; if (!sp) return -EINVAL; retval = blocking_notifier_chain_register(&sp->bus_notifier, nb); subsys_put(sp); return retval; } EXPORT_SYMBOL_GPL(bus_register_notifier); int bus_unregister_notifier(const struct bus_type *bus, struct notifier_block *nb) { struct subsys_private *sp = bus_to_subsys(bus); int retval; if (!sp) return -EINVAL; retval = blocking_notifier_chain_unregister(&sp->bus_notifier, nb); subsys_put(sp); return retval; } EXPORT_SYMBOL_GPL(bus_unregister_notifier); void bus_notify(struct device *dev, enum bus_notifier_event value) { struct subsys_private *sp = bus_to_subsys(dev->bus); if (!sp) return; blocking_notifier_call_chain(&sp->bus_notifier, value, dev); subsys_put(sp); } struct kset *bus_get_kset(const struct bus_type *bus) { struct subsys_private *sp = bus_to_subsys(bus); struct kset *kset; if (!sp) return NULL; kset = &sp->subsys; subsys_put(sp); return kset; } EXPORT_SYMBOL_GPL(bus_get_kset); /* * Yes, this forcibly breaks the klist abstraction temporarily. It * just wants to sort the klist, not change reference counts and * take/drop locks rapidly in the process. It does all this while * holding the lock for the list, so objects can't otherwise be * added/removed while we're swizzling. */ static void device_insertion_sort_klist(struct device *a, struct list_head *list, int (*compare)(const struct device *a, const struct device *b)) { struct klist_node *n; struct device_private *dev_prv; struct device *b; list_for_each_entry(n, list, n_node) { dev_prv = to_device_private_bus(n); b = dev_prv->device; if (compare(a, b) <= 0) { list_move_tail(&a->p->knode_bus.n_node, &b->p->knode_bus.n_node); return; } } list_move_tail(&a->p->knode_bus.n_node, list); } void bus_sort_breadthfirst(const struct bus_type *bus, int (*compare)(const struct device *a, const struct device *b)) { struct subsys_private *sp = bus_to_subsys(bus); LIST_HEAD(sorted_devices); struct klist_node *n, *tmp; struct device_private *dev_prv; struct device *dev; struct klist *device_klist; if (!sp) return; device_klist = &sp->klist_devices; spin_lock(&device_klist->k_lock); list_for_each_entry_safe(n, tmp, &device_klist->k_list, n_node) { dev_prv = to_device_private_bus(n); dev = dev_prv->device; device_insertion_sort_klist(dev, &sorted_devices, compare); } list_splice(&sorted_devices, &device_klist->k_list); spin_unlock(&device_klist->k_lock); subsys_put(sp); } EXPORT_SYMBOL_GPL(bus_sort_breadthfirst); struct subsys_dev_iter { struct klist_iter ki; const struct device_type *type; }; /** * subsys_dev_iter_init - initialize subsys device iterator * @iter: subsys iterator to initialize * @sp: the subsys private (i.e. bus) we wanna iterate over * @start: the device to start iterating from, if any * @type: device_type of the devices to iterate over, NULL for all * * Initialize subsys iterator @iter such that it iterates over devices * of @subsys. If @start is set, the list iteration will start there, * otherwise if it is NULL, the iteration starts at the beginning of * the list. */ static void subsys_dev_iter_init(struct subsys_dev_iter *iter, struct subsys_private *sp, struct device *start, const struct device_type *type) { struct klist_node *start_knode = NULL; if (start) start_knode = &start->p->knode_bus; klist_iter_init_node(&sp->klist_devices, &iter->ki, start_knode); iter->type = type; } /** * subsys_dev_iter_next - iterate to the next device * @iter: subsys iterator to proceed * * Proceed @iter to the next device and return it. Returns NULL if * iteration is complete. * * The returned device is referenced and won't be released till * iterator is proceed to the next device or exited. The caller is * free to do whatever it wants to do with the device including * calling back into subsys code. */ static struct device *subsys_dev_iter_next(struct subsys_dev_iter *iter) { struct klist_node *knode; struct device *dev; for (;;) { knode = klist_next(&iter->ki); if (!knode) return NULL; dev = to_device_private_bus(knode)->device; if (!iter->type || iter->type == dev->type) return dev; } } /** * subsys_dev_iter_exit - finish iteration * @iter: subsys iterator to finish * * Finish an iteration. Always call this function after iteration is * complete whether the iteration ran till the end or not. */ static void subsys_dev_iter_exit(struct subsys_dev_iter *iter) { klist_iter_exit(&iter->ki); } int subsys_interface_register(struct subsys_interface *sif) { struct subsys_private *sp; struct subsys_dev_iter iter; struct device *dev; if (!sif || !sif->subsys) return -ENODEV; sp = bus_to_subsys(sif->subsys); if (!sp) return -EINVAL; /* * Reference in sp is now incremented and will be dropped when * the interface is removed from the bus */ mutex_lock(&sp->mutex); list_add_tail(&sif->node, &sp->interfaces); if (sif->add_dev) { subsys_dev_iter_init(&iter, sp, NULL, NULL); while ((dev = subsys_dev_iter_next(&iter))) sif->add_dev(dev, sif); subsys_dev_iter_exit(&iter); } mutex_unlock(&sp->mutex); return 0; } EXPORT_SYMBOL_GPL(subsys_interface_register); void subsys_interface_unregister(struct subsys_interface *sif) { struct subsys_private *sp; struct subsys_dev_iter iter; struct device *dev; if (!sif || !sif->subsys) return; sp = bus_to_subsys(sif->subsys); if (!sp) return; mutex_lock(&sp->mutex); list_del_init(&sif->node); if (sif->remove_dev) { subsys_dev_iter_init(&iter, sp, NULL, NULL); while ((dev = subsys_dev_iter_next(&iter))) sif->remove_dev(dev, sif); subsys_dev_iter_exit(&iter); } mutex_unlock(&sp->mutex); /* * Decrement the reference count twice, once for the bus_to_subsys() * call in the start of this function, and the second one from the * reference increment in subsys_interface_register() */ subsys_put(sp); subsys_put(sp); } EXPORT_SYMBOL_GPL(subsys_interface_unregister); static void system_root_device_release(struct device *dev) { kfree(dev); } static int subsys_register(const struct bus_type *subsys, const struct attribute_group **groups, struct kobject *parent_of_root) { struct subsys_private *sp; struct device *dev; int err; err = bus_register(subsys); if (err < 0) return err; sp = bus_to_subsys(subsys); if (!sp) { err = -EINVAL; goto err_sp; } dev = kzalloc(sizeof(struct device), GFP_KERNEL); if (!dev) { err = -ENOMEM; goto err_dev; } err = dev_set_name(dev, "%s", subsys->name); if (err < 0) goto err_name; dev->kobj.parent = parent_of_root; dev->groups = groups; dev->release = system_root_device_release; err = device_register(dev); if (err < 0) goto err_dev_reg; sp->dev_root = dev; subsys_put(sp); return 0; err_dev_reg: put_device(dev); dev = NULL; err_name: kfree(dev); err_dev: subsys_put(sp); err_sp: bus_unregister(subsys); return err; } /** * subsys_system_register - register a subsystem at /sys/devices/system/ * @subsys: system subsystem * @groups: default attributes for the root device * * All 'system' subsystems have a /sys/devices/system/<name> root device * with the name of the subsystem. The root device can carry subsystem- * wide attributes. All registered devices are below this single root * device and are named after the subsystem with a simple enumeration * number appended. The registered devices are not explicitly named; * only 'id' in the device needs to be set. * * Do not use this interface for anything new, it exists for compatibility * with bad ideas only. New subsystems should use plain subsystems; and * add the subsystem-wide attributes should be added to the subsystem * directory itself and not some create fake root-device placed in * /sys/devices/system/<name>. */ int subsys_system_register(const struct bus_type *subsys, const struct attribute_group **groups) { return subsys_register(subsys, groups, &system_kset->kobj); } EXPORT_SYMBOL_GPL(subsys_system_register); /** * subsys_virtual_register - register a subsystem at /sys/devices/virtual/ * @subsys: virtual subsystem * @groups: default attributes for the root device * * All 'virtual' subsystems have a /sys/devices/system/<name> root device * with the name of the subystem. The root device can carry subsystem-wide * attributes. All registered devices are below this single root device. * There's no restriction on device naming. This is for kernel software * constructs which need sysfs interface. */ int subsys_virtual_register(const struct bus_type *subsys, const struct attribute_group **groups) { struct kobject *virtual_dir; virtual_dir = virtual_device_parent(NULL); if (!virtual_dir) return -ENOMEM; return subsys_register(subsys, groups, virtual_dir); } EXPORT_SYMBOL_GPL(subsys_virtual_register); /** * driver_find - locate driver on a bus by its name. * @name: name of the driver. * @bus: bus to scan for the driver. * * Call kset_find_obj() to iterate over list of drivers on * a bus to find driver by name. Return driver if found. * * This routine provides no locking to prevent the driver it returns * from being unregistered or unloaded while the caller is using it. * The caller is responsible for preventing this. */ struct device_driver *driver_find(const char *name, const struct bus_type *bus) { struct subsys_private *sp = bus_to_subsys(bus); struct kobject *k; struct driver_private *priv; if (!sp) return NULL; k = kset_find_obj(sp->drivers_kset, name); subsys_put(sp); if (!k) return NULL; priv = to_driver(k); /* Drop reference added by kset_find_obj() */ kobject_put(k); return priv->driver; } EXPORT_SYMBOL_GPL(driver_find); /* * Warning, the value could go to "removed" instantly after calling this function, so be very * careful when calling it... */ bool bus_is_registered(const struct bus_type *bus) { struct subsys_private *sp = bus_to_subsys(bus); bool is_initialized = false; if (sp) { is_initialized = true; subsys_put(sp); } return is_initialized; } /** * bus_get_dev_root - return a pointer to the "device root" of a bus * @bus: bus to return the device root of. * * If a bus has a "device root" structure, return it, WITH THE REFERENCE * COUNT INCREMENTED. * * Note, when finished with the device, a call to put_device() is required. * * If the device root is not present (or bus is not a valid pointer), NULL * will be returned. */ struct device *bus_get_dev_root(const struct bus_type *bus) { struct subsys_private *sp = bus_to_subsys(bus); struct device *dev_root; if (!sp) return NULL; dev_root = get_device(sp->dev_root); subsys_put(sp); return dev_root; } EXPORT_SYMBOL_GPL(bus_get_dev_root); int __init buses_init(void) { bus_kset = kset_create_and_add("bus", &bus_uevent_ops, NULL); if (!bus_kset) return -ENOMEM; system_kset = kset_create_and_add("system", NULL, &devices_kset->kobj); if (!system_kset) return -ENOMEM; return 0; }
351 350 125 174 190 471 26 49 188 67 22 66 297 307 602 601 212 492 31 42 357 357 1 212 492 200 201 200 202 202 201 202 76 33 4 25 6 24 2 42 43 27 6 20 4 43 42 42 31 31 4 60 54 9 7 7 54 171 156 40 12 27 165 165 155 26 165 156 25 299 299 299 299 299 299 293 292 15 1 2 2 1 4 69 69 69 68 52 18 18 59 11 59 69 25 25 298 298 103 54 47 7 54 54 162 162 27 1 6 2 3 2 234 602 106 133 133 46 87 267 186 517 517 519 132 292 174 174 174 12 165 172 174 156 174 156 42 454 168 26 143 504 228 293 292 291 293 293 20 293 293 293 293 293 293 292 293 291 172 124 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Generic INET transport hashtables * * Authors: Lotsa people, from code originally in tcp */ #include <linux/module.h> #include <linux/random.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/wait.h> #include <linux/vmalloc.h> #include <linux/memblock.h> #include <net/addrconf.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/inet6_hashtables.h> #endif #include <net/secure_seq.h> #include <net/ip.h> #include <net/tcp.h> #include <net/sock_reuseport.h> u32 inet_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, const __be32 faddr, const __be16 fport) { static u32 inet_ehash_secret __read_mostly; net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret)); return __inet_ehashfn(laddr, lport, faddr, fport, inet_ehash_secret + net_hash_mix(net)); } EXPORT_SYMBOL_GPL(inet_ehashfn); /* This function handles inet_sock, but also timewait and request sockets * for IPv4/IPv6. */ static u32 sk_ehashfn(const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6 && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) return inet6_ehashfn(sock_net(sk), &sk->sk_v6_rcv_saddr, sk->sk_num, &sk->sk_v6_daddr, sk->sk_dport); #endif return inet_ehashfn(sock_net(sk), sk->sk_rcv_saddr, sk->sk_num, sk->sk_daddr, sk->sk_dport); } /* * Allocate and initialize a new local port bind bucket. * The bindhash mutex for snum's hash chain must be held here. */ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, const unsigned short snum, int l3mdev) { struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); if (tb) { write_pnet(&tb->ib_net, net); tb->l3mdev = l3mdev; tb->port = snum; tb->fastreuse = 0; tb->fastreuseport = 0; INIT_HLIST_HEAD(&tb->bhash2); hlist_add_head(&tb->node, &head->chain); } return tb; } /* * Caller must hold hashbucket lock for this tb with local BH disabled */ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb) { if (hlist_empty(&tb->bhash2)) { __hlist_del(&tb->node); kmem_cache_free(cachep, tb); } } bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net, unsigned short port, int l3mdev) { return net_eq(ib_net(tb), net) && tb->port == port && tb->l3mdev == l3mdev; } static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb2, struct net *net, struct inet_bind_hashbucket *head, struct inet_bind_bucket *tb, const struct sock *sk) { write_pnet(&tb2->ib_net, net); tb2->l3mdev = tb->l3mdev; tb2->port = tb->port; #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(USHRT_MAX < (IPV6_ADDR_ANY | IPV6_ADDR_MAPPED)); if (sk->sk_family == AF_INET6) { tb2->addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); tb2->v6_rcv_saddr = sk->sk_v6_rcv_saddr; } else { tb2->addr_type = IPV6_ADDR_MAPPED; ipv6_addr_set_v4mapped(sk->sk_rcv_saddr, &tb2->v6_rcv_saddr); } #else tb2->rcv_saddr = sk->sk_rcv_saddr; #endif INIT_HLIST_HEAD(&tb2->owners); hlist_add_head(&tb2->node, &head->chain); hlist_add_head(&tb2->bhash_node, &tb->bhash2); } struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, struct inet_bind_bucket *tb, const struct sock *sk) { struct inet_bind2_bucket *tb2 = kmem_cache_alloc(cachep, GFP_ATOMIC); if (tb2) inet_bind2_bucket_init(tb2, net, head, tb, sk); return tb2; } /* Caller must hold hashbucket lock for this tb with local BH disabled */ void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb) { if (hlist_empty(&tb->owners)) { __hlist_del(&tb->node); __hlist_del(&tb->bhash_node); kmem_cache_free(cachep, tb); } } static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2, const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) return ipv6_addr_equal(&tb2->v6_rcv_saddr, &sk->sk_v6_rcv_saddr); if (tb2->addr_type != IPV6_ADDR_MAPPED) return false; #endif return tb2->rcv_saddr == sk->sk_rcv_saddr; } void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, struct inet_bind2_bucket *tb2, unsigned short port) { inet_sk(sk)->inet_num = port; inet_csk(sk)->icsk_bind_hash = tb; inet_csk(sk)->icsk_bind2_hash = tb2; sk_add_bind_node(sk, &tb2->owners); } /* * Get rid of any references to a local port held by the given sock. */ static void __inet_put_port(struct sock *sk) { struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); struct inet_bind_hashbucket *head, *head2; struct net *net = sock_net(sk); struct inet_bind_bucket *tb; int bhash; bhash = inet_bhashfn(net, inet_sk(sk)->inet_num, hashinfo->bhash_size); head = &hashinfo->bhash[bhash]; head2 = inet_bhashfn_portaddr(hashinfo, sk, net, inet_sk(sk)->inet_num); spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; inet_csk(sk)->icsk_bind_hash = NULL; inet_sk(sk)->inet_num = 0; spin_lock(&head2->lock); if (inet_csk(sk)->icsk_bind2_hash) { struct inet_bind2_bucket *tb2 = inet_csk(sk)->icsk_bind2_hash; __sk_del_bind_node(sk); inet_csk(sk)->icsk_bind2_hash = NULL; inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2); } spin_unlock(&head2->lock); inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); spin_unlock(&head->lock); } void inet_put_port(struct sock *sk) { local_bh_disable(); __inet_put_port(sk); local_bh_enable(); } EXPORT_SYMBOL(inet_put_port); int __inet_inherit_port(const struct sock *sk, struct sock *child) { struct inet_hashinfo *table = tcp_or_dccp_get_hashinfo(sk); unsigned short port = inet_sk(child)->inet_num; struct inet_bind_hashbucket *head, *head2; bool created_inet_bind_bucket = false; struct net *net = sock_net(sk); bool update_fastreuse = false; struct inet_bind2_bucket *tb2; struct inet_bind_bucket *tb; int bhash, l3mdev; bhash = inet_bhashfn(net, port, table->bhash_size); head = &table->bhash[bhash]; head2 = inet_bhashfn_portaddr(table, child, net, port); spin_lock(&head->lock); spin_lock(&head2->lock); tb = inet_csk(sk)->icsk_bind_hash; tb2 = inet_csk(sk)->icsk_bind2_hash; if (unlikely(!tb || !tb2)) { spin_unlock(&head2->lock); spin_unlock(&head->lock); return -ENOENT; } if (tb->port != port) { l3mdev = inet_sk_bound_l3mdev(sk); /* NOTE: using tproxy and redirecting skbs to a proxy * on a different listener port breaks the assumption * that the listener socket's icsk_bind_hash is the same * as that of the child socket. We have to look up or * create a new bind bucket for the child here. */ inet_bind_bucket_for_each(tb, &head->chain) { if (inet_bind_bucket_match(tb, net, port, l3mdev)) break; } if (!tb) { tb = inet_bind_bucket_create(table->bind_bucket_cachep, net, head, port, l3mdev); if (!tb) { spin_unlock(&head2->lock); spin_unlock(&head->lock); return -ENOMEM; } created_inet_bind_bucket = true; } update_fastreuse = true; goto bhash2_find; } else if (!inet_bind2_bucket_addr_match(tb2, child)) { l3mdev = inet_sk_bound_l3mdev(sk); bhash2_find: tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, child); if (!tb2) { tb2 = inet_bind2_bucket_create(table->bind2_bucket_cachep, net, head2, tb, child); if (!tb2) goto error; } } if (update_fastreuse) inet_csk_update_fastreuse(tb, child); inet_bind_hash(child, tb, tb2, port); spin_unlock(&head2->lock); spin_unlock(&head->lock); return 0; error: if (created_inet_bind_bucket) inet_bind_bucket_destroy(table->bind_bucket_cachep, tb); spin_unlock(&head2->lock); spin_unlock(&head->lock); return -ENOMEM; } EXPORT_SYMBOL_GPL(__inet_inherit_port); static struct inet_listen_hashbucket * inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk) { u32 hash; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) hash = ipv6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, inet_sk(sk)->inet_num); else #endif hash = ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, inet_sk(sk)->inet_num); return inet_lhash2_bucket(h, hash); } static inline int compute_score(struct sock *sk, struct net *net, const unsigned short hnum, const __be32 daddr, const int dif, const int sdif) { int score = -1; if (net_eq(sock_net(sk), net) && sk->sk_num == hnum && !ipv6_only_sock(sk)) { if (sk->sk_rcv_saddr != daddr) return -1; if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return -1; score = sk->sk_bound_dev_if ? 2 : 1; if (sk->sk_family == PF_INET) score++; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; } return score; } /** * inet_lookup_reuseport() - execute reuseport logic on AF_INET socket if necessary. * @net: network namespace. * @sk: AF_INET socket, must be in TCP_LISTEN state for TCP or TCP_CLOSE for UDP. * @skb: context for a potential SK_REUSEPORT program. * @doff: header offset. * @saddr: source address. * @sport: source port. * @daddr: destination address. * @hnum: destination port in host byte order. * @ehashfn: hash function used to generate the fallback hash. * * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to * the selected sock or an error. */ struct sock *inet_lookup_reuseport(struct net *net, struct sock *sk, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, unsigned short hnum, inet_ehashfn_t *ehashfn) { struct sock *reuse_sk = NULL; u32 phash; if (sk->sk_reuseport) { phash = INDIRECT_CALL_2(ehashfn, udp_ehashfn, inet_ehashfn, net, daddr, hnum, saddr, sport); reuse_sk = reuseport_select_sock(sk, phash, skb, doff); } return reuse_sk; } EXPORT_SYMBOL_GPL(inet_lookup_reuseport); /* * Here are some nice properties to exploit here. The BSD API * does not allow a listening sock to specify the remote port nor the * remote address for the connection. So always assume those are both * wildcarded during the search since they can never be otherwise. */ /* called with rcu_read_lock() : No refcount taken on the socket */ static struct sock *inet_lhash2_lookup(struct net *net, struct inet_listen_hashbucket *ilb2, struct sk_buff *skb, int doff, const __be32 saddr, __be16 sport, const __be32 daddr, const unsigned short hnum, const int dif, const int sdif) { struct sock *sk, *result = NULL; struct hlist_nulls_node *node; int score, hiscore = 0; sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) { score = compute_score(sk, net, hnum, daddr, dif, sdif); if (score > hiscore) { result = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum, inet_ehashfn); if (result) return result; result = sk; hiscore = score; } } return result; } struct sock *inet_lookup_run_sk_lookup(struct net *net, int protocol, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, u16 hnum, const int dif, inet_ehashfn_t *ehashfn) { struct sock *sk, *reuse_sk; bool no_reuseport; no_reuseport = bpf_sk_lookup_run_v4(net, protocol, saddr, sport, daddr, hnum, dif, &sk); if (no_reuseport || IS_ERR_OR_NULL(sk)) return sk; reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum, ehashfn); if (reuse_sk) sk = reuse_sk; return sk; } struct sock *__inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, __be16 sport, const __be32 daddr, const unsigned short hnum, const int dif, const int sdif) { struct inet_listen_hashbucket *ilb2; struct sock *result = NULL; unsigned int hash2; /* Lookup redirect from BPF */ if (static_branch_unlikely(&bpf_sk_lookup_enabled) && hashinfo == net->ipv4.tcp_death_row.hashinfo) { result = inet_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff, saddr, sport, daddr, hnum, dif, inet_ehashfn); if (result) goto done; } hash2 = ipv4_portaddr_hash(net, daddr, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); result = inet_lhash2_lookup(net, ilb2, skb, doff, saddr, sport, daddr, hnum, dif, sdif); if (result) goto done; /* Lookup lhash2 with INADDR_ANY */ hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); result = inet_lhash2_lookup(net, ilb2, skb, doff, saddr, sport, htonl(INADDR_ANY), hnum, dif, sdif); done: if (IS_ERR(result)) return NULL; return result; } EXPORT_SYMBOL_GPL(__inet_lookup_listener); /* All sockets share common refcount, but have different destructors */ void sock_gen_put(struct sock *sk) { if (!refcount_dec_and_test(&sk->sk_refcnt)) return; if (sk->sk_state == TCP_TIME_WAIT) inet_twsk_free(inet_twsk(sk)); else if (sk->sk_state == TCP_NEW_SYN_RECV) reqsk_free(inet_reqsk(sk)); else sk_free(sk); } EXPORT_SYMBOL_GPL(sock_gen_put); void sock_edemux(struct sk_buff *skb) { sock_gen_put(skb->sk); } EXPORT_SYMBOL(sock_edemux); struct sock *__inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, const __be32 saddr, const __be16 sport, const __be32 daddr, const u16 hnum, const int dif, const int sdif) { INET_ADDR_COOKIE(acookie, saddr, daddr); const __portpair ports = INET_COMBINED_PORTS(sport, hnum); struct sock *sk; const struct hlist_nulls_node *node; /* Optimize here for direct hit, only listening connections can * have wildcards anyways. */ unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport); unsigned int slot = hash & hashinfo->ehash_mask; struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; begin: sk_nulls_for_each_rcu(sk, node, &head->chain) { if (sk->sk_hash != hash) continue; if (likely(inet_match(net, sk, acookie, ports, dif, sdif))) { if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) goto out; if (unlikely(!inet_match(net, sk, acookie, ports, dif, sdif))) { sock_gen_put(sk); goto begin; } goto found; } } /* * if the nulls value we got at the end of this lookup is * not the expected one, we must restart lookup. * We probably met an item that was moved to another chain. */ if (get_nulls_value(node) != slot) goto begin; out: sk = NULL; found: return sk; } EXPORT_SYMBOL_GPL(__inet_lookup_established); /* called with local bh disabled */ static int __inet_check_established(struct inet_timewait_death_row *death_row, struct sock *sk, __u16 lport, struct inet_timewait_sock **twp) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_sock *inet = inet_sk(sk); __be32 daddr = inet->inet_rcv_saddr; __be32 saddr = inet->inet_daddr; int dif = sk->sk_bound_dev_if; struct net *net = sock_net(sk); int sdif = l3mdev_master_ifindex_by_index(net, dif); INET_ADDR_COOKIE(acookie, saddr, daddr); const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); unsigned int hash = inet_ehashfn(net, daddr, lport, saddr, inet->inet_dport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); spinlock_t *lock = inet_ehash_lockp(hinfo, hash); struct sock *sk2; const struct hlist_nulls_node *node; struct inet_timewait_sock *tw = NULL; spin_lock(lock); sk_nulls_for_each(sk2, node, &head->chain) { if (sk2->sk_hash != hash) continue; if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) { if (sk2->sk_state == TCP_TIME_WAIT) { tw = inet_twsk(sk2); if (twsk_unique(sk, sk2, twp)) break; } goto not_unique; } } /* Must record num and sport now. Otherwise we will see * in hash table socket with a funny identity. */ inet->inet_num = lport; inet->inet_sport = htons(lport); sk->sk_hash = hash; WARN_ON(!sk_unhashed(sk)); __sk_nulls_add_node_rcu(sk, &head->chain); if (tw) { sk_nulls_del_node_init_rcu((struct sock *)tw); __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED); } spin_unlock(lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); if (twp) { *twp = tw; } else if (tw) { /* Silly. Should hash-dance instead... */ inet_twsk_deschedule_put(tw); } return 0; not_unique: spin_unlock(lock); return -EADDRNOTAVAIL; } static u64 inet_sk_port_offset(const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr, inet->inet_daddr, inet->inet_dport); } /* Searches for an exsiting socket in the ehash bucket list. * Returns true if found, false otherwise. */ static bool inet_ehash_lookup_by_sk(struct sock *sk, struct hlist_nulls_head *list) { const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num); const int sdif = sk->sk_bound_dev_if; const int dif = sk->sk_bound_dev_if; const struct hlist_nulls_node *node; struct net *net = sock_net(sk); struct sock *esk; INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr); sk_nulls_for_each_rcu(esk, node, list) { if (esk->sk_hash != sk->sk_hash) continue; if (sk->sk_family == AF_INET) { if (unlikely(inet_match(net, esk, acookie, ports, dif, sdif))) { return true; } } #if IS_ENABLED(CONFIG_IPV6) else if (sk->sk_family == AF_INET6) { if (unlikely(inet6_match(net, esk, &sk->sk_v6_daddr, &sk->sk_v6_rcv_saddr, ports, dif, sdif))) { return true; } } #endif } return false; } /* Insert a socket into ehash, and eventually remove another one * (The another one can be a SYN_RECV or TIMEWAIT) * If an existing socket already exists, socket sk is not inserted, * and sets found_dup_sk parameter to true. */ bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk) { struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); struct inet_ehash_bucket *head; struct hlist_nulls_head *list; spinlock_t *lock; bool ret = true; WARN_ON_ONCE(!sk_unhashed(sk)); sk->sk_hash = sk_ehashfn(sk); head = inet_ehash_bucket(hashinfo, sk->sk_hash); list = &head->chain; lock = inet_ehash_lockp(hashinfo, sk->sk_hash); spin_lock(lock); if (osk) { WARN_ON_ONCE(sk->sk_hash != osk->sk_hash); ret = sk_nulls_del_node_init_rcu(osk); } else if (found_dup_sk) { *found_dup_sk = inet_ehash_lookup_by_sk(sk, list); if (*found_dup_sk) ret = false; } if (ret) __sk_nulls_add_node_rcu(sk, list); spin_unlock(lock); return ret; } bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk) { bool ok = inet_ehash_insert(sk, osk, found_dup_sk); if (ok) { sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } else { this_cpu_inc(*sk->sk_prot->orphan_count); inet_sk_set_state(sk, TCP_CLOSE); sock_set_flag(sk, SOCK_DEAD); inet_csk_destroy_sock(sk); } return ok; } EXPORT_SYMBOL_GPL(inet_ehash_nolisten); static int inet_reuseport_add_sock(struct sock *sk, struct inet_listen_hashbucket *ilb) { struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash; const struct hlist_nulls_node *node; struct sock *sk2; kuid_t uid = sock_i_uid(sk); sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) { if (sk2 != sk && sk2->sk_family == sk->sk_family && ipv6_only_sock(sk2) == ipv6_only_sock(sk) && sk2->sk_bound_dev_if == sk->sk_bound_dev_if && inet_csk(sk2)->icsk_bind_hash == tb && sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && inet_rcv_saddr_equal(sk, sk2, false)) return reuseport_add_sock(sk, sk2, inet_rcv_saddr_any(sk)); } return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); } int __inet_hash(struct sock *sk, struct sock *osk) { struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); struct inet_listen_hashbucket *ilb2; int err = 0; if (sk->sk_state != TCP_LISTEN) { local_bh_disable(); inet_ehash_nolisten(sk, osk, NULL); local_bh_enable(); return 0; } WARN_ON(!sk_unhashed(sk)); ilb2 = inet_lhash2_bucket_sk(hashinfo, sk); spin_lock(&ilb2->lock); if (sk->sk_reuseport) { err = inet_reuseport_add_sock(sk, ilb2); if (err) goto unlock; } sock_set_flag(sk, SOCK_RCU_FREE); if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && sk->sk_family == AF_INET6) __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head); else __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); unlock: spin_unlock(&ilb2->lock); return err; } EXPORT_SYMBOL(__inet_hash); int inet_hash(struct sock *sk) { int err = 0; if (sk->sk_state != TCP_CLOSE) err = __inet_hash(sk, NULL); return err; } EXPORT_SYMBOL_GPL(inet_hash); void inet_unhash(struct sock *sk) { struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); if (sk_unhashed(sk)) return; if (sk->sk_state == TCP_LISTEN) { struct inet_listen_hashbucket *ilb2; ilb2 = inet_lhash2_bucket_sk(hashinfo, sk); /* Don't disable bottom halves while acquiring the lock to * avoid circular locking dependency on PREEMPT_RT. */ spin_lock(&ilb2->lock); if (sk_unhashed(sk)) { spin_unlock(&ilb2->lock); return; } if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_stop_listen_sock(sk); __sk_nulls_del_node_init_rcu(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_unlock(&ilb2->lock); } else { spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); spin_lock_bh(lock); if (sk_unhashed(sk)) { spin_unlock_bh(lock); return; } __sk_nulls_del_node_init_rcu(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_unlock_bh(lock); } } EXPORT_SYMBOL_GPL(inet_unhash); static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb, const struct net *net, unsigned short port, int l3mdev, const struct sock *sk) { if (!net_eq(ib2_net(tb), net) || tb->port != port || tb->l3mdev != l3mdev) return false; return inet_bind2_bucket_addr_match(tb, sk); } bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const struct net *net, unsigned short port, int l3mdev, const struct sock *sk) { if (!net_eq(ib2_net(tb), net) || tb->port != port || tb->l3mdev != l3mdev) return false; #if IS_ENABLED(CONFIG_IPV6) if (tb->addr_type == IPV6_ADDR_ANY) return true; if (tb->addr_type != IPV6_ADDR_MAPPED) return false; if (sk->sk_family == AF_INET6 && !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) return false; #endif return tb->rcv_saddr == 0; } /* The socket's bhash2 hashbucket spinlock must be held when this is called */ struct inet_bind2_bucket * inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net *net, unsigned short port, int l3mdev, const struct sock *sk) { struct inet_bind2_bucket *bhash2 = NULL; inet_bind_bucket_for_each(bhash2, &head->chain) if (inet_bind2_bucket_match(bhash2, net, port, l3mdev, sk)) break; return bhash2; } struct inet_bind_hashbucket * inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port) { struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); u32 hash; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) hash = ipv6_portaddr_hash(net, &in6addr_any, port); else #endif hash = ipv4_portaddr_hash(net, 0, port); return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)]; } static void inet_update_saddr(struct sock *sk, void *saddr, int family) { if (family == AF_INET) { inet_sk(sk)->inet_saddr = *(__be32 *)saddr; sk_rcv_saddr_set(sk, inet_sk(sk)->inet_saddr); } #if IS_ENABLED(CONFIG_IPV6) else { sk->sk_v6_rcv_saddr = *(struct in6_addr *)saddr; } #endif } static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, bool reset) { struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); struct inet_bind_hashbucket *head, *head2; struct inet_bind2_bucket *tb2, *new_tb2; int l3mdev = inet_sk_bound_l3mdev(sk); int port = inet_sk(sk)->inet_num; struct net *net = sock_net(sk); int bhash; if (!inet_csk(sk)->icsk_bind2_hash) { /* Not bind()ed before. */ if (reset) inet_reset_saddr(sk); else inet_update_saddr(sk, saddr, family); return 0; } /* Allocate a bind2 bucket ahead of time to avoid permanently putting * the bhash2 table in an inconsistent state if a new tb2 bucket * allocation fails. */ new_tb2 = kmem_cache_alloc(hinfo->bind2_bucket_cachep, GFP_ATOMIC); if (!new_tb2) { if (reset) { /* The (INADDR_ANY, port) bucket might have already * been freed, then we cannot fixup icsk_bind2_hash, * so we give up and unlink sk from bhash/bhash2 not * to leave inconsistency in bhash2. */ inet_put_port(sk); inet_reset_saddr(sk); } return -ENOMEM; } bhash = inet_bhashfn(net, port, hinfo->bhash_size); head = &hinfo->bhash[bhash]; head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); /* If we change saddr locklessly, another thread * iterating over bhash might see corrupted address. */ spin_lock_bh(&head->lock); spin_lock(&head2->lock); __sk_del_bind_node(sk); inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash); spin_unlock(&head2->lock); if (reset) inet_reset_saddr(sk); else inet_update_saddr(sk, saddr, family); head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); spin_lock(&head2->lock); tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); if (!tb2) { tb2 = new_tb2; inet_bind2_bucket_init(tb2, net, head2, inet_csk(sk)->icsk_bind_hash, sk); } inet_csk(sk)->icsk_bind2_hash = tb2; sk_add_bind_node(sk, &tb2->owners); spin_unlock(&head2->lock); spin_unlock_bh(&head->lock); if (tb2 != new_tb2) kmem_cache_free(hinfo->bind2_bucket_cachep, new_tb2); return 0; } int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) { return __inet_bhash2_update_saddr(sk, saddr, family, false); } EXPORT_SYMBOL_GPL(inet_bhash2_update_saddr); void inet_bhash2_reset_saddr(struct sock *sk) { if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) __inet_bhash2_update_saddr(sk, NULL, 0, true); } EXPORT_SYMBOL_GPL(inet_bhash2_reset_saddr); /* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm * Note that we use 32bit integers (vs RFC 'short integers') * because 2^16 is not a multiple of num_ephemeral and this * property might be used by clever attacker. * * RFC claims using TABLE_LENGTH=10 buckets gives an improvement, though * attacks were since demonstrated, thus we use 65536 by default instead * to really give more isolation and privacy, at the expense of 256kB * of kernel memory. */ #define INET_TABLE_PERTURB_SIZE (1 << CONFIG_INET_TABLE_PERTURB_ORDER) static u32 *table_perturb; int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk, u64 port_offset, int (*check_established)(struct inet_timewait_death_row *, struct sock *, __u16, struct inet_timewait_sock **)) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_bind_hashbucket *head, *head2; struct inet_timewait_sock *tw = NULL; int port = inet_sk(sk)->inet_num; struct net *net = sock_net(sk); struct inet_bind2_bucket *tb2; struct inet_bind_bucket *tb; bool tb_created = false; u32 remaining, offset; int ret, i, low, high; bool local_ports; int step, l3mdev; u32 index; if (port) { local_bh_disable(); ret = check_established(death_row, sk, port, NULL); local_bh_enable(); return ret; } l3mdev = inet_sk_bound_l3mdev(sk); local_ports = inet_sk_get_local_port_range(sk, &low, &high); step = local_ports ? 1 : 2; high++; /* [32768, 60999] -> [32768, 61000[ */ remaining = high - low; if (!local_ports && remaining > 1) remaining &= ~1U; get_random_sleepable_once(table_perturb, INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb)); index = port_offset & (INET_TABLE_PERTURB_SIZE - 1); offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32); offset %= remaining; /* In first pass we try ports of @low parity. * inet_csk_get_port() does the opposite choice. */ if (!local_ports) offset &= ~1U; other_parity_scan: port = low + offset; for (i = 0; i < remaining; i += step, port += step) { if (unlikely(port >= high)) port -= remaining; if (inet_is_local_reserved_port(net, port)) continue; head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)]; spin_lock_bh(&head->lock); /* Does not bother with rcv_saddr checks, because * the established check is already unique enough. */ inet_bind_bucket_for_each(tb, &head->chain) { if (inet_bind_bucket_match(tb, net, port, l3mdev)) { if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) goto next_port; WARN_ON(hlist_empty(&tb->bhash2)); if (!check_established(death_row, sk, port, &tw)) goto ok; goto next_port; } } tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, net, head, port, l3mdev); if (!tb) { spin_unlock_bh(&head->lock); return -ENOMEM; } tb_created = true; tb->fastreuse = -1; tb->fastreuseport = -1; goto ok; next_port: spin_unlock_bh(&head->lock); cond_resched(); } if (!local_ports) { offset++; if ((offset & 1) && remaining > 1) goto other_parity_scan; } return -EADDRNOTAVAIL; ok: /* Find the corresponding tb2 bucket since we need to * add the socket to the bhash2 table as well */ head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); spin_lock(&head2->lock); tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); if (!tb2) { tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net, head2, tb, sk); if (!tb2) goto error; } /* Here we want to add a little bit of randomness to the next source * port that will be chosen. We use a max() with a random here so that * on low contention the randomness is maximal and on high contention * it may be inexistent. */ i = max_t(int, i, get_random_u32_below(8) * step); WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + step); /* Head lock still held and bh's disabled */ inet_bind_hash(sk, tb, tb2, port); if (sk_unhashed(sk)) { inet_sk(sk)->inet_sport = htons(port); inet_ehash_nolisten(sk, (struct sock *)tw, NULL); } if (tw) inet_twsk_bind_unhash(tw, hinfo); spin_unlock(&head2->lock); spin_unlock(&head->lock); if (tw) inet_twsk_deschedule_put(tw); local_bh_enable(); return 0; error: spin_unlock(&head2->lock); if (tb_created) inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb); spin_unlock_bh(&head->lock); return -ENOMEM; } /* * Bind a port for a connect operation and hash it. */ int inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk) { u64 port_offset = 0; if (!inet_sk(sk)->inet_num) port_offset = inet_sk_port_offset(sk); return __inet_hash_connect(death_row, sk, port_offset, __inet_check_established); } EXPORT_SYMBOL_GPL(inet_hash_connect); static void init_hashinfo_lhash2(struct inet_hashinfo *h) { int i; for (i = 0; i <= h->lhash2_mask; i++) { spin_lock_init(&h->lhash2[i].lock); INIT_HLIST_NULLS_HEAD(&h->lhash2[i].nulls_head, i + LISTENING_NULLS_BASE); } } void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, unsigned long numentries, int scale, unsigned long low_limit, unsigned long high_limit) { h->lhash2 = alloc_large_system_hash(name, sizeof(*h->lhash2), numentries, scale, 0, NULL, &h->lhash2_mask, low_limit, high_limit); init_hashinfo_lhash2(h); /* this one is used for source ports of outgoing connections */ table_perturb = alloc_large_system_hash("Table-perturb", sizeof(*table_perturb), INET_TABLE_PERTURB_SIZE, 0, 0, NULL, NULL, INET_TABLE_PERTURB_SIZE, INET_TABLE_PERTURB_SIZE); } int inet_hashinfo2_init_mod(struct inet_hashinfo *h) { h->lhash2 = kmalloc_array(INET_LHTABLE_SIZE, sizeof(*h->lhash2), GFP_KERNEL); if (!h->lhash2) return -ENOMEM; h->lhash2_mask = INET_LHTABLE_SIZE - 1; /* INET_LHTABLE_SIZE must be a power of 2 */ BUG_ON(INET_LHTABLE_SIZE & h->lhash2_mask); init_hashinfo_lhash2(h); return 0; } EXPORT_SYMBOL_GPL(inet_hashinfo2_init_mod); int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) { unsigned int locksz = sizeof(spinlock_t); unsigned int i, nblocks = 1; if (locksz != 0) { /* allocate 2 cache lines or at least one spinlock per cpu */ nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U); nblocks = roundup_pow_of_two(nblocks * num_possible_cpus()); /* no more locks than number of hash buckets */ nblocks = min(nblocks, hashinfo->ehash_mask + 1); hashinfo->ehash_locks = kvmalloc_array(nblocks, locksz, GFP_KERNEL); if (!hashinfo->ehash_locks) return -ENOMEM; for (i = 0; i < nblocks; i++) spin_lock_init(&hashinfo->ehash_locks[i]); } hashinfo->ehash_locks_mask = nblocks - 1; return 0; } EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc); struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo, unsigned int ehash_entries) { struct inet_hashinfo *new_hashinfo; int i; new_hashinfo = kmemdup(hashinfo, sizeof(*hashinfo), GFP_KERNEL); if (!new_hashinfo) goto err; new_hashinfo->ehash = vmalloc_huge(ehash_entries * sizeof(struct inet_ehash_bucket), GFP_KERNEL_ACCOUNT); if (!new_hashinfo->ehash) goto free_hashinfo; new_hashinfo->ehash_mask = ehash_entries - 1; if (inet_ehash_locks_alloc(new_hashinfo)) goto free_ehash; for (i = 0; i < ehash_entries; i++) INIT_HLIST_NULLS_HEAD(&new_hashinfo->ehash[i].chain, i); new_hashinfo->pernet = true; return new_hashinfo; free_ehash: vfree(new_hashinfo->ehash); free_hashinfo: kfree(new_hashinfo); err: return NULL; } EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_alloc); void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo) { if (!hashinfo->pernet) return; inet_ehash_locks_free(hashinfo); vfree(hashinfo->ehash); kfree(hashinfo); } EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_free);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 /* SPDX-License-Identifier: GPL-2.0 */ /* * Vxlan private header file * */ #ifndef _VXLAN_PRIVATE_H #define _VXLAN_PRIVATE_H #include <linux/rhashtable.h> extern unsigned int vxlan_net_id; extern const u8 all_zeros_mac[ETH_ALEN + 2]; extern const struct rhashtable_params vxlan_vni_rht_params; #define PORT_HASH_BITS 8 #define PORT_HASH_SIZE (1 << PORT_HASH_BITS) /* per-network namespace private data for this module */ struct vxlan_net { struct list_head vxlan_list; struct hlist_head sock_list[PORT_HASH_SIZE]; spinlock_t sock_lock; struct notifier_block nexthop_notifier_block; }; /* Forwarding table entry */ struct vxlan_fdb { struct hlist_node hlist; /* linked list of entries */ struct rcu_head rcu; unsigned long updated; /* jiffies */ unsigned long used; struct list_head remotes; u8 eth_addr[ETH_ALEN]; u16 state; /* see ndm_state */ __be32 vni; u16 flags; /* see ndm_flags and below */ struct list_head nh_list; struct nexthop __rcu *nh; struct vxlan_dev __rcu *vdev; }; #define NTF_VXLAN_ADDED_BY_USER 0x100 /* Virtual Network hash table head */ static inline struct hlist_head *vni_head(struct vxlan_sock *vs, __be32 vni) { return &vs->vni_list[hash_32((__force u32)vni, VNI_HASH_BITS)]; } /* Socket hash table head */ static inline struct hlist_head *vs_head(struct net *net, __be16 port) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)]; } /* First remote destination for a forwarding entry. * Guaranteed to be non-NULL because remotes are never deleted. */ static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb) { if (rcu_access_pointer(fdb->nh)) return NULL; return list_entry_rcu(fdb->remotes.next, struct vxlan_rdst, list); } static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb) { if (rcu_access_pointer(fdb->nh)) return NULL; return list_first_entry(&fdb->remotes, struct vxlan_rdst, list); } #if IS_ENABLED(CONFIG_IPV6) static inline bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b) { if (a->sa.sa_family != b->sa.sa_family) return false; if (a->sa.sa_family == AF_INET6) return ipv6_addr_equal(&a->sin6.sin6_addr, &b->sin6.sin6_addr); else return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr; } static inline int vxlan_nla_get_addr(union vxlan_addr *ip, const struct nlattr *nla) { if (nla_len(nla) >= sizeof(struct in6_addr)) { ip->sin6.sin6_addr = nla_get_in6_addr(nla); ip->sa.sa_family = AF_INET6; return 0; } else if (nla_len(nla) >= sizeof(__be32)) { ip->sin.sin_addr.s_addr = nla_get_in_addr(nla); ip->sa.sa_family = AF_INET; return 0; } else { return -EAFNOSUPPORT; } } static inline int vxlan_nla_put_addr(struct sk_buff *skb, int attr, const union vxlan_addr *ip) { if (ip->sa.sa_family == AF_INET6) return nla_put_in6_addr(skb, attr, &ip->sin6.sin6_addr); else return nla_put_in_addr(skb, attr, ip->sin.sin_addr.s_addr); } static inline bool vxlan_addr_is_multicast(const union vxlan_addr *ip) { if (ip->sa.sa_family == AF_INET6) return ipv6_addr_is_multicast(&ip->sin6.sin6_addr); else return ipv4_is_multicast(ip->sin.sin_addr.s_addr); } #else /* !CONFIG_IPV6 */ static inline bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b) { return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr; } static inline int vxlan_nla_get_addr(union vxlan_addr *ip, const struct nlattr *nla) { if (nla_len(nla) >= sizeof(struct in6_addr)) { return -EAFNOSUPPORT; } else if (nla_len(nla) >= sizeof(__be32)) { ip->sin.sin_addr.s_addr = nla_get_in_addr(nla); ip->sa.sa_family = AF_INET; return 0; } else { return -EAFNOSUPPORT; } } static inline int vxlan_nla_put_addr(struct sk_buff *skb, int attr, const union vxlan_addr *ip) { return nla_put_in_addr(skb, attr, ip->sin.sin_addr.s_addr); } static inline bool vxlan_addr_is_multicast(const union vxlan_addr *ip) { return ipv4_is_multicast(ip->sin.sin_addr.s_addr); } #endif static inline size_t vxlan_addr_size(const union vxlan_addr *ip) { if (ip->sa.sa_family == AF_INET6) return sizeof(struct in6_addr); else return sizeof(__be32); } static inline struct vxlan_vni_node * vxlan_vnifilter_lookup(struct vxlan_dev *vxlan, __be32 vni) { struct vxlan_vni_group *vg; vg = rcu_dereference_rtnl(vxlan->vnigrp); if (!vg) return NULL; return rhashtable_lookup_fast(&vg->vni_hash, &vni, vxlan_vni_rht_params); } /* vxlan_core.c */ int vxlan_fdb_create(struct vxlan_dev *vxlan, const u8 *mac, union vxlan_addr *ip, __u16 state, __be16 port, __be32 src_vni, __be32 vni, __u32 ifindex, __u16 ndm_flags, u32 nhid, struct vxlan_fdb **fdb, struct netlink_ext_ack *extack); int __vxlan_fdb_delete(struct vxlan_dev *vxlan, const unsigned char *addr, union vxlan_addr ip, __be16 port, __be32 src_vni, __be32 vni, u32 ifindex, bool swdev_notify); u32 eth_vni_hash(const unsigned char *addr, __be32 vni); u32 fdb_head_index(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni); int vxlan_fdb_update(struct vxlan_dev *vxlan, const u8 *mac, union vxlan_addr *ip, __u16 state, __u16 flags, __be16 port, __be32 src_vni, __be32 vni, __u32 ifindex, __u16 ndm_flags, u32 nhid, bool swdev_notify, struct netlink_ext_ack *extack); void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, __be32 default_vni, struct vxlan_rdst *rdst, bool did_rsc); int vxlan_vni_in_use(struct net *src_net, struct vxlan_dev *vxlan, struct vxlan_config *conf, __be32 vni); /* vxlan_vnifilter.c */ int vxlan_vnigroup_init(struct vxlan_dev *vxlan); void vxlan_vnigroup_uninit(struct vxlan_dev *vxlan); void vxlan_vnifilter_init(void); void vxlan_vnifilter_uninit(void); void vxlan_vnifilter_count(struct vxlan_dev *vxlan, __be32 vni, struct vxlan_vni_node *vninode, int type, unsigned int len); void vxlan_vs_add_vnigrp(struct vxlan_dev *vxlan, struct vxlan_sock *vs, bool ipv6); void vxlan_vs_del_vnigrp(struct vxlan_dev *vxlan); int vxlan_vnilist_update_group(struct vxlan_dev *vxlan, union vxlan_addr *old_remote_ip, union vxlan_addr *new_remote_ip, struct netlink_ext_ack *extack); /* vxlan_multicast.c */ int vxlan_multicast_join(struct vxlan_dev *vxlan); int vxlan_multicast_leave(struct vxlan_dev *vxlan); bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev, __be32 vni, union vxlan_addr *rip, int rifindex); int vxlan_igmp_join(struct vxlan_dev *vxlan, union vxlan_addr *rip, int rifindex); int vxlan_igmp_leave(struct vxlan_dev *vxlan, union vxlan_addr *rip, int rifindex); /* vxlan_mdb.c */ int vxlan_mdb_dump(struct net_device *dev, struct sk_buff *skb, struct netlink_callback *cb); int vxlan_mdb_add(struct net_device *dev, struct nlattr *tb[], u16 nlmsg_flags, struct netlink_ext_ack *extack); int vxlan_mdb_del(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack); int vxlan_mdb_del_bulk(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack); int vxlan_mdb_get(struct net_device *dev, struct nlattr *tb[], u32 portid, u32 seq, struct netlink_ext_ack *extack); struct vxlan_mdb_entry *vxlan_mdb_entry_skb_get(struct vxlan_dev *vxlan, struct sk_buff *skb, __be32 src_vni); netdev_tx_t vxlan_mdb_xmit(struct vxlan_dev *vxlan, const struct vxlan_mdb_entry *mdb_entry, struct sk_buff *skb); int vxlan_mdb_init(struct vxlan_dev *vxlan); void vxlan_mdb_fini(struct vxlan_dev *vxlan); #endif
8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM thp #if !defined(_TRACE_THP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_THP_H #include <linux/types.h> #include <linux/tracepoint.h> DECLARE_EVENT_CLASS(hugepage_set, TP_PROTO(unsigned long addr, unsigned long pte), TP_ARGS(addr, pte), TP_STRUCT__entry( __field(unsigned long, addr) __field(unsigned long, pte) ), TP_fast_assign( __entry->addr = addr; __entry->pte = pte; ), TP_printk("Set page table entry with 0x%lx with 0x%lx", __entry->addr, __entry->pte) ); DEFINE_EVENT(hugepage_set, hugepage_set_pmd, TP_PROTO(unsigned long addr, unsigned long pmd), TP_ARGS(addr, pmd) ); DEFINE_EVENT(hugepage_set, hugepage_set_pud, TP_PROTO(unsigned long addr, unsigned long pud), TP_ARGS(addr, pud) ); DECLARE_EVENT_CLASS(hugepage_update, TP_PROTO(unsigned long addr, unsigned long pte, unsigned long clr, unsigned long set), TP_ARGS(addr, pte, clr, set), TP_STRUCT__entry( __field(unsigned long, addr) __field(unsigned long, pte) __field(unsigned long, clr) __field(unsigned long, set) ), TP_fast_assign( __entry->addr = addr; __entry->pte = pte; __entry->clr = clr; __entry->set = set; ), TP_printk("hugepage update at addr 0x%lx and pte = 0x%lx clr = 0x%lx, set = 0x%lx", __entry->addr, __entry->pte, __entry->clr, __entry->set) ); DEFINE_EVENT(hugepage_update, hugepage_update_pmd, TP_PROTO(unsigned long addr, unsigned long pmd, unsigned long clr, unsigned long set), TP_ARGS(addr, pmd, clr, set) ); DEFINE_EVENT(hugepage_update, hugepage_update_pud, TP_PROTO(unsigned long addr, unsigned long pud, unsigned long clr, unsigned long set), TP_ARGS(addr, pud, clr, set) ); DECLARE_EVENT_CLASS(migration_pmd, TP_PROTO(unsigned long addr, unsigned long pmd), TP_ARGS(addr, pmd), TP_STRUCT__entry( __field(unsigned long, addr) __field(unsigned long, pmd) ), TP_fast_assign( __entry->addr = addr; __entry->pmd = pmd; ), TP_printk("addr=%lx, pmd=%lx", __entry->addr, __entry->pmd) ); DEFINE_EVENT(migration_pmd, set_migration_pmd, TP_PROTO(unsigned long addr, unsigned long pmd), TP_ARGS(addr, pmd) ); DEFINE_EVENT(migration_pmd, remove_migration_pmd, TP_PROTO(unsigned long addr, unsigned long pmd), TP_ARGS(addr, pmd) ); #endif /* _TRACE_THP_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
783 6 327 334 331 620 24 334 300 449 448 378 337 74 115 31 272 377 336 166 336 166 83 102 5 130 756 1 82 428 819 818 797 823 4 9 702 252 253 253 253 1 252 313 312 1 62 251 13 430 347 99 253 131 488 481 481 78 2 25 45 4 375 474 9 10 6 165 334 491 3 3 488 15 139 356 135 312 77 377 417 1 18 1255 493 798 336 336 334 334 1443 42 1477 1477 42 1477 424 1216 74 6 69 35 10 29 5 30 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/file.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/file.c * * Copyright (C) 1991, 1992 Linus Torvalds * * ext4 fs regular file handling primitives * * 64-bit file support on 64-bit platforms by Jakub Jelinek * (jj@sunsite.ms.mff.cuni.cz) */ #include <linux/time.h> #include <linux/fs.h> #include <linux/iomap.h> #include <linux/mount.h> #include <linux/path.h> #include <linux/dax.h> #include <linux/quotaops.h> #include <linux/pagevec.h> #include <linux/uio.h> #include <linux/mman.h> #include <linux/backing-dev.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include "truncate.h" /* * Returns %true if the given DIO request should be attempted with DIO, or * %false if it should fall back to buffered I/O. * * DIO isn't well specified; when it's unsupported (either due to the request * being misaligned, or due to the file not supporting DIO at all), filesystems * either fall back to buffered I/O or return EINVAL. For files that don't use * any special features like encryption or verity, ext4 has traditionally * returned EINVAL for misaligned DIO. iomap_dio_rw() uses this convention too. * In this case, we should attempt the DIO, *not* fall back to buffered I/O. * * In contrast, in cases where DIO is unsupported due to ext4 features, ext4 * traditionally falls back to buffered I/O. * * This function implements the traditional ext4 behavior in all these cases. */ static bool ext4_should_use_dio(struct kiocb *iocb, struct iov_iter *iter) { struct inode *inode = file_inode(iocb->ki_filp); u32 dio_align = ext4_dio_alignment(inode); if (dio_align == 0) return false; if (dio_align == 1) return true; return IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), dio_align); } static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) { ssize_t ret; struct inode *inode = file_inode(iocb->ki_filp); if (iocb->ki_flags & IOCB_NOWAIT) { if (!inode_trylock_shared(inode)) return -EAGAIN; } else { inode_lock_shared(inode); } if (!ext4_should_use_dio(iocb, to)) { inode_unlock_shared(inode); /* * Fallback to buffered I/O if the operation being performed on * the inode is not supported by direct I/O. The IOCB_DIRECT * flag needs to be cleared here in order to ensure that the * direct I/O path within generic_file_read_iter() is not * taken. */ iocb->ki_flags &= ~IOCB_DIRECT; return generic_file_read_iter(iocb, to); } ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, NULL, 0); inode_unlock_shared(inode); file_accessed(iocb->ki_filp); return ret; } #ifdef CONFIG_FS_DAX static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; if (iocb->ki_flags & IOCB_NOWAIT) { if (!inode_trylock_shared(inode)) return -EAGAIN; } else { inode_lock_shared(inode); } /* * Recheck under inode lock - at this point we are sure it cannot * change anymore */ if (!IS_DAX(inode)) { inode_unlock_shared(inode); /* Fallback to buffered IO in case we cannot support DAX */ return generic_file_read_iter(iocb, to); } ret = dax_iomap_rw(iocb, to, &ext4_iomap_ops); inode_unlock_shared(inode); file_accessed(iocb->ki_filp); return ret; } #endif static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (!iov_iter_count(to)) return 0; /* skip atime */ #ifdef CONFIG_FS_DAX if (IS_DAX(inode)) return ext4_dax_read_iter(iocb, to); #endif if (iocb->ki_flags & IOCB_DIRECT) return ext4_dio_read_iter(iocb, to); return generic_file_read_iter(iocb, to); } static ssize_t ext4_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct inode *inode = file_inode(in); if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; return filemap_splice_read(in, ppos, pipe, len, flags); } /* * Called when an inode is released. Note that this is different * from ext4_file_open: open gets called at every open, but release * gets called only when /all/ the files are closed. */ static int ext4_release_file(struct inode *inode, struct file *filp) { if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) { ext4_alloc_da_blocks(inode); ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); } /* if we are the last writer on the inode, drop the block reservation */ if ((filp->f_mode & FMODE_WRITE) && (atomic_read(&inode->i_writecount) == 1) && !EXT4_I(inode)->i_reserved_data_blocks) { down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); } if (is_dx(inode) && filp->private_data) ext4_htree_free_dir_info(filp->private_data); return 0; } /* * This tests whether the IO in question is block-aligned or not. * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they * are converted to written only after the IO is complete. Until they are * mapped, these blocks appear as holes, so dio_zero_block() will assume that * it needs to zero out portions of the start and/or end block. If 2 AIO * threads are at work on the same unwritten block, they must be synchronized * or one thread will zero the other's data, causing corruption. */ static bool ext4_unaligned_io(struct inode *inode, struct iov_iter *from, loff_t pos) { struct super_block *sb = inode->i_sb; unsigned long blockmask = sb->s_blocksize - 1; if ((pos | iov_iter_alignment(from)) & blockmask) return true; return false; } static bool ext4_extending_io(struct inode *inode, loff_t offset, size_t len) { if (offset + len > i_size_read(inode) || offset + len > EXT4_I(inode)->i_disksize) return true; return false; } /* Is IO overwriting allocated or initialized blocks? */ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len, bool *unwritten) { struct ext4_map_blocks map; unsigned int blkbits = inode->i_blkbits; int err, blklen; if (pos + len > i_size_read(inode)) return false; map.m_lblk = pos >> blkbits; map.m_len = EXT4_MAX_BLOCKS(len, pos, blkbits); blklen = map.m_len; err = ext4_map_blocks(NULL, inode, &map, 0); if (err != blklen) return false; /* * 'err==len' means that all of the blocks have been preallocated, * regardless of whether they have been initialized or not. We need to * check m_flags to distinguish the unwritten extents. */ *unwritten = !(map.m_flags & EXT4_MAP_MAPPED); return true; } static ssize_t ext4_generic_write_checks(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; if (unlikely(IS_IMMUTABLE(inode))) return -EPERM; ret = generic_write_checks(iocb, from); if (ret <= 0) return ret; /* * If we have encountered a bitmap-format file, the size limit * is smaller than s_maxbytes, which is for extent-mapped files. */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (iocb->ki_pos >= sbi->s_bitmap_maxbytes) return -EFBIG; iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); } return iov_iter_count(from); } static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) { ssize_t ret, count; count = ext4_generic_write_checks(iocb, from); if (count <= 0) return count; ret = file_modified(iocb->ki_filp); if (ret) return ret; return count; } static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, struct iov_iter *from) { ssize_t ret; struct inode *inode = file_inode(iocb->ki_filp); if (iocb->ki_flags & IOCB_NOWAIT) return -EOPNOTSUPP; inode_lock(inode); ret = ext4_write_checks(iocb, from); if (ret <= 0) goto out; ret = generic_perform_write(iocb, from); out: inode_unlock(inode); if (unlikely(ret <= 0)) return ret; return generic_write_sync(iocb, ret); } static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, ssize_t count) { handle_t *handle; lockdep_assert_held_write(&inode->i_rwsem); handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) return PTR_ERR(handle); if (ext4_update_inode_size(inode, offset + count)) { int ret = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret)) { ext4_journal_stop(handle); return ret; } } if (inode->i_nlink) ext4_orphan_del(handle, inode); ext4_journal_stop(handle); return count; } /* * Clean up the inode after DIO or DAX extending write has completed and the * inode size has been updated using ext4_handle_inode_extension(). */ static void ext4_inode_extension_cleanup(struct inode *inode, ssize_t count) { lockdep_assert_held_write(&inode->i_rwsem); if (count < 0) { ext4_truncate_failed_write(inode); /* * If the truncate operation failed early, then the inode may * still be on the orphan list. In that case, we need to try * remove the inode from the in-memory linked list. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); return; } /* * If i_disksize got extended either due to writeback of delalloc * blocks or extending truncate while the DIO was running we could fail * to cleanup the orphan list in ext4_handle_inode_extension(). Do it * now. */ if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) { handle_t *handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { /* * The write has successfully completed. Not much to * do with the error here so just cleanup the orphan * list and hope for the best. */ ext4_orphan_del(NULL, inode); return; } ext4_orphan_del(handle, inode); ext4_journal_stop(handle); } } static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error, unsigned int flags) { loff_t pos = iocb->ki_pos; struct inode *inode = file_inode(iocb->ki_filp); if (!error && size && flags & IOMAP_DIO_UNWRITTEN) error = ext4_convert_unwritten_extents(NULL, inode, pos, size); if (error) return error; /* * Note that EXT4_I(inode)->i_disksize can get extended up to * inode->i_size while the I/O was running due to writeback of delalloc * blocks. But the code in ext4_iomap_alloc() is careful to use * zeroed/unwritten extents if this is possible; thus we won't leave * uninitialized blocks in a file even if we didn't succeed in writing * as much as we intended. Also we can race with truncate or write * expanding the file so we have to be a bit careful here. */ if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize) && pos + size <= i_size_read(inode)) return size; return ext4_handle_inode_extension(inode, pos, size); } static const struct iomap_dio_ops ext4_dio_write_ops = { .end_io = ext4_dio_write_end_io, }; /* * The intention here is to start with shared lock acquired then see if any * condition requires an exclusive inode lock. If yes, then we restart the * whole operation by releasing the shared lock and acquiring exclusive lock. * * - For unaligned_io we never take shared lock as it may cause data corruption * when two unaligned IO tries to modify the same block e.g. while zeroing. * * - For extending writes case we don't take the shared lock, since it requires * updating inode i_disksize and/or orphan handling with exclusive lock. * * - shared locking will only be true mostly with overwrites, including * initialized blocks and unwritten blocks. For overwrite unwritten blocks * we protect splitting extents by i_data_sem in ext4_inode_info, so we can * also release exclusive i_rwsem lock. * * - Otherwise we will switch to exclusive i_rwsem lock. */ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, bool *ilock_shared, bool *extend, bool *unwritten, int *dio_flags) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); loff_t offset; size_t count; ssize_t ret; bool overwrite, unaligned_io; restart: ret = ext4_generic_write_checks(iocb, from); if (ret <= 0) goto out; offset = iocb->ki_pos; count = ret; unaligned_io = ext4_unaligned_io(inode, from, offset); *extend = ext4_extending_io(inode, offset, count); overwrite = ext4_overwrite_io(inode, offset, count, unwritten); /* * Determine whether we need to upgrade to an exclusive lock. This is * required to change security info in file_modified(), for extending * I/O, any form of non-overwrite I/O, and unaligned I/O to unwritten * extents (as partial block zeroing may be required). * * Note that unaligned writes are allowed under shared lock so long as * they are pure overwrites. Otherwise, concurrent unaligned writes risk * data corruption due to partial block zeroing in the dio layer, and so * the I/O must occur exclusively. */ if (*ilock_shared && ((!IS_NOSEC(inode) || *extend || !overwrite || (unaligned_io && *unwritten)))) { if (iocb->ki_flags & IOCB_NOWAIT) { ret = -EAGAIN; goto out; } inode_unlock_shared(inode); *ilock_shared = false; inode_lock(inode); goto restart; } /* * Now that locking is settled, determine dio flags and exclusivity * requirements. We don't use DIO_OVERWRITE_ONLY because we enforce * behavior already. The inode lock is already held exclusive if the * write is non-overwrite or extending, so drain all outstanding dio and * set the force wait dio flag. */ if (!*ilock_shared && (unaligned_io || *extend)) { if (iocb->ki_flags & IOCB_NOWAIT) { ret = -EAGAIN; goto out; } if (unaligned_io && (!overwrite || *unwritten)) inode_dio_wait(inode); *dio_flags = IOMAP_DIO_FORCE_WAIT; } ret = file_modified(file); if (ret < 0) goto out; return count; out: if (*ilock_shared) inode_unlock_shared(inode); else inode_unlock(inode); return ret; } static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) { ssize_t ret; handle_t *handle; struct inode *inode = file_inode(iocb->ki_filp); loff_t offset = iocb->ki_pos; size_t count = iov_iter_count(from); const struct iomap_ops *iomap_ops = &ext4_iomap_ops; bool extend = false, unwritten = false; bool ilock_shared = true; int dio_flags = 0; /* * Quick check here without any i_rwsem lock to see if it is extending * IO. A more reliable check is done in ext4_dio_write_checks() with * proper locking in place. */ if (offset + count > i_size_read(inode)) ilock_shared = false; if (iocb->ki_flags & IOCB_NOWAIT) { if (ilock_shared) { if (!inode_trylock_shared(inode)) return -EAGAIN; } else { if (!inode_trylock(inode)) return -EAGAIN; } } else { if (ilock_shared) inode_lock_shared(inode); else inode_lock(inode); } /* Fallback to buffered I/O if the inode does not support direct I/O. */ if (!ext4_should_use_dio(iocb, from)) { if (ilock_shared) inode_unlock_shared(inode); else inode_unlock(inode); return ext4_buffered_write_iter(iocb, from); } /* * Prevent inline data from being created since we are going to allocate * blocks for DIO. We know the inode does not currently have inline data * because ext4_should_use_dio() checked for it, but we have to clear * the state flag before the write checks because a lock cycle could * introduce races with other writers. */ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend, &unwritten, &dio_flags); if (ret <= 0) return ret; offset = iocb->ki_pos; count = ret; if (extend) { handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; } ret = ext4_orphan_add(handle, inode); if (ret) { ext4_journal_stop(handle); goto out; } ext4_journal_stop(handle); } if (ilock_shared && !unwritten) iomap_ops = &ext4_iomap_overwrite_ops; ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, dio_flags, NULL, 0); if (ret == -ENOTBLK) ret = 0; if (extend) { /* * We always perform extending DIO write synchronously so by * now the IO is completed and ext4_handle_inode_extension() * was called. Cleanup the inode in case of error or race with * writeback of delalloc blocks. */ WARN_ON_ONCE(ret == -EIOCBQUEUED); ext4_inode_extension_cleanup(inode, ret); } out: if (ilock_shared) inode_unlock_shared(inode); else inode_unlock(inode); if (ret >= 0 && iov_iter_count(from)) { ssize_t err; loff_t endbyte; offset = iocb->ki_pos; err = ext4_buffered_write_iter(iocb, from); if (err < 0) return err; /* * We need to ensure that the pages within the page cache for * the range covered by this I/O are written to disk and * invalidated. This is in attempt to preserve the expected * direct I/O semantics in the case we fallback to buffered I/O * to complete off the I/O request. */ ret += err; endbyte = offset + err - 1; err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping, offset, endbyte); if (!err) invalidate_mapping_pages(iocb->ki_filp->f_mapping, offset >> PAGE_SHIFT, endbyte >> PAGE_SHIFT); } return ret; } #ifdef CONFIG_FS_DAX static ssize_t ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) { ssize_t ret; size_t count; loff_t offset; handle_t *handle; bool extend = false; struct inode *inode = file_inode(iocb->ki_filp); if (iocb->ki_flags & IOCB_NOWAIT) { if (!inode_trylock(inode)) return -EAGAIN; } else { inode_lock(inode); } ret = ext4_write_checks(iocb, from); if (ret <= 0) goto out; offset = iocb->ki_pos; count = iov_iter_count(from); if (offset + count > EXT4_I(inode)->i_disksize) { handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; } ret = ext4_orphan_add(handle, inode); if (ret) { ext4_journal_stop(handle); goto out; } extend = true; ext4_journal_stop(handle); } ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); if (extend) { ret = ext4_handle_inode_extension(inode, offset, ret); ext4_inode_extension_cleanup(inode, ret); } out: inode_unlock(inode); if (ret > 0) ret = generic_write_sync(iocb, ret); return ret; } #endif static ssize_t ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; #ifdef CONFIG_FS_DAX if (IS_DAX(inode)) return ext4_dax_write_iter(iocb, from); #endif if (iocb->ki_flags & IOCB_DIRECT) return ext4_dio_write_iter(iocb, from); else return ext4_buffered_write_iter(iocb, from); } #ifdef CONFIG_FS_DAX static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, unsigned int order) { int error = 0; vm_fault_t result; int retries = 0; handle_t *handle = NULL; struct inode *inode = file_inode(vmf->vma->vm_file); struct super_block *sb = inode->i_sb; /* * We have to distinguish real writes from writes which will result in a * COW page; COW writes should *not* poke the journal (the file will not * be changed). Doing so would cause unintended failures when mounted * read-only. * * We check for VM_SHARED rather than vmf->cow_page since the latter is * unset for order != 0 (i.e. only in do_cow_fault); for * other sizes, dax_iomap_fault will handle splitting / fallback so that * we eventually come back with a COW page. */ bool write = (vmf->flags & FAULT_FLAG_WRITE) && (vmf->vma->vm_flags & VM_SHARED); struct address_space *mapping = vmf->vma->vm_file->f_mapping; pfn_t pfn; if (write) { sb_start_pagefault(sb); file_update_time(vmf->vma->vm_file); filemap_invalidate_lock_shared(mapping); retry: handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, EXT4_DATA_TRANS_BLOCKS(sb)); if (IS_ERR(handle)) { filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(sb); return VM_FAULT_SIGBUS; } } else { filemap_invalidate_lock_shared(mapping); } result = dax_iomap_fault(vmf, order, &pfn, &error, &ext4_iomap_ops); if (write) { ext4_journal_stop(handle); if ((result & VM_FAULT_ERROR) && error == -ENOSPC && ext4_should_retry_alloc(sb, &retries)) goto retry; /* Handling synchronous page fault? */ if (result & VM_FAULT_NEEDDSYNC) result = dax_finish_sync_fault(vmf, order, pfn); filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(sb); } else { filemap_invalidate_unlock_shared(mapping); } return result; } static vm_fault_t ext4_dax_fault(struct vm_fault *vmf) { return ext4_dax_huge_fault(vmf, 0); } static const struct vm_operations_struct ext4_dax_vm_ops = { .fault = ext4_dax_fault, .huge_fault = ext4_dax_huge_fault, .page_mkwrite = ext4_dax_fault, .pfn_mkwrite = ext4_dax_fault, }; #else #define ext4_dax_vm_ops ext4_file_vm_ops #endif static const struct vm_operations_struct ext4_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = ext4_page_mkwrite, }; static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file->f_mapping->host; struct dax_device *dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; /* * We don't support synchronous mappings for non-DAX files and * for DAX files if underneath dax_device is not synchronous. */ if (!daxdev_mapping_supported(vma, dax_dev)) return -EOPNOTSUPP; file_accessed(file); if (IS_DAX(file_inode(file))) { vma->vm_ops = &ext4_dax_vm_ops; vm_flags_set(vma, VM_HUGEPAGE); } else { vma->vm_ops = &ext4_file_vm_ops; } return 0; } static int ext4_sample_last_mounted(struct super_block *sb, struct vfsmount *mnt) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct path path; char buf[64], *cp; handle_t *handle; int err; if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED))) return 0; if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb)) return 0; ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED); /* * Sample where the filesystem has been mounted and * store it in the superblock for sysadmin convenience * when trying to sort through large numbers of block * devices or filesystem images. */ memset(buf, 0, sizeof(buf)); path.mnt = mnt; path.dentry = mnt->mnt_root; cp = d_path(&path, buf, sizeof(buf)); err = 0; if (IS_ERR(cp)) goto out; handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); err = PTR_ERR(handle); if (IS_ERR(handle)) goto out; BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, EXT4_JTR_NONE); if (err) goto out_journal; lock_buffer(sbi->s_sbh); strncpy(sbi->s_es->s_last_mounted, cp, sizeof(sbi->s_es->s_last_mounted)); ext4_superblock_csum_set(sb); unlock_buffer(sbi->s_sbh); ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); out_journal: ext4_journal_stop(handle); out: sb_end_intwrite(sb); return err; } static int ext4_file_open(struct inode *inode, struct file *filp) { int ret; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt); if (ret) return ret; ret = fscrypt_file_open(inode, filp); if (ret) return ret; ret = fsverity_file_open(inode, filp); if (ret) return ret; /* * Set up the jbd2_inode if we are opening the inode for * writing and the journal is present */ if (filp->f_mode & FMODE_WRITE) { ret = ext4_inode_attach_jinode(inode); if (ret < 0) return ret; } filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_DIO_PARALLEL_WRITE; return dquot_file_open(inode, filp); } /* * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values * by calling generic_file_llseek_size() with the appropriate maxbytes * value for each. */ loff_t ext4_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; loff_t maxbytes; if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; else maxbytes = inode->i_sb->s_maxbytes; switch (whence) { default: return generic_file_llseek_size(file, offset, whence, maxbytes, i_size_read(inode)); case SEEK_HOLE: inode_lock_shared(inode); offset = iomap_seek_hole(inode, offset, &ext4_iomap_report_ops); inode_unlock_shared(inode); break; case SEEK_DATA: inode_lock_shared(inode); offset = iomap_seek_data(inode, offset, &ext4_iomap_report_ops); inode_unlock_shared(inode); break; } if (offset < 0) return offset; return vfs_setpos(file, offset, maxbytes); } const struct file_operations ext4_file_operations = { .llseek = ext4_llseek, .read_iter = ext4_file_read_iter, .write_iter = ext4_file_write_iter, .iopoll = iocb_bio_iopoll, .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, #endif .mmap = ext4_file_mmap, .mmap_supported_flags = MAP_SYNC, .open = ext4_file_open, .release = ext4_release_file, .fsync = ext4_sync_file, .get_unmapped_area = thp_get_unmapped_area, .splice_read = ext4_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ext4_fallocate, }; const struct inode_operations ext4_file_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_file_getattr, .listxattr = ext4_listxattr, .get_inode_acl = ext4_get_acl, .set_acl = ext4_set_acl, .fiemap = ext4_fiemap, .fileattr_get = ext4_fileattr_get, .fileattr_set = ext4_fileattr_set, };
1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux I2C core ACPI support code * * Copyright (C) 2014 Intel Corp, Author: Lan Tianyu <tianyu.lan@intel.com> */ #include <linux/acpi.h> #include <linux/device.h> #include <linux/err.h> #include <linux/i2c.h> #include <linux/list.h> #include <linux/module.h> #include <linux/slab.h> #include "i2c-core.h" struct i2c_acpi_handler_data { struct acpi_connection_info info; struct i2c_adapter *adapter; }; struct gsb_buffer { u8 status; u8 len; union { u16 wdata; u8 bdata; DECLARE_FLEX_ARRAY(u8, data); }; } __packed; struct i2c_acpi_lookup { struct i2c_board_info *info; acpi_handle adapter_handle; acpi_handle device_handle; acpi_handle search_handle; int n; int index; u32 speed; u32 min_speed; u32 force_speed; }; /** * i2c_acpi_get_i2c_resource - Gets I2cSerialBus resource if type matches * @ares: ACPI resource * @i2c: Pointer to I2cSerialBus resource will be returned here * * Checks if the given ACPI resource is of type I2cSerialBus. * In this case, returns a pointer to it to the caller. * * Returns true if resource type is of I2cSerialBus, otherwise false. */ bool i2c_acpi_get_i2c_resource(struct acpi_resource *ares, struct acpi_resource_i2c_serialbus **i2c) { struct acpi_resource_i2c_serialbus *sb; if (ares->type != ACPI_RESOURCE_TYPE_SERIAL_BUS) return false; sb = &ares->data.i2c_serial_bus; if (sb->type != ACPI_RESOURCE_SERIAL_TYPE_I2C) return false; *i2c = sb; return true; } EXPORT_SYMBOL_GPL(i2c_acpi_get_i2c_resource); static int i2c_acpi_resource_count(struct acpi_resource *ares, void *data) { struct acpi_resource_i2c_serialbus *sb; int *count = data; if (i2c_acpi_get_i2c_resource(ares, &sb)) *count = *count + 1; return 1; } /** * i2c_acpi_client_count - Count the number of I2cSerialBus resources * @adev: ACPI device * * Returns the number of I2cSerialBus resources in the ACPI-device's * resource-list; or a negative error code. */ int i2c_acpi_client_count(struct acpi_device *adev) { int ret, count = 0; LIST_HEAD(r); ret = acpi_dev_get_resources(adev, &r, i2c_acpi_resource_count, &count); if (ret < 0) return ret; acpi_dev_free_resource_list(&r); return count; } EXPORT_SYMBOL_GPL(i2c_acpi_client_count); static int i2c_acpi_fill_info(struct acpi_resource *ares, void *data) { struct i2c_acpi_lookup *lookup = data; struct i2c_board_info *info = lookup->info; struct acpi_resource_i2c_serialbus *sb; acpi_status status; if (info->addr || !i2c_acpi_get_i2c_resource(ares, &sb)) return 1; if (lookup->index != -1 && lookup->n++ != lookup->index) return 1; status = acpi_get_handle(lookup->device_handle, sb->resource_source.string_ptr, &lookup->adapter_handle); if (ACPI_FAILURE(status)) return 1; info->addr = sb->slave_address; lookup->speed = sb->connection_speed; if (sb->access_mode == ACPI_I2C_10BIT_MODE) info->flags |= I2C_CLIENT_TEN; return 1; } static const struct acpi_device_id i2c_acpi_ignored_device_ids[] = { /* * ACPI video acpi_devices, which are handled by the acpi-video driver * sometimes contain a SERIAL_TYPE_I2C ACPI resource, ignore these. */ { ACPI_VIDEO_HID, 0 }, {} }; struct i2c_acpi_irq_context { int irq; bool wake_capable; }; static int i2c_acpi_do_lookup(struct acpi_device *adev, struct i2c_acpi_lookup *lookup) { struct i2c_board_info *info = lookup->info; struct list_head resource_list; int ret; if (acpi_bus_get_status(adev)) return -EINVAL; if (!acpi_dev_ready_for_enumeration(adev)) return -ENODEV; if (acpi_match_device_ids(adev, i2c_acpi_ignored_device_ids) == 0) return -ENODEV; memset(info, 0, sizeof(*info)); lookup->device_handle = acpi_device_handle(adev); /* Look up for I2cSerialBus resource */ INIT_LIST_HEAD(&resource_list); ret = acpi_dev_get_resources(adev, &resource_list, i2c_acpi_fill_info, lookup); acpi_dev_free_resource_list(&resource_list); if (ret < 0 || !info->addr) return -EINVAL; return 0; } static int i2c_acpi_add_irq_resource(struct acpi_resource *ares, void *data) { struct i2c_acpi_irq_context *irq_ctx = data; struct resource r; if (irq_ctx->irq > 0) return 1; if (!acpi_dev_resource_interrupt(ares, 0, &r)) return 1; irq_ctx->irq = i2c_dev_irq_from_resources(&r, 1); irq_ctx->wake_capable = r.flags & IORESOURCE_IRQ_WAKECAPABLE; return 1; /* No need to add resource to the list */ } /** * i2c_acpi_get_irq - get device IRQ number from ACPI * @client: Pointer to the I2C client device * @wake_capable: Set to true if the IRQ is wake capable * * Find the IRQ number used by a specific client device. * * Return: The IRQ number or an error code. */ int i2c_acpi_get_irq(struct i2c_client *client, bool *wake_capable) { struct acpi_device *adev = ACPI_COMPANION(&client->dev); struct list_head resource_list; struct i2c_acpi_irq_context irq_ctx = { .irq = -ENOENT, }; int ret; INIT_LIST_HEAD(&resource_list); ret = acpi_dev_get_resources(adev, &resource_list, i2c_acpi_add_irq_resource, &irq_ctx); if (ret < 0) return ret; acpi_dev_free_resource_list(&resource_list); if (irq_ctx.irq == -ENOENT) irq_ctx.irq = acpi_dev_gpio_irq_wake_get(adev, 0, &irq_ctx.wake_capable); if (irq_ctx.irq < 0) return irq_ctx.irq; if (wake_capable) *wake_capable = irq_ctx.wake_capable; return irq_ctx.irq; } static int i2c_acpi_get_info(struct acpi_device *adev, struct i2c_board_info *info, struct i2c_adapter *adapter, acpi_handle *adapter_handle) { struct i2c_acpi_lookup lookup; int ret; memset(&lookup, 0, sizeof(lookup)); lookup.info = info; lookup.index = -1; if (acpi_device_enumerated(adev)) return -EINVAL; ret = i2c_acpi_do_lookup(adev, &lookup); if (ret) return ret; if (adapter) { /* The adapter must match the one in I2cSerialBus() connector */ if (ACPI_HANDLE(&adapter->dev) != lookup.adapter_handle) return -ENODEV; } else { struct acpi_device *adapter_adev; /* The adapter must be present */ adapter_adev = acpi_fetch_acpi_dev(lookup.adapter_handle); if (!adapter_adev) return -ENODEV; if (acpi_bus_get_status(adapter_adev) || !adapter_adev->status.present) return -ENODEV; } info->fwnode = acpi_fwnode_handle(adev); if (adapter_handle) *adapter_handle = lookup.adapter_handle; acpi_set_modalias(adev, dev_name(&adev->dev), info->type, sizeof(info->type)); return 0; } static void i2c_acpi_register_device(struct i2c_adapter *adapter, struct acpi_device *adev, struct i2c_board_info *info) { /* * Skip registration on boards where the ACPI tables are * known to contain bogus I2C devices. */ if (acpi_quirk_skip_i2c_client_enumeration(adev)) return; adev->power.flags.ignore_parent = true; acpi_device_set_enumerated(adev); if (IS_ERR(i2c_new_client_device(adapter, info))) adev->power.flags.ignore_parent = false; } static acpi_status i2c_acpi_add_device(acpi_handle handle, u32 level, void *data, void **return_value) { struct i2c_adapter *adapter = data; struct acpi_device *adev = acpi_fetch_acpi_dev(handle); struct i2c_board_info info; if (!adev || i2c_acpi_get_info(adev, &info, adapter, NULL)) return AE_OK; i2c_acpi_register_device(adapter, adev, &info); return AE_OK; } #define I2C_ACPI_MAX_SCAN_DEPTH 32 /** * i2c_acpi_register_devices - enumerate I2C slave devices behind adapter * @adap: pointer to adapter * * Enumerate all I2C slave devices behind this adapter by walking the ACPI * namespace. When a device is found it will be added to the Linux device * model and bound to the corresponding ACPI handle. */ void i2c_acpi_register_devices(struct i2c_adapter *adap) { struct acpi_device *adev; acpi_status status; if (!has_acpi_companion(&adap->dev)) return; status = acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT, I2C_ACPI_MAX_SCAN_DEPTH, i2c_acpi_add_device, NULL, adap, NULL); if (ACPI_FAILURE(status)) dev_warn(&adap->dev, "failed to enumerate I2C slaves\n"); if (!adap->dev.parent) return; adev = ACPI_COMPANION(adap->dev.parent); if (!adev) return; acpi_dev_clear_dependencies(adev); } static const struct acpi_device_id i2c_acpi_force_400khz_device_ids[] = { /* * These Silead touchscreen controllers only work at 400KHz, for * some reason they do not work at 100KHz. On some devices the ACPI * tables list another device at their bus as only being capable * of 100KHz, testing has shown that these other devices work fine * at 400KHz (as can be expected of any recent i2c hw) so we force * the speed of the bus to 400 KHz if a Silead device is present. */ { "MSSL1680", 0 }, {} }; static acpi_status i2c_acpi_lookup_speed(acpi_handle handle, u32 level, void *data, void **return_value) { struct i2c_acpi_lookup *lookup = data; struct acpi_device *adev = acpi_fetch_acpi_dev(handle); if (!adev || i2c_acpi_do_lookup(adev, lookup)) return AE_OK; if (lookup->search_handle != lookup->adapter_handle) return AE_OK; if (lookup->speed <= lookup->min_speed) lookup->min_speed = lookup->speed; if (acpi_match_device_ids(adev, i2c_acpi_force_400khz_device_ids) == 0) lookup->force_speed = I2C_MAX_FAST_MODE_FREQ; return AE_OK; } /** * i2c_acpi_find_bus_speed - find I2C bus speed from ACPI * @dev: The device owning the bus * * Find the I2C bus speed by walking the ACPI namespace for all I2C slaves * devices connected to this bus and use the speed of slowest device. * * Returns the speed in Hz or zero */ u32 i2c_acpi_find_bus_speed(struct device *dev) { struct i2c_acpi_lookup lookup; struct i2c_board_info dummy; acpi_status status; if (!has_acpi_companion(dev)) return 0; memset(&lookup, 0, sizeof(lookup)); lookup.search_handle = ACPI_HANDLE(dev); lookup.min_speed = UINT_MAX; lookup.info = &dummy; lookup.index = -1; status = acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT, I2C_ACPI_MAX_SCAN_DEPTH, i2c_acpi_lookup_speed, NULL, &lookup, NULL); if (ACPI_FAILURE(status)) { dev_warn(dev, "unable to find I2C bus speed from ACPI\n"); return 0; } if (lookup.force_speed) { if (lookup.force_speed != lookup.min_speed) dev_warn(dev, FW_BUG "DSDT uses known not-working I2C bus speed %d, forcing it to %d\n", lookup.min_speed, lookup.force_speed); return lookup.force_speed; } else if (lookup.min_speed != UINT_MAX) { return lookup.min_speed; } else { return 0; } } EXPORT_SYMBOL_GPL(i2c_acpi_find_bus_speed); struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle) { struct i2c_adapter *adapter; struct device *dev; dev = bus_find_device(&i2c_bus_type, NULL, handle, device_match_acpi_handle); if (!dev) return NULL; adapter = i2c_verify_adapter(dev); if (!adapter) put_device(dev); return adapter; } EXPORT_SYMBOL_GPL(i2c_acpi_find_adapter_by_handle); static struct i2c_client *i2c_acpi_find_client_by_adev(struct acpi_device *adev) { return i2c_find_device_by_fwnode(acpi_fwnode_handle(adev)); } static int i2c_acpi_notify(struct notifier_block *nb, unsigned long value, void *arg) { struct acpi_device *adev = arg; struct i2c_board_info info; acpi_handle adapter_handle; struct i2c_adapter *adapter; struct i2c_client *client; switch (value) { case ACPI_RECONFIG_DEVICE_ADD: if (i2c_acpi_get_info(adev, &info, NULL, &adapter_handle)) break; adapter = i2c_acpi_find_adapter_by_handle(adapter_handle); if (!adapter) break; i2c_acpi_register_device(adapter, adev, &info); put_device(&adapter->dev); break; case ACPI_RECONFIG_DEVICE_REMOVE: if (!acpi_device_enumerated(adev)) break; client = i2c_acpi_find_client_by_adev(adev); if (!client) break; i2c_unregister_device(client); put_device(&client->dev); break; } return NOTIFY_OK; } struct notifier_block i2c_acpi_notifier = { .notifier_call = i2c_acpi_notify, }; /** * i2c_acpi_new_device_by_fwnode - Create i2c-client for the Nth I2cSerialBus resource * @fwnode: fwnode with the ACPI resources to get the client from * @index: Index of ACPI resource to get * @info: describes the I2C device; note this is modified (addr gets set) * Context: can sleep * * By default the i2c subsys creates an i2c-client for the first I2cSerialBus * resource of an acpi_device, but some acpi_devices have multiple I2cSerialBus * resources, in that case this function can be used to create an i2c-client * for other I2cSerialBus resources in the Current Resource Settings table. * * Also see i2c_new_client_device, which this function calls to create the * i2c-client. * * Returns a pointer to the new i2c-client, or error pointer in case of failure. * Specifically, -EPROBE_DEFER is returned if the adapter is not found. */ struct i2c_client *i2c_acpi_new_device_by_fwnode(struct fwnode_handle *fwnode, int index, struct i2c_board_info *info) { struct i2c_acpi_lookup lookup; struct i2c_adapter *adapter; struct acpi_device *adev; LIST_HEAD(resource_list); int ret; adev = to_acpi_device_node(fwnode); if (!adev) return ERR_PTR(-ENODEV); memset(&lookup, 0, sizeof(lookup)); lookup.info = info; lookup.device_handle = acpi_device_handle(adev); lookup.index = index; ret = acpi_dev_get_resources(adev, &resource_list, i2c_acpi_fill_info, &lookup); if (ret < 0) return ERR_PTR(ret); acpi_dev_free_resource_list(&resource_list); if (!info->addr) return ERR_PTR(-EADDRNOTAVAIL); adapter = i2c_acpi_find_adapter_by_handle(lookup.adapter_handle); if (!adapter) return ERR_PTR(-EPROBE_DEFER); return i2c_new_client_device(adapter, info); } EXPORT_SYMBOL_GPL(i2c_acpi_new_device_by_fwnode); bool i2c_acpi_waive_d0_probe(struct device *dev) { struct i2c_driver *driver = to_i2c_driver(dev->driver); struct acpi_device *adev = ACPI_COMPANION(dev); return driver->flags & I2C_DRV_ACPI_WAIVE_D0_PROBE && adev && adev->power.state_for_enumeration >= adev->power.state; } EXPORT_SYMBOL_GPL(i2c_acpi_waive_d0_probe); #ifdef CONFIG_ACPI_I2C_OPREGION static int acpi_gsb_i2c_read_bytes(struct i2c_client *client, u8 cmd, u8 *data, u8 data_len) { struct i2c_msg msgs[2]; int ret; u8 *buffer; buffer = kzalloc(data_len, GFP_KERNEL); if (!buffer) return AE_NO_MEMORY; msgs[0].addr = client->addr; msgs[0].flags = client->flags; msgs[0].len = 1; msgs[0].buf = &cmd; msgs[1].addr = client->addr; msgs[1].flags = client->flags | I2C_M_RD; msgs[1].len = data_len; msgs[1].buf = buffer; ret = i2c_transfer(client->adapter, msgs, ARRAY_SIZE(msgs)); if (ret < 0) { /* Getting a NACK is unfortunately normal with some DSTDs */ if (ret == -EREMOTEIO) dev_dbg(&client->adapter->dev, "i2c read %d bytes from client@%#x starting at reg %#x failed, error: %d\n", data_len, client->addr, cmd, ret); else dev_err(&client->adapter->dev, "i2c read %d bytes from client@%#x starting at reg %#x failed, error: %d\n", data_len, client->addr, cmd, ret); /* 2 transfers must have completed successfully */ } else if (ret == 2) { memcpy(data, buffer, data_len); ret = 0; } else { ret = -EIO; } kfree(buffer); return ret; } static int acpi_gsb_i2c_write_bytes(struct i2c_client *client, u8 cmd, u8 *data, u8 data_len) { struct i2c_msg msgs[1]; u8 *buffer; int ret = AE_OK; buffer = kzalloc(data_len + 1, GFP_KERNEL); if (!buffer) return AE_NO_MEMORY; buffer[0] = cmd; memcpy(buffer + 1, data, data_len); msgs[0].addr = client->addr; msgs[0].flags = client->flags; msgs[0].len = data_len + 1; msgs[0].buf = buffer; ret = i2c_transfer(client->adapter, msgs, ARRAY_SIZE(msgs)); kfree(buffer); if (ret < 0) { dev_err(&client->adapter->dev, "i2c write failed: %d\n", ret); return ret; } /* 1 transfer must have completed successfully */ return (ret == 1) ? 0 : -EIO; } static acpi_status i2c_acpi_space_handler(u32 function, acpi_physical_address command, u32 bits, u64 *value64, void *handler_context, void *region_context) { struct gsb_buffer *gsb = (struct gsb_buffer *)value64; struct i2c_acpi_handler_data *data = handler_context; struct acpi_connection_info *info = &data->info; struct acpi_resource_i2c_serialbus *sb; struct i2c_adapter *adapter = data->adapter; struct i2c_client *client; struct acpi_resource *ares; u32 accessor_type = function >> 16; u8 action = function & ACPI_IO_MASK; acpi_status ret; int status; ret = acpi_buffer_to_resource(info->connection, info->length, &ares); if (ACPI_FAILURE(ret)) return ret; client = kzalloc(sizeof(*client), GFP_KERNEL); if (!client) { ret = AE_NO_MEMORY; goto err; } if (!value64 || !i2c_acpi_get_i2c_resource(ares, &sb)) { ret = AE_BAD_PARAMETER; goto err; } client->adapter = adapter; client->addr = sb->slave_address; if (sb->access_mode == ACPI_I2C_10BIT_MODE) client->flags |= I2C_CLIENT_TEN; switch (accessor_type) { case ACPI_GSB_ACCESS_ATTRIB_SEND_RCV: if (action == ACPI_READ) { status = i2c_smbus_read_byte(client); if (status >= 0) { gsb->bdata = status; status = 0; } } else { status = i2c_smbus_write_byte(client, gsb->bdata); } break; case ACPI_GSB_ACCESS_ATTRIB_BYTE: if (action == ACPI_READ) { status = i2c_smbus_read_byte_data(client, command); if (status >= 0) { gsb->bdata = status; status = 0; } } else { status = i2c_smbus_write_byte_data(client, command, gsb->bdata); } break; case ACPI_GSB_ACCESS_ATTRIB_WORD: if (action == ACPI_READ) { status = i2c_smbus_read_word_data(client, command); if (status >= 0) { gsb->wdata = status; status = 0; } } else { status = i2c_smbus_write_word_data(client, command, gsb->wdata); } break; case ACPI_GSB_ACCESS_ATTRIB_BLOCK: if (action == ACPI_READ) { status = i2c_smbus_read_block_data(client, command, gsb->data); if (status >= 0) { gsb->len = status; status = 0; } } else { status = i2c_smbus_write_block_data(client, command, gsb->len, gsb->data); } break; case ACPI_GSB_ACCESS_ATTRIB_MULTIBYTE: if (action == ACPI_READ) { status = acpi_gsb_i2c_read_bytes(client, command, gsb->data, info->access_length); } else { status = acpi_gsb_i2c_write_bytes(client, command, gsb->data, info->access_length); } break; default: dev_warn(&adapter->dev, "protocol 0x%02x not supported for client 0x%02x\n", accessor_type, client->addr); ret = AE_BAD_PARAMETER; goto err; } gsb->status = status; err: kfree(client); ACPI_FREE(ares); return ret; } int i2c_acpi_install_space_handler(struct i2c_adapter *adapter) { acpi_handle handle; struct i2c_acpi_handler_data *data; acpi_status status; if (!adapter->dev.parent) return -ENODEV; handle = ACPI_HANDLE(adapter->dev.parent); if (!handle) return -ENODEV; data = kzalloc(sizeof(struct i2c_acpi_handler_data), GFP_KERNEL); if (!data) return -ENOMEM; data->adapter = adapter; status = acpi_bus_attach_private_data(handle, (void *)data); if (ACPI_FAILURE(status)) { kfree(data); return -ENOMEM; } status = acpi_install_address_space_handler(handle, ACPI_ADR_SPACE_GSBUS, &i2c_acpi_space_handler, NULL, data); if (ACPI_FAILURE(status)) { dev_err(&adapter->dev, "Error installing i2c space handler\n"); acpi_bus_detach_private_data(handle); kfree(data); return -ENOMEM; } return 0; } void i2c_acpi_remove_space_handler(struct i2c_adapter *adapter) { acpi_handle handle; struct i2c_acpi_handler_data *data; acpi_status status; if (!adapter->dev.parent) return; handle = ACPI_HANDLE(adapter->dev.parent); if (!handle) return; acpi_remove_address_space_handler(handle, ACPI_ADR_SPACE_GSBUS, &i2c_acpi_space_handler); status = acpi_bus_get_private_data(handle, (void **)&data); if (ACPI_SUCCESS(status)) kfree(data); acpi_bus_detach_private_data(handle); } #endif /* CONFIG_ACPI_I2C_OPREGION */
4 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 // SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright Tomi Manninen OH2BNS (oh2bns@sral.fi) */ #include <linux/types.h> #include <linux/slab.h> #include <linux/socket.h> #include <linux/timer.h> #include <net/ax25.h> #include <linux/skbuff.h> #include <net/netrom.h> #include <linux/init.h> static void nr_loopback_timer(struct timer_list *); static struct sk_buff_head loopback_queue; static DEFINE_TIMER(loopback_timer, nr_loopback_timer); void __init nr_loopback_init(void) { skb_queue_head_init(&loopback_queue); } static inline int nr_loopback_running(void) { return timer_pending(&loopback_timer); } int nr_loopback_queue(struct sk_buff *skb) { struct sk_buff *skbn; if ((skbn = alloc_skb(skb->len, GFP_ATOMIC)) != NULL) { skb_copy_from_linear_data(skb, skb_put(skbn, skb->len), skb->len); skb_reset_transport_header(skbn); skb_queue_tail(&loopback_queue, skbn); if (!nr_loopback_running()) mod_timer(&loopback_timer, jiffies + 10); } kfree_skb(skb); return 1; } static void nr_loopback_timer(struct timer_list *unused) { struct sk_buff *skb; ax25_address *nr_dest; struct net_device *dev; if ((skb = skb_dequeue(&loopback_queue)) != NULL) { nr_dest = (ax25_address *)(skb->data + 7); dev = nr_dev_get(nr_dest); if (dev == NULL || nr_rx_frame(skb, dev) == 0) kfree_skb(skb); dev_put(dev); if (!skb_queue_empty(&loopback_queue) && !nr_loopback_running()) mod_timer(&loopback_timer, jiffies + 10); } } void nr_loopback_clear(void) { del_timer_sync(&loopback_timer); skb_queue_purge(&loopback_queue); }
2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 // SPDX-License-Identifier: GPL-2.0 /* * fs/bfs/dir.c * BFS directory operations. * Copyright (C) 1999-2018 Tigran Aivazian <aivazian.tigran@gmail.com> * Made endianness-clean by Andrew Stribblehill <ads@wompom.org> 2005 */ #include <linux/time.h> #include <linux/string.h> #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/sched.h> #include "bfs.h" #undef DEBUG #ifdef DEBUG #define dprintf(x...) printf(x) #else #define dprintf(x...) #endif static int bfs_add_entry(struct inode *dir, const struct qstr *child, int ino); static struct buffer_head *bfs_find_entry(struct inode *dir, const struct qstr *child, struct bfs_dirent **res_dir); static int bfs_readdir(struct file *f, struct dir_context *ctx) { struct inode *dir = file_inode(f); struct buffer_head *bh; struct bfs_dirent *de; unsigned int offset; int block; if (ctx->pos & (BFS_DIRENT_SIZE - 1)) { printf("Bad f_pos=%08lx for %s:%08lx\n", (unsigned long)ctx->pos, dir->i_sb->s_id, dir->i_ino); return -EINVAL; } while (ctx->pos < dir->i_size) { offset = ctx->pos & (BFS_BSIZE - 1); block = BFS_I(dir)->i_sblock + (ctx->pos >> BFS_BSIZE_BITS); bh = sb_bread(dir->i_sb, block); if (!bh) { ctx->pos += BFS_BSIZE - offset; continue; } do { de = (struct bfs_dirent *)(bh->b_data + offset); if (de->ino) { int size = strnlen(de->name, BFS_NAMELEN); if (!dir_emit(ctx, de->name, size, le16_to_cpu(de->ino), DT_UNKNOWN)) { brelse(bh); return 0; } } offset += BFS_DIRENT_SIZE; ctx->pos += BFS_DIRENT_SIZE; } while ((offset < BFS_BSIZE) && (ctx->pos < dir->i_size)); brelse(bh); } return 0; } const struct file_operations bfs_dir_operations = { .read = generic_read_dir, .iterate_shared = bfs_readdir, .fsync = generic_file_fsync, .llseek = generic_file_llseek, }; static int bfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { int err; struct inode *inode; struct super_block *s = dir->i_sb; struct bfs_sb_info *info = BFS_SB(s); unsigned long ino; inode = new_inode(s); if (!inode) return -ENOMEM; mutex_lock(&info->bfs_lock); ino = find_first_zero_bit(info->si_imap, info->si_lasti + 1); if (ino > info->si_lasti) { mutex_unlock(&info->bfs_lock); iput(inode); return -ENOSPC; } set_bit(ino, info->si_imap); info->si_freei--; inode_init_owner(&nop_mnt_idmap, inode, dir, mode); simple_inode_init_ts(inode); inode->i_blocks = 0; inode->i_op = &bfs_file_inops; inode->i_fop = &bfs_file_operations; inode->i_mapping->a_ops = &bfs_aops; inode->i_ino = ino; BFS_I(inode)->i_dsk_ino = ino; BFS_I(inode)->i_sblock = 0; BFS_I(inode)->i_eblock = 0; insert_inode_hash(inode); mark_inode_dirty(inode); bfs_dump_imap("create", s); err = bfs_add_entry(dir, &dentry->d_name, inode->i_ino); if (err) { inode_dec_link_count(inode); mutex_unlock(&info->bfs_lock); iput(inode); return err; } mutex_unlock(&info->bfs_lock); d_instantiate(dentry, inode); return 0; } static struct dentry *bfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode = NULL; struct buffer_head *bh; struct bfs_dirent *de; struct bfs_sb_info *info = BFS_SB(dir->i_sb); if (dentry->d_name.len > BFS_NAMELEN) return ERR_PTR(-ENAMETOOLONG); mutex_lock(&info->bfs_lock); bh = bfs_find_entry(dir, &dentry->d_name, &de); if (bh) { unsigned long ino = (unsigned long)le16_to_cpu(de->ino); brelse(bh); inode = bfs_iget(dir->i_sb, ino); } mutex_unlock(&info->bfs_lock); return d_splice_alias(inode, dentry); } static int bfs_link(struct dentry *old, struct inode *dir, struct dentry *new) { struct inode *inode = d_inode(old); struct bfs_sb_info *info = BFS_SB(inode->i_sb); int err; mutex_lock(&info->bfs_lock); err = bfs_add_entry(dir, &new->d_name, inode->i_ino); if (err) { mutex_unlock(&info->bfs_lock); return err; } inc_nlink(inode); inode_set_ctime_current(inode); mark_inode_dirty(inode); ihold(inode); d_instantiate(new, inode); mutex_unlock(&info->bfs_lock); return 0; } static int bfs_unlink(struct inode *dir, struct dentry *dentry) { int error = -ENOENT; struct inode *inode = d_inode(dentry); struct buffer_head *bh; struct bfs_dirent *de; struct bfs_sb_info *info = BFS_SB(inode->i_sb); mutex_lock(&info->bfs_lock); bh = bfs_find_entry(dir, &dentry->d_name, &de); if (!bh || (le16_to_cpu(de->ino) != inode->i_ino)) goto out_brelse; if (!inode->i_nlink) { printf("unlinking non-existent file %s:%lu (nlink=%d)\n", inode->i_sb->s_id, inode->i_ino, inode->i_nlink); set_nlink(inode, 1); } de->ino = 0; mark_buffer_dirty_inode(bh, dir); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); inode_dec_link_count(inode); error = 0; out_brelse: brelse(bh); mutex_unlock(&info->bfs_lock); return error; } static int bfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct inode *old_inode, *new_inode; struct buffer_head *old_bh, *new_bh; struct bfs_dirent *old_de, *new_de; struct bfs_sb_info *info; int error = -ENOENT; if (flags & ~RENAME_NOREPLACE) return -EINVAL; old_bh = new_bh = NULL; old_inode = d_inode(old_dentry); if (S_ISDIR(old_inode->i_mode)) return -EINVAL; info = BFS_SB(old_inode->i_sb); mutex_lock(&info->bfs_lock); old_bh = bfs_find_entry(old_dir, &old_dentry->d_name, &old_de); if (!old_bh || (le16_to_cpu(old_de->ino) != old_inode->i_ino)) goto end_rename; error = -EPERM; new_inode = d_inode(new_dentry); new_bh = bfs_find_entry(new_dir, &new_dentry->d_name, &new_de); if (new_bh && !new_inode) { brelse(new_bh); new_bh = NULL; } if (!new_bh) { error = bfs_add_entry(new_dir, &new_dentry->d_name, old_inode->i_ino); if (error) goto end_rename; } old_de->ino = 0; inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir)); mark_inode_dirty(old_dir); if (new_inode) { inode_set_ctime_current(new_inode); inode_dec_link_count(new_inode); } mark_buffer_dirty_inode(old_bh, old_dir); error = 0; end_rename: mutex_unlock(&info->bfs_lock); brelse(old_bh); brelse(new_bh); return error; } const struct inode_operations bfs_dir_inops = { .create = bfs_create, .lookup = bfs_lookup, .link = bfs_link, .unlink = bfs_unlink, .rename = bfs_rename, }; static int bfs_add_entry(struct inode *dir, const struct qstr *child, int ino) { const unsigned char *name = child->name; int namelen = child->len; struct buffer_head *bh; struct bfs_dirent *de; int block, sblock, eblock, off, pos; int i; dprintf("name=%s, namelen=%d\n", name, namelen); sblock = BFS_I(dir)->i_sblock; eblock = BFS_I(dir)->i_eblock; for (block = sblock; block <= eblock; block++) { bh = sb_bread(dir->i_sb, block); if (!bh) return -EIO; for (off = 0; off < BFS_BSIZE; off += BFS_DIRENT_SIZE) { de = (struct bfs_dirent *)(bh->b_data + off); if (!de->ino) { pos = (block - sblock) * BFS_BSIZE + off; if (pos >= dir->i_size) { dir->i_size += BFS_DIRENT_SIZE; inode_set_ctime_current(dir); } inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); de->ino = cpu_to_le16((u16)ino); for (i = 0; i < BFS_NAMELEN; i++) de->name[i] = (i < namelen) ? name[i] : 0; mark_buffer_dirty_inode(bh, dir); brelse(bh); return 0; } } brelse(bh); } return -ENOSPC; } static inline int bfs_namecmp(int len, const unsigned char *name, const char *buffer) { if ((len < BFS_NAMELEN) && buffer[len]) return 0; return !memcmp(name, buffer, len); } static struct buffer_head *bfs_find_entry(struct inode *dir, const struct qstr *child, struct bfs_dirent **res_dir) { unsigned long block = 0, offset = 0; struct buffer_head *bh = NULL; struct bfs_dirent *de; const unsigned char *name = child->name; int namelen = child->len; *res_dir = NULL; if (namelen > BFS_NAMELEN) return NULL; while (block * BFS_BSIZE + offset < dir->i_size) { if (!bh) { bh = sb_bread(dir->i_sb, BFS_I(dir)->i_sblock + block); if (!bh) { block++; continue; } } de = (struct bfs_dirent *)(bh->b_data + offset); offset += BFS_DIRENT_SIZE; if (le16_to_cpu(de->ino) && bfs_namecmp(namelen, name, de->name)) { *res_dir = de; return bh; } if (offset < bh->b_size) continue; brelse(bh); bh = NULL; offset = 0; block++; } brelse(bh); return NULL; }
1 1 1 2 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 /* * Copyright (c) 2006 Oracle. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/percpu.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/export.h> #include "rds.h" DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); EXPORT_PER_CPU_SYMBOL_GPL(rds_stats); /* :.,$s/unsigned long\>.*\<s_\(.*\);/"\1",/g */ static const char *const rds_stat_names[] = { "conn_reset", "recv_drop_bad_checksum", "recv_drop_old_seq", "recv_drop_no_sock", "recv_drop_dead_sock", "recv_deliver_raced", "recv_delivered", "recv_queued", "recv_immediate_retry", "recv_delayed_retry", "recv_ack_required", "recv_rdma_bytes", "recv_ping", "send_queue_empty", "send_queue_full", "send_lock_contention", "send_lock_queue_raced", "send_immediate_retry", "send_delayed_retry", "send_drop_acked", "send_ack_required", "send_queued", "send_rdma", "send_rdma_bytes", "send_pong", "page_remainder_hit", "page_remainder_miss", "copy_to_user", "copy_from_user", "cong_update_queued", "cong_update_received", "cong_send_error", "cong_send_blocked", "recv_bytes_added_to_sock", "recv_bytes_freed_fromsock", "send_stuck_rm", }; void rds_stats_info_copy(struct rds_info_iterator *iter, uint64_t *values, const char *const *names, size_t nr) { struct rds_info_counter ctr; size_t i; for (i = 0; i < nr; i++) { BUG_ON(strlen(names[i]) >= sizeof(ctr.name)); strncpy(ctr.name, names[i], sizeof(ctr.name) - 1); ctr.name[sizeof(ctr.name) - 1] = '\0'; ctr.value = values[i]; rds_info_copy(iter, &ctr, sizeof(ctr)); } } EXPORT_SYMBOL_GPL(rds_stats_info_copy); /* * This gives global counters across all the transports. The strings * are copied in so that the tool doesn't need knowledge of the specific * stats that we're exporting. Some are pretty implementation dependent * and may change over time. That doesn't stop them from being useful. * * This is the only function in the chain that knows about the byte granular * length in userspace. It converts it to number of stat entries that the * rest of the functions operate in. */ static void rds_stats_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { struct rds_statistics stats = {0, }; uint64_t *src; uint64_t *sum; size_t i; int cpu; unsigned int avail; avail = len / sizeof(struct rds_info_counter); if (avail < ARRAY_SIZE(rds_stat_names)) { avail = 0; goto trans; } for_each_online_cpu(cpu) { src = (uint64_t *)&(per_cpu(rds_stats, cpu)); sum = (uint64_t *)&stats; for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++) *(sum++) += *(src++); } rds_stats_info_copy(iter, (uint64_t *)&stats, rds_stat_names, ARRAY_SIZE(rds_stat_names)); avail -= ARRAY_SIZE(rds_stat_names); trans: lens->each = sizeof(struct rds_info_counter); lens->nr = rds_trans_stats_info_copy(iter, avail) + ARRAY_SIZE(rds_stat_names); } void rds_stats_exit(void) { rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info); } int rds_stats_init(void) { rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info); return 0; }
16 11 11 5 1 1 1 1 1 1 1 7 11 5 21 4 4 2 1 1 1 1 2 1 2 4 4 3 3 4 4 3 2 5 5 4 4 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2016 Mellanox Technologies. All rights reserved. * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ #include <net/genetlink.h> #include <net/sock.h> #include <trace/events/devlink.h> #include "devl_internal.h" struct devlink_fmsg_item { struct list_head list; int attrtype; u8 nla_type; u16 len; int value[]; }; struct devlink_fmsg { struct list_head item_list; int err; /* first error encountered on some devlink_fmsg_XXX() call */ bool putting_binary; /* This flag forces enclosing of binary data * in an array brackets. It forces using * of designated API: * devlink_fmsg_binary_pair_nest_start() * devlink_fmsg_binary_pair_nest_end() */ }; static struct devlink_fmsg *devlink_fmsg_alloc(void) { struct devlink_fmsg *fmsg; fmsg = kzalloc(sizeof(*fmsg), GFP_KERNEL); if (!fmsg) return NULL; INIT_LIST_HEAD(&fmsg->item_list); return fmsg; } static void devlink_fmsg_free(struct devlink_fmsg *fmsg) { struct devlink_fmsg_item *item, *tmp; list_for_each_entry_safe(item, tmp, &fmsg->item_list, list) { list_del(&item->list); kfree(item); } kfree(fmsg); } struct devlink_health_reporter { struct list_head list; void *priv; const struct devlink_health_reporter_ops *ops; struct devlink *devlink; struct devlink_port *devlink_port; struct devlink_fmsg *dump_fmsg; u64 graceful_period; bool auto_recover; bool auto_dump; u8 health_state; u64 dump_ts; u64 dump_real_ts; u64 error_count; u64 recovery_count; u64 last_recovery_ts; }; void * devlink_health_reporter_priv(struct devlink_health_reporter *reporter) { return reporter->priv; } EXPORT_SYMBOL_GPL(devlink_health_reporter_priv); static struct devlink_health_reporter * __devlink_health_reporter_find_by_name(struct list_head *reporter_list, const char *reporter_name) { struct devlink_health_reporter *reporter; list_for_each_entry(reporter, reporter_list, list) if (!strcmp(reporter->ops->name, reporter_name)) return reporter; return NULL; } static struct devlink_health_reporter * devlink_health_reporter_find_by_name(struct devlink *devlink, const char *reporter_name) { return __devlink_health_reporter_find_by_name(&devlink->reporter_list, reporter_name); } static struct devlink_health_reporter * devlink_port_health_reporter_find_by_name(struct devlink_port *devlink_port, const char *reporter_name) { return __devlink_health_reporter_find_by_name(&devlink_port->reporter_list, reporter_name); } static struct devlink_health_reporter * __devlink_health_reporter_create(struct devlink *devlink, const struct devlink_health_reporter_ops *ops, u64 graceful_period, void *priv) { struct devlink_health_reporter *reporter; if (WARN_ON(graceful_period && !ops->recover)) return ERR_PTR(-EINVAL); reporter = kzalloc(sizeof(*reporter), GFP_KERNEL); if (!reporter) return ERR_PTR(-ENOMEM); reporter->priv = priv; reporter->ops = ops; reporter->devlink = devlink; reporter->graceful_period = graceful_period; reporter->auto_recover = !!ops->recover; reporter->auto_dump = !!ops->dump; return reporter; } /** * devl_port_health_reporter_create() - create devlink health reporter for * specified port instance * * @port: devlink_port to which health reports will relate * @ops: devlink health reporter ops * @graceful_period: min time (in msec) between recovery attempts * @priv: driver priv pointer */ struct devlink_health_reporter * devl_port_health_reporter_create(struct devlink_port *port, const struct devlink_health_reporter_ops *ops, u64 graceful_period, void *priv) { struct devlink_health_reporter *reporter; devl_assert_locked(port->devlink); if (__devlink_health_reporter_find_by_name(&port->reporter_list, ops->name)) return ERR_PTR(-EEXIST); reporter = __devlink_health_reporter_create(port->devlink, ops, graceful_period, priv); if (IS_ERR(reporter)) return reporter; reporter->devlink_port = port; list_add_tail(&reporter->list, &port->reporter_list); return reporter; } EXPORT_SYMBOL_GPL(devl_port_health_reporter_create); struct devlink_health_reporter * devlink_port_health_reporter_create(struct devlink_port *port, const struct devlink_health_reporter_ops *ops, u64 graceful_period, void *priv) { struct devlink_health_reporter *reporter; struct devlink *devlink = port->devlink; devl_lock(devlink); reporter = devl_port_health_reporter_create(port, ops, graceful_period, priv); devl_unlock(devlink); return reporter; } EXPORT_SYMBOL_GPL(devlink_port_health_reporter_create); /** * devl_health_reporter_create - create devlink health reporter * * @devlink: devlink instance which the health reports will relate * @ops: devlink health reporter ops * @graceful_period: min time (in msec) between recovery attempts * @priv: driver priv pointer */ struct devlink_health_reporter * devl_health_reporter_create(struct devlink *devlink, const struct devlink_health_reporter_ops *ops, u64 graceful_period, void *priv) { struct devlink_health_reporter *reporter; devl_assert_locked(devlink); if (devlink_health_reporter_find_by_name(devlink, ops->name)) return ERR_PTR(-EEXIST); reporter = __devlink_health_reporter_create(devlink, ops, graceful_period, priv); if (IS_ERR(reporter)) return reporter; list_add_tail(&reporter->list, &devlink->reporter_list); return reporter; } EXPORT_SYMBOL_GPL(devl_health_reporter_create); struct devlink_health_reporter * devlink_health_reporter_create(struct devlink *devlink, const struct devlink_health_reporter_ops *ops, u64 graceful_period, void *priv) { struct devlink_health_reporter *reporter; devl_lock(devlink); reporter = devl_health_reporter_create(devlink, ops, graceful_period, priv); devl_unlock(devlink); return reporter; } EXPORT_SYMBOL_GPL(devlink_health_reporter_create); static void devlink_health_reporter_free(struct devlink_health_reporter *reporter) { if (reporter->dump_fmsg) devlink_fmsg_free(reporter->dump_fmsg); kfree(reporter); } /** * devl_health_reporter_destroy() - destroy devlink health reporter * * @reporter: devlink health reporter to destroy */ void devl_health_reporter_destroy(struct devlink_health_reporter *reporter) { devl_assert_locked(reporter->devlink); list_del(&reporter->list); devlink_health_reporter_free(reporter); } EXPORT_SYMBOL_GPL(devl_health_reporter_destroy); void devlink_health_reporter_destroy(struct devlink_health_reporter *reporter) { struct devlink *devlink = reporter->devlink; devl_lock(devlink); devl_health_reporter_destroy(reporter); devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_health_reporter_destroy); static int devlink_nl_health_reporter_fill(struct sk_buff *msg, struct devlink_health_reporter *reporter, enum devlink_command cmd, u32 portid, u32 seq, int flags) { struct devlink *devlink = reporter->devlink; struct nlattr *reporter_attr; void *hdr; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (devlink_nl_put_handle(msg, devlink)) goto genlmsg_cancel; if (reporter->devlink_port) { if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, reporter->devlink_port->index)) goto genlmsg_cancel; } reporter_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_HEALTH_REPORTER); if (!reporter_attr) goto genlmsg_cancel; if (nla_put_string(msg, DEVLINK_ATTR_HEALTH_REPORTER_NAME, reporter->ops->name)) goto reporter_nest_cancel; if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_STATE, reporter->health_state)) goto reporter_nest_cancel; if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_ERR_COUNT, reporter->error_count, DEVLINK_ATTR_PAD)) goto reporter_nest_cancel; if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_RECOVER_COUNT, reporter->recovery_count, DEVLINK_ATTR_PAD)) goto reporter_nest_cancel; if (reporter->ops->recover && nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD, reporter->graceful_period, DEVLINK_ATTR_PAD)) goto reporter_nest_cancel; if (reporter->ops->recover && nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER, reporter->auto_recover)) goto reporter_nest_cancel; if (reporter->dump_fmsg && nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS, jiffies_to_msecs(reporter->dump_ts), DEVLINK_ATTR_PAD)) goto reporter_nest_cancel; if (reporter->dump_fmsg && nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS_NS, reporter->dump_real_ts, DEVLINK_ATTR_PAD)) goto reporter_nest_cancel; if (reporter->ops->dump && nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP, reporter->auto_dump)) goto reporter_nest_cancel; nla_nest_end(msg, reporter_attr); genlmsg_end(msg, hdr); return 0; reporter_nest_cancel: nla_nest_cancel(msg, reporter_attr); genlmsg_cancel: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static struct devlink_health_reporter * devlink_health_reporter_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) { struct devlink_port *devlink_port; char *reporter_name; if (!attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]) return NULL; reporter_name = nla_data(attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]); devlink_port = devlink_port_get_from_attrs(devlink, attrs); if (IS_ERR(devlink_port)) return devlink_health_reporter_find_by_name(devlink, reporter_name); else return devlink_port_health_reporter_find_by_name(devlink_port, reporter_name); } static struct devlink_health_reporter * devlink_health_reporter_get_from_info(struct devlink *devlink, struct genl_info *info) { return devlink_health_reporter_get_from_attrs(devlink, info->attrs); } int devlink_nl_health_reporter_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_health_reporter *reporter; struct sk_buff *msg; int err; reporter = devlink_health_reporter_get_from_info(devlink, info); if (!reporter) return -EINVAL; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_nl_health_reporter_fill(msg, reporter, DEVLINK_CMD_HEALTH_REPORTER_GET, info->snd_portid, info->snd_seq, 0); if (err) { nlmsg_free(msg); return err; } return genlmsg_reply(msg, info); } static int devlink_nl_health_reporter_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); const struct genl_info *info = genl_info_dump(cb); struct devlink_health_reporter *reporter; unsigned long port_index_end = ULONG_MAX; struct nlattr **attrs = info->attrs; unsigned long port_index_start = 0; struct devlink_port *port; unsigned long port_index; int idx = 0; int err; if (attrs && attrs[DEVLINK_ATTR_PORT_INDEX]) { port_index_start = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]); port_index_end = port_index_start; flags |= NLM_F_DUMP_FILTERED; goto per_port_dump; } list_for_each_entry(reporter, &devlink->reporter_list, list) { if (idx < state->idx) { idx++; continue; } err = devlink_nl_health_reporter_fill(msg, reporter, DEVLINK_CMD_HEALTH_REPORTER_GET, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags); if (err) { state->idx = idx; return err; } idx++; } per_port_dump: xa_for_each_range(&devlink->ports, port_index, port, port_index_start, port_index_end) { list_for_each_entry(reporter, &port->reporter_list, list) { if (idx < state->idx) { idx++; continue; } err = devlink_nl_health_reporter_fill(msg, reporter, DEVLINK_CMD_HEALTH_REPORTER_GET, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags); if (err) { state->idx = idx; return err; } idx++; } } return 0; } int devlink_nl_health_reporter_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return devlink_nl_dumpit(skb, cb, devlink_nl_health_reporter_get_dump_one); } int devlink_nl_health_reporter_set_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_health_reporter *reporter; reporter = devlink_health_reporter_get_from_info(devlink, info); if (!reporter) return -EINVAL; if (!reporter->ops->recover && (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] || info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])) return -EOPNOTSUPP; if (!reporter->ops->dump && info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) return -EOPNOTSUPP; if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) reporter->graceful_period = nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]); if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]) reporter->auto_recover = nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]); if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) reporter->auto_dump = nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]); return 0; } static void devlink_recover_notify(struct devlink_health_reporter *reporter, enum devlink_command cmd) { struct devlink *devlink = reporter->devlink; struct devlink_obj_desc desc; struct sk_buff *msg; int err; WARN_ON(cmd != DEVLINK_CMD_HEALTH_REPORTER_RECOVER); ASSERT_DEVLINK_REGISTERED(devlink); if (!devlink_nl_notify_need(devlink)) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; err = devlink_nl_health_reporter_fill(msg, reporter, cmd, 0, 0, 0); if (err) { nlmsg_free(msg); return; } devlink_nl_obj_desc_init(&desc, devlink); if (reporter->devlink_port) devlink_nl_obj_desc_port_set(&desc, reporter->devlink_port); devlink_nl_notify_send_desc(devlink, msg, &desc); } void devlink_health_reporter_recovery_done(struct devlink_health_reporter *reporter) { reporter->recovery_count++; reporter->last_recovery_ts = jiffies; } EXPORT_SYMBOL_GPL(devlink_health_reporter_recovery_done); static int devlink_health_reporter_recover(struct devlink_health_reporter *reporter, void *priv_ctx, struct netlink_ext_ack *extack) { int err; if (reporter->health_state == DEVLINK_HEALTH_REPORTER_STATE_HEALTHY) return 0; if (!reporter->ops->recover) return -EOPNOTSUPP; err = reporter->ops->recover(reporter, priv_ctx, extack); if (err) return err; devlink_health_reporter_recovery_done(reporter); reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_HEALTHY; devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER); return 0; } static void devlink_health_dump_clear(struct devlink_health_reporter *reporter) { if (!reporter->dump_fmsg) return; devlink_fmsg_free(reporter->dump_fmsg); reporter->dump_fmsg = NULL; } static int devlink_health_do_dump(struct devlink_health_reporter *reporter, void *priv_ctx, struct netlink_ext_ack *extack) { int err; if (!reporter->ops->dump) return 0; if (reporter->dump_fmsg) return 0; reporter->dump_fmsg = devlink_fmsg_alloc(); if (!reporter->dump_fmsg) return -ENOMEM; devlink_fmsg_obj_nest_start(reporter->dump_fmsg); err = reporter->ops->dump(reporter, reporter->dump_fmsg, priv_ctx, extack); if (err) goto dump_err; devlink_fmsg_obj_nest_end(reporter->dump_fmsg); err = reporter->dump_fmsg->err; if (err) goto dump_err; reporter->dump_ts = jiffies; reporter->dump_real_ts = ktime_get_real_ns(); return 0; dump_err: devlink_health_dump_clear(reporter); return err; } int devlink_health_report(struct devlink_health_reporter *reporter, const char *msg, void *priv_ctx) { enum devlink_health_reporter_state prev_health_state; struct devlink *devlink = reporter->devlink; unsigned long recover_ts_threshold; int ret; /* write a log message of the current error */ WARN_ON(!msg); trace_devlink_health_report(devlink, reporter->ops->name, msg); reporter->error_count++; prev_health_state = reporter->health_state; reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR; devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER); /* abort if the previous error wasn't recovered */ recover_ts_threshold = reporter->last_recovery_ts + msecs_to_jiffies(reporter->graceful_period); if (reporter->auto_recover && (prev_health_state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY || (reporter->last_recovery_ts && reporter->recovery_count && time_is_after_jiffies(recover_ts_threshold)))) { trace_devlink_health_recover_aborted(devlink, reporter->ops->name, reporter->health_state, jiffies - reporter->last_recovery_ts); return -ECANCELED; } if (reporter->auto_dump) { devl_lock(devlink); /* store current dump of current error, for later analysis */ devlink_health_do_dump(reporter, priv_ctx, NULL); devl_unlock(devlink); } if (!reporter->auto_recover) return 0; devl_lock(devlink); ret = devlink_health_reporter_recover(reporter, priv_ctx, NULL); devl_unlock(devlink); return ret; } EXPORT_SYMBOL_GPL(devlink_health_report); void devlink_health_reporter_state_update(struct devlink_health_reporter *reporter, enum devlink_health_reporter_state state) { if (WARN_ON(state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY && state != DEVLINK_HEALTH_REPORTER_STATE_ERROR)) return; if (reporter->health_state == state) return; reporter->health_state = state; trace_devlink_health_reporter_state_update(reporter->devlink, reporter->ops->name, state); devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER); } EXPORT_SYMBOL_GPL(devlink_health_reporter_state_update); int devlink_nl_health_reporter_recover_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_health_reporter *reporter; reporter = devlink_health_reporter_get_from_info(devlink, info); if (!reporter) return -EINVAL; return devlink_health_reporter_recover(reporter, NULL, info->extack); } static void devlink_fmsg_err_if_binary(struct devlink_fmsg *fmsg) { if (!fmsg->err && fmsg->putting_binary) fmsg->err = -EINVAL; } static void devlink_fmsg_nest_common(struct devlink_fmsg *fmsg, int attrtype) { struct devlink_fmsg_item *item; if (fmsg->err) return; item = kzalloc(sizeof(*item), GFP_KERNEL); if (!item) { fmsg->err = -ENOMEM; return; } item->attrtype = attrtype; list_add_tail(&item->list, &fmsg->item_list); } void devlink_fmsg_obj_nest_start(struct devlink_fmsg *fmsg) { devlink_fmsg_err_if_binary(fmsg); devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_OBJ_NEST_START); } EXPORT_SYMBOL_GPL(devlink_fmsg_obj_nest_start); static void devlink_fmsg_nest_end(struct devlink_fmsg *fmsg) { devlink_fmsg_err_if_binary(fmsg); devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_NEST_END); } void devlink_fmsg_obj_nest_end(struct devlink_fmsg *fmsg) { devlink_fmsg_nest_end(fmsg); } EXPORT_SYMBOL_GPL(devlink_fmsg_obj_nest_end); #define DEVLINK_FMSG_MAX_SIZE (GENLMSG_DEFAULT_SIZE - GENL_HDRLEN - NLA_HDRLEN) static void devlink_fmsg_put_name(struct devlink_fmsg *fmsg, const char *name) { struct devlink_fmsg_item *item; devlink_fmsg_err_if_binary(fmsg); if (fmsg->err) return; if (strlen(name) + 1 > DEVLINK_FMSG_MAX_SIZE) { fmsg->err = -EMSGSIZE; return; } item = kzalloc(sizeof(*item) + strlen(name) + 1, GFP_KERNEL); if (!item) { fmsg->err = -ENOMEM; return; } item->nla_type = NLA_NUL_STRING; item->len = strlen(name) + 1; item->attrtype = DEVLINK_ATTR_FMSG_OBJ_NAME; memcpy(&item->value, name, item->len); list_add_tail(&item->list, &fmsg->item_list); } void devlink_fmsg_pair_nest_start(struct devlink_fmsg *fmsg, const char *name) { devlink_fmsg_err_if_binary(fmsg); devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_PAIR_NEST_START); devlink_fmsg_put_name(fmsg, name); } EXPORT_SYMBOL_GPL(devlink_fmsg_pair_nest_start); void devlink_fmsg_pair_nest_end(struct devlink_fmsg *fmsg) { devlink_fmsg_nest_end(fmsg); } EXPORT_SYMBOL_GPL(devlink_fmsg_pair_nest_end); void devlink_fmsg_arr_pair_nest_start(struct devlink_fmsg *fmsg, const char *name) { devlink_fmsg_pair_nest_start(fmsg, name); devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_ARR_NEST_START); } EXPORT_SYMBOL_GPL(devlink_fmsg_arr_pair_nest_start); void devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg) { devlink_fmsg_nest_end(fmsg); devlink_fmsg_nest_end(fmsg); } EXPORT_SYMBOL_GPL(devlink_fmsg_arr_pair_nest_end); void devlink_fmsg_binary_pair_nest_start(struct devlink_fmsg *fmsg, const char *name) { devlink_fmsg_arr_pair_nest_start(fmsg, name); fmsg->putting_binary = true; } EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_nest_start); void devlink_fmsg_binary_pair_nest_end(struct devlink_fmsg *fmsg) { if (fmsg->err) return; if (!fmsg->putting_binary) fmsg->err = -EINVAL; fmsg->putting_binary = false; devlink_fmsg_arr_pair_nest_end(fmsg); } EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_nest_end); static void devlink_fmsg_put_value(struct devlink_fmsg *fmsg, const void *value, u16 value_len, u8 value_nla_type) { struct devlink_fmsg_item *item; if (fmsg->err) return; if (value_len > DEVLINK_FMSG_MAX_SIZE) { fmsg->err = -EMSGSIZE; return; } item = kzalloc(sizeof(*item) + value_len, GFP_KERNEL); if (!item) { fmsg->err = -ENOMEM; return; } item->nla_type = value_nla_type; item->len = value_len; item->attrtype = DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA; memcpy(&item->value, value, item->len); list_add_tail(&item->list, &fmsg->item_list); } static void devlink_fmsg_bool_put(struct devlink_fmsg *fmsg, bool value) { devlink_fmsg_err_if_binary(fmsg); devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_FLAG); } static void devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value) { devlink_fmsg_err_if_binary(fmsg); devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U8); } void devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value) { devlink_fmsg_err_if_binary(fmsg); devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U32); } EXPORT_SYMBOL_GPL(devlink_fmsg_u32_put); static void devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value) { devlink_fmsg_err_if_binary(fmsg); devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U64); } void devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value) { devlink_fmsg_err_if_binary(fmsg); devlink_fmsg_put_value(fmsg, value, strlen(value) + 1, NLA_NUL_STRING); } EXPORT_SYMBOL_GPL(devlink_fmsg_string_put); void devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value, u16 value_len) { if (!fmsg->err && !fmsg->putting_binary) fmsg->err = -EINVAL; devlink_fmsg_put_value(fmsg, value, value_len, NLA_BINARY); } EXPORT_SYMBOL_GPL(devlink_fmsg_binary_put); void devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name, bool value) { devlink_fmsg_pair_nest_start(fmsg, name); devlink_fmsg_bool_put(fmsg, value); devlink_fmsg_pair_nest_end(fmsg); } EXPORT_SYMBOL_GPL(devlink_fmsg_bool_pair_put); void devlink_fmsg_u8_pair_put(struct devlink_fmsg *fmsg, const char *name, u8 value) { devlink_fmsg_pair_nest_start(fmsg, name); devlink_fmsg_u8_put(fmsg, value); devlink_fmsg_pair_nest_end(fmsg); } EXPORT_SYMBOL_GPL(devlink_fmsg_u8_pair_put); void devlink_fmsg_u32_pair_put(struct devlink_fmsg *fmsg, const char *name, u32 value) { devlink_fmsg_pair_nest_start(fmsg, name); devlink_fmsg_u32_put(fmsg, value); devlink_fmsg_pair_nest_end(fmsg); } EXPORT_SYMBOL_GPL(devlink_fmsg_u32_pair_put); void devlink_fmsg_u64_pair_put(struct devlink_fmsg *fmsg, const char *name, u64 value) { devlink_fmsg_pair_nest_start(fmsg, name); devlink_fmsg_u64_put(fmsg, value); devlink_fmsg_pair_nest_end(fmsg); } EXPORT_SYMBOL_GPL(devlink_fmsg_u64_pair_put); void devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name, const char *value) { devlink_fmsg_pair_nest_start(fmsg, name); devlink_fmsg_string_put(fmsg, value); devlink_fmsg_pair_nest_end(fmsg); } EXPORT_SYMBOL_GPL(devlink_fmsg_string_pair_put); void devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name, const void *value, u32 value_len) { u32 data_size; u32 offset; devlink_fmsg_binary_pair_nest_start(fmsg, name); for (offset = 0; offset < value_len; offset += data_size) { data_size = value_len - offset; if (data_size > DEVLINK_FMSG_MAX_SIZE) data_size = DEVLINK_FMSG_MAX_SIZE; devlink_fmsg_binary_put(fmsg, value + offset, data_size); } devlink_fmsg_binary_pair_nest_end(fmsg); fmsg->putting_binary = false; } EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_put); static int devlink_fmsg_item_fill_type(struct devlink_fmsg_item *msg, struct sk_buff *skb) { switch (msg->nla_type) { case NLA_FLAG: case NLA_U8: case NLA_U32: case NLA_U64: case NLA_NUL_STRING: case NLA_BINARY: return nla_put_u8(skb, DEVLINK_ATTR_FMSG_OBJ_VALUE_TYPE, msg->nla_type); default: return -EINVAL; } } static int devlink_fmsg_item_fill_data(struct devlink_fmsg_item *msg, struct sk_buff *skb) { int attrtype = DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA; u8 tmp; switch (msg->nla_type) { case NLA_FLAG: /* Always provide flag data, regardless of its value */ tmp = *(bool *)msg->value; return nla_put_u8(skb, attrtype, tmp); case NLA_U8: return nla_put_u8(skb, attrtype, *(u8 *)msg->value); case NLA_U32: return nla_put_u32(skb, attrtype, *(u32 *)msg->value); case NLA_U64: return nla_put_u64_64bit(skb, attrtype, *(u64 *)msg->value, DEVLINK_ATTR_PAD); case NLA_NUL_STRING: return nla_put_string(skb, attrtype, (char *)&msg->value); case NLA_BINARY: return nla_put(skb, attrtype, msg->len, (void *)&msg->value); default: return -EINVAL; } } static int devlink_fmsg_prepare_skb(struct devlink_fmsg *fmsg, struct sk_buff *skb, int *start) { struct devlink_fmsg_item *item; struct nlattr *fmsg_nlattr; int err = 0; int i = 0; fmsg_nlattr = nla_nest_start_noflag(skb, DEVLINK_ATTR_FMSG); if (!fmsg_nlattr) return -EMSGSIZE; list_for_each_entry(item, &fmsg->item_list, list) { if (i < *start) { i++; continue; } switch (item->attrtype) { case DEVLINK_ATTR_FMSG_OBJ_NEST_START: case DEVLINK_ATTR_FMSG_PAIR_NEST_START: case DEVLINK_ATTR_FMSG_ARR_NEST_START: case DEVLINK_ATTR_FMSG_NEST_END: err = nla_put_flag(skb, item->attrtype); break; case DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA: err = devlink_fmsg_item_fill_type(item, skb); if (err) break; err = devlink_fmsg_item_fill_data(item, skb); break; case DEVLINK_ATTR_FMSG_OBJ_NAME: err = nla_put_string(skb, item->attrtype, (char *)&item->value); break; default: err = -EINVAL; break; } if (!err) *start = ++i; else break; } nla_nest_end(skb, fmsg_nlattr); return err; } static int devlink_fmsg_snd(struct devlink_fmsg *fmsg, struct genl_info *info, enum devlink_command cmd, int flags) { struct nlmsghdr *nlh; struct sk_buff *skb; bool last = false; int index = 0; void *hdr; int err; if (fmsg->err) return fmsg->err; while (!last) { int tmp_index = index; skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return -ENOMEM; hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, &devlink_nl_family, flags | NLM_F_MULTI, cmd); if (!hdr) { err = -EMSGSIZE; goto nla_put_failure; } err = devlink_fmsg_prepare_skb(fmsg, skb, &index); if (!err) last = true; else if (err != -EMSGSIZE || tmp_index == index) goto nla_put_failure; genlmsg_end(skb, hdr); err = genlmsg_reply(skb, info); if (err) return err; } skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return -ENOMEM; nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, NLMSG_DONE, 0, flags | NLM_F_MULTI); if (!nlh) { err = -EMSGSIZE; goto nla_put_failure; } return genlmsg_reply(skb, info); nla_put_failure: nlmsg_free(skb); return err; } static int devlink_fmsg_dumpit(struct devlink_fmsg *fmsg, struct sk_buff *skb, struct netlink_callback *cb, enum devlink_command cmd) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); int index = state->idx; int tmp_index = index; void *hdr; int err; if (fmsg->err) return fmsg->err; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &devlink_nl_family, NLM_F_ACK | NLM_F_MULTI, cmd); if (!hdr) { err = -EMSGSIZE; goto nla_put_failure; } err = devlink_fmsg_prepare_skb(fmsg, skb, &index); if ((err && err != -EMSGSIZE) || tmp_index == index) goto nla_put_failure; state->idx = index; genlmsg_end(skb, hdr); return skb->len; nla_put_failure: genlmsg_cancel(skb, hdr); return err; } int devlink_nl_health_reporter_diagnose_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_health_reporter *reporter; struct devlink_fmsg *fmsg; int err; reporter = devlink_health_reporter_get_from_info(devlink, info); if (!reporter) return -EINVAL; if (!reporter->ops->diagnose) return -EOPNOTSUPP; fmsg = devlink_fmsg_alloc(); if (!fmsg) return -ENOMEM; devlink_fmsg_obj_nest_start(fmsg); err = reporter->ops->diagnose(reporter, fmsg, info->extack); if (err) goto out; devlink_fmsg_obj_nest_end(fmsg); err = devlink_fmsg_snd(fmsg, info, DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, 0); out: devlink_fmsg_free(fmsg); return err; } static struct devlink_health_reporter * devlink_health_reporter_get_from_cb_lock(struct netlink_callback *cb) { const struct genl_info *info = genl_info_dump(cb); struct devlink_health_reporter *reporter; struct nlattr **attrs = info->attrs; struct devlink *devlink; devlink = devlink_get_from_attrs_lock(sock_net(cb->skb->sk), attrs, false); if (IS_ERR(devlink)) return NULL; reporter = devlink_health_reporter_get_from_attrs(devlink, attrs); if (!reporter) { devl_unlock(devlink); devlink_put(devlink); } return reporter; } int devlink_nl_health_reporter_dump_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_health_reporter *reporter; struct devlink *devlink; int err; reporter = devlink_health_reporter_get_from_cb_lock(cb); if (!reporter) return -EINVAL; devlink = reporter->devlink; if (!reporter->ops->dump) { devl_unlock(devlink); devlink_put(devlink); return -EOPNOTSUPP; } if (!state->idx) { err = devlink_health_do_dump(reporter, NULL, cb->extack); if (err) goto unlock; state->dump_ts = reporter->dump_ts; } if (!reporter->dump_fmsg || state->dump_ts != reporter->dump_ts) { NL_SET_ERR_MSG(cb->extack, "Dump trampled, please retry"); err = -EAGAIN; goto unlock; } err = devlink_fmsg_dumpit(reporter->dump_fmsg, skb, cb, DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET); unlock: devl_unlock(devlink); devlink_put(devlink); return err; } int devlink_nl_health_reporter_dump_clear_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_health_reporter *reporter; reporter = devlink_health_reporter_get_from_info(devlink, info); if (!reporter) return -EINVAL; if (!reporter->ops->dump) return -EOPNOTSUPP; devlink_health_dump_clear(reporter); return 0; } int devlink_nl_health_reporter_test_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_health_reporter *reporter; reporter = devlink_health_reporter_get_from_info(devlink, info); if (!reporter) return -EINVAL; if (!reporter->ops->test) return -EOPNOTSUPP; return reporter->ops->test(reporter, info->extack); }
2 2 2 3 3 3 1 1 8 2 6 5 1 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/nospec.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" #include "rsrc.h" #include "filetable.h" #include "msg_ring.h" /* All valid masks for MSG_RING */ #define IORING_MSG_RING_MASK (IORING_MSG_RING_CQE_SKIP | \ IORING_MSG_RING_FLAGS_PASS) struct io_msg { struct file *file; struct file *src_file; struct callback_head tw; u64 user_data; u32 len; u32 cmd; u32 src_fd; union { u32 dst_fd; u32 cqe_flags; }; u32 flags; }; static void io_double_unlock_ctx(struct io_ring_ctx *octx) { mutex_unlock(&octx->uring_lock); } static int io_double_lock_ctx(struct io_ring_ctx *octx, unsigned int issue_flags) { /* * To ensure proper ordering between the two ctxs, we can only * attempt a trylock on the target. If that fails and we already have * the source ctx lock, punt to io-wq. */ if (!(issue_flags & IO_URING_F_UNLOCKED)) { if (!mutex_trylock(&octx->uring_lock)) return -EAGAIN; return 0; } mutex_lock(&octx->uring_lock); return 0; } void io_msg_ring_cleanup(struct io_kiocb *req) { struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); if (WARN_ON_ONCE(!msg->src_file)) return; fput(msg->src_file); msg->src_file = NULL; } static inline bool io_msg_need_remote(struct io_ring_ctx *target_ctx) { if (!target_ctx->task_complete) return false; return current != target_ctx->submitter_task; } static int io_msg_exec_remote(struct io_kiocb *req, task_work_func_t func) { struct io_ring_ctx *ctx = req->file->private_data; struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); struct task_struct *task = READ_ONCE(ctx->submitter_task); if (unlikely(!task)) return -EOWNERDEAD; init_task_work(&msg->tw, func); if (task_work_add(ctx->submitter_task, &msg->tw, TWA_SIGNAL)) return -EOWNERDEAD; return IOU_ISSUE_SKIP_COMPLETE; } static void io_msg_tw_complete(struct callback_head *head) { struct io_msg *msg = container_of(head, struct io_msg, tw); struct io_kiocb *req = cmd_to_io_kiocb(msg); struct io_ring_ctx *target_ctx = req->file->private_data; int ret = 0; if (current->flags & PF_EXITING) { ret = -EOWNERDEAD; } else { u32 flags = 0; if (msg->flags & IORING_MSG_RING_FLAGS_PASS) flags = msg->cqe_flags; /* * If the target ring is using IOPOLL mode, then we need to be * holding the uring_lock for posting completions. Other ring * types rely on the regular completion locking, which is * handled while posting. */ if (target_ctx->flags & IORING_SETUP_IOPOLL) mutex_lock(&target_ctx->uring_lock); if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) ret = -EOVERFLOW; if (target_ctx->flags & IORING_SETUP_IOPOLL) mutex_unlock(&target_ctx->uring_lock); } if (ret < 0) req_set_fail(req); io_req_queue_tw_complete(req, ret); } static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *target_ctx = req->file->private_data; struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); u32 flags = 0; int ret; if (msg->src_fd || msg->flags & ~IORING_MSG_RING_FLAGS_PASS) return -EINVAL; if (!(msg->flags & IORING_MSG_RING_FLAGS_PASS) && msg->dst_fd) return -EINVAL; if (target_ctx->flags & IORING_SETUP_R_DISABLED) return -EBADFD; if (io_msg_need_remote(target_ctx)) return io_msg_exec_remote(req, io_msg_tw_complete); if (msg->flags & IORING_MSG_RING_FLAGS_PASS) flags = msg->cqe_flags; ret = -EOVERFLOW; if (target_ctx->flags & IORING_SETUP_IOPOLL) { if (unlikely(io_double_lock_ctx(target_ctx, issue_flags))) return -EAGAIN; if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) ret = 0; io_double_unlock_ctx(target_ctx); } else { if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) ret = 0; } return ret; } static struct file *io_msg_grab_file(struct io_kiocb *req, unsigned int issue_flags) { struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); struct io_ring_ctx *ctx = req->ctx; struct file *file = NULL; int idx = msg->src_fd; io_ring_submit_lock(ctx, issue_flags); if (likely(idx < ctx->nr_user_files)) { idx = array_index_nospec(idx, ctx->nr_user_files); file = io_file_from_index(&ctx->file_table, idx); if (file) get_file(file); } io_ring_submit_unlock(ctx, issue_flags); return file; } static int io_msg_install_complete(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *target_ctx = req->file->private_data; struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); struct file *src_file = msg->src_file; int ret; if (unlikely(io_double_lock_ctx(target_ctx, issue_flags))) return -EAGAIN; ret = __io_fixed_fd_install(target_ctx, src_file, msg->dst_fd); if (ret < 0) goto out_unlock; msg->src_file = NULL; req->flags &= ~REQ_F_NEED_CLEANUP; if (msg->flags & IORING_MSG_RING_CQE_SKIP) goto out_unlock; /* * If this fails, the target still received the file descriptor but * wasn't notified of the fact. This means that if this request * completes with -EOVERFLOW, then the sender must ensure that a * later IORING_OP_MSG_RING delivers the message. */ if (!io_post_aux_cqe(target_ctx, msg->user_data, ret, 0)) ret = -EOVERFLOW; out_unlock: io_double_unlock_ctx(target_ctx); return ret; } static void io_msg_tw_fd_complete(struct callback_head *head) { struct io_msg *msg = container_of(head, struct io_msg, tw); struct io_kiocb *req = cmd_to_io_kiocb(msg); int ret = -EOWNERDEAD; if (!(current->flags & PF_EXITING)) ret = io_msg_install_complete(req, IO_URING_F_UNLOCKED); if (ret < 0) req_set_fail(req); io_req_queue_tw_complete(req, ret); } static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *target_ctx = req->file->private_data; struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); struct io_ring_ctx *ctx = req->ctx; struct file *src_file = msg->src_file; if (msg->len) return -EINVAL; if (target_ctx == ctx) return -EINVAL; if (target_ctx->flags & IORING_SETUP_R_DISABLED) return -EBADFD; if (!src_file) { src_file = io_msg_grab_file(req, issue_flags); if (!src_file) return -EBADF; msg->src_file = src_file; req->flags |= REQ_F_NEED_CLEANUP; } if (io_msg_need_remote(target_ctx)) return io_msg_exec_remote(req, io_msg_tw_fd_complete); return io_msg_install_complete(req, issue_flags); } int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); if (unlikely(sqe->buf_index || sqe->personality)) return -EINVAL; msg->src_file = NULL; msg->user_data = READ_ONCE(sqe->off); msg->len = READ_ONCE(sqe->len); msg->cmd = READ_ONCE(sqe->addr); msg->src_fd = READ_ONCE(sqe->addr3); msg->dst_fd = READ_ONCE(sqe->file_index); msg->flags = READ_ONCE(sqe->msg_ring_flags); if (msg->flags & ~IORING_MSG_RING_MASK) return -EINVAL; return 0; } int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) { struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); int ret; ret = -EBADFD; if (!io_is_uring_fops(req->file)) goto done; switch (msg->cmd) { case IORING_MSG_DATA: ret = io_msg_ring_data(req, issue_flags); break; case IORING_MSG_SEND_FD: ret = io_msg_send_fd(req, issue_flags); break; default: ret = -EINVAL; break; } done: if (ret < 0) { if (ret == -EAGAIN || ret == IOU_ISSUE_SKIP_COMPLETE) return ret; req_set_fail(req); } io_req_set_res(req, ret, 0); return IOU_OK; }
2 1 3 4 3 5 5 4 5 8 8 2 17 18 18 18 5 4 13 7 2 1 3 1 3 18 16 2 16 16 16 2 2 2 321 321 16 1 31 1 1 7 4 14 1 4 17 15 2 2 2 2 14 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/namei.h> #include <linux/nospec.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" #include "tctx.h" #include "poll.h" #include "timeout.h" #include "waitid.h" #include "futex.h" #include "cancel.h" struct io_cancel { struct file *file; u64 addr; u32 flags; s32 fd; u8 opcode; }; #define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \ IORING_ASYNC_CANCEL_ANY | IORING_ASYNC_CANCEL_FD_FIXED | \ IORING_ASYNC_CANCEL_USERDATA | IORING_ASYNC_CANCEL_OP) /* * Returns true if the request matches the criteria outlined by 'cd'. */ bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd) { bool match_user_data = cd->flags & IORING_ASYNC_CANCEL_USERDATA; if (req->ctx != cd->ctx) return false; if (!(cd->flags & (IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_OP))) match_user_data = true; if (cd->flags & IORING_ASYNC_CANCEL_ANY) goto check_seq; if (cd->flags & IORING_ASYNC_CANCEL_FD) { if (req->file != cd->file) return false; } if (cd->flags & IORING_ASYNC_CANCEL_OP) { if (req->opcode != cd->opcode) return false; } if (match_user_data && req->cqe.user_data != cd->data) return false; if (cd->flags & IORING_ASYNC_CANCEL_ALL) { check_seq: if (io_cancel_match_sequence(req, cd->seq)) return false; } return true; } static bool io_cancel_cb(struct io_wq_work *work, void *data) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct io_cancel_data *cd = data; return io_cancel_req_match(req, cd); } static int io_async_cancel_one(struct io_uring_task *tctx, struct io_cancel_data *cd) { enum io_wq_cancel cancel_ret; int ret = 0; bool all; if (!tctx || !tctx->io_wq) return -ENOENT; all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, cd, all); switch (cancel_ret) { case IO_WQ_CANCEL_OK: ret = 0; break; case IO_WQ_CANCEL_RUNNING: ret = -EALREADY; break; case IO_WQ_CANCEL_NOTFOUND: ret = -ENOENT; break; } return ret; } int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd, unsigned issue_flags) { struct io_ring_ctx *ctx = cd->ctx; int ret; WARN_ON_ONCE(!io_wq_current_is_worker() && tctx != current->io_uring); ret = io_async_cancel_one(tctx, cd); /* * Fall-through even for -EALREADY, as we may have poll armed * that need unarming. */ if (!ret) return 0; ret = io_poll_cancel(ctx, cd, issue_flags); if (ret != -ENOENT) return ret; ret = io_waitid_cancel(ctx, cd, issue_flags); if (ret != -ENOENT) return ret; ret = io_futex_cancel(ctx, cd, issue_flags); if (ret != -ENOENT) return ret; spin_lock(&ctx->completion_lock); if (!(cd->flags & IORING_ASYNC_CANCEL_FD)) ret = io_timeout_cancel(ctx, cd); spin_unlock(&ctx->completion_lock); return ret; } int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_cancel *cancel = io_kiocb_to_cmd(req, struct io_cancel); if (unlikely(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; if (sqe->off || sqe->splice_fd_in) return -EINVAL; cancel->addr = READ_ONCE(sqe->addr); cancel->flags = READ_ONCE(sqe->cancel_flags); if (cancel->flags & ~CANCEL_FLAGS) return -EINVAL; if (cancel->flags & IORING_ASYNC_CANCEL_FD) { if (cancel->flags & IORING_ASYNC_CANCEL_ANY) return -EINVAL; cancel->fd = READ_ONCE(sqe->fd); } if (cancel->flags & IORING_ASYNC_CANCEL_OP) { if (cancel->flags & IORING_ASYNC_CANCEL_ANY) return -EINVAL; cancel->opcode = READ_ONCE(sqe->len); } return 0; } static int __io_async_cancel(struct io_cancel_data *cd, struct io_uring_task *tctx, unsigned int issue_flags) { bool all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); struct io_ring_ctx *ctx = cd->ctx; struct io_tctx_node *node; int ret, nr = 0; do { ret = io_try_cancel(tctx, cd, issue_flags); if (ret == -ENOENT) break; if (!all) return ret; nr++; } while (1); /* slow path, try all io-wq's */ io_ring_submit_lock(ctx, issue_flags); ret = -ENOENT; list_for_each_entry(node, &ctx->tctx_list, ctx_node) { struct io_uring_task *tctx = node->task->io_uring; ret = io_async_cancel_one(tctx, cd); if (ret != -ENOENT) { if (!all) break; nr++; } } io_ring_submit_unlock(ctx, issue_flags); return all ? nr : ret; } int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) { struct io_cancel *cancel = io_kiocb_to_cmd(req, struct io_cancel); struct io_cancel_data cd = { .ctx = req->ctx, .data = cancel->addr, .flags = cancel->flags, .opcode = cancel->opcode, .seq = atomic_inc_return(&req->ctx->cancel_seq), }; struct io_uring_task *tctx = req->task->io_uring; int ret; if (cd.flags & IORING_ASYNC_CANCEL_FD) { if (req->flags & REQ_F_FIXED_FILE || cd.flags & IORING_ASYNC_CANCEL_FD_FIXED) { req->flags |= REQ_F_FIXED_FILE; req->file = io_file_get_fixed(req, cancel->fd, issue_flags); } else { req->file = io_file_get_normal(req, cancel->fd); } if (!req->file) { ret = -EBADF; goto done; } cd.file = req->file; } ret = __io_async_cancel(&cd, tctx, issue_flags); done: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); return IOU_OK; } void init_hash_table(struct io_hash_table *table, unsigned size) { unsigned int i; for (i = 0; i < size; i++) { spin_lock_init(&table->hbs[i].lock); INIT_HLIST_HEAD(&table->hbs[i].list); } } static int __io_sync_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd, int fd) { struct io_ring_ctx *ctx = cd->ctx; /* fixed must be grabbed every time since we drop the uring_lock */ if ((cd->flags & IORING_ASYNC_CANCEL_FD) && (cd->flags & IORING_ASYNC_CANCEL_FD_FIXED)) { if (unlikely(fd >= ctx->nr_user_files)) return -EBADF; fd = array_index_nospec(fd, ctx->nr_user_files); cd->file = io_file_from_index(&ctx->file_table, fd); if (!cd->file) return -EBADF; } return __io_async_cancel(cd, tctx, 0); } int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg) __must_hold(&ctx->uring_lock) { struct io_cancel_data cd = { .ctx = ctx, .seq = atomic_inc_return(&ctx->cancel_seq), }; ktime_t timeout = KTIME_MAX; struct io_uring_sync_cancel_reg sc; struct file *file = NULL; DEFINE_WAIT(wait); int ret, i; if (copy_from_user(&sc, arg, sizeof(sc))) return -EFAULT; if (sc.flags & ~CANCEL_FLAGS) return -EINVAL; for (i = 0; i < ARRAY_SIZE(sc.pad); i++) if (sc.pad[i]) return -EINVAL; for (i = 0; i < ARRAY_SIZE(sc.pad2); i++) if (sc.pad2[i]) return -EINVAL; cd.data = sc.addr; cd.flags = sc.flags; cd.opcode = sc.opcode; /* we can grab a normal file descriptor upfront */ if ((cd.flags & IORING_ASYNC_CANCEL_FD) && !(cd.flags & IORING_ASYNC_CANCEL_FD_FIXED)) { file = fget(sc.fd); if (!file) return -EBADF; cd.file = file; } ret = __io_sync_cancel(current->io_uring, &cd, sc.fd); /* found something, done! */ if (ret != -EALREADY) goto out; if (sc.timeout.tv_sec != -1UL || sc.timeout.tv_nsec != -1UL) { struct timespec64 ts = { .tv_sec = sc.timeout.tv_sec, .tv_nsec = sc.timeout.tv_nsec }; timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); } /* * Keep looking until we get -ENOENT. we'll get woken everytime * every time a request completes and will retry the cancelation. */ do { cd.seq = atomic_inc_return(&ctx->cancel_seq); prepare_to_wait(&ctx->cq_wait, &wait, TASK_INTERRUPTIBLE); ret = __io_sync_cancel(current->io_uring, &cd, sc.fd); mutex_unlock(&ctx->uring_lock); if (ret != -EALREADY) break; ret = io_run_task_work_sig(ctx); if (ret < 0) break; ret = schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS); if (!ret) { ret = -ETIME; break; } mutex_lock(&ctx->uring_lock); } while (1); finish_wait(&ctx->cq_wait, &wait); mutex_lock(&ctx->uring_lock); if (ret == -ENOENT || ret > 0) ret = 0; out: if (file) fput(file); return ret; }
43 42 33 2 2 1864 1820 45 12 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 // SPDX-License-Identifier: GPL-2.0-or-later /* * "LAPB via ethernet" driver release 001 * * This code REQUIRES 2.1.15 or higher/ NET3.038 * * This is a "pseudo" network driver to allow LAPB over Ethernet. * * This driver can use any ethernet destination address, and can be * limited to accept frames from one dedicated ethernet card only. * * History * LAPBETH 001 Jonathan Naylor Cloned from bpqether.c * 2000-10-29 Henner Eisen lapb_data_indication() return status. * 2000-11-14 Henner Eisen dev_hold/put, NETDEV_GOING_DOWN support */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/net.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/uaccess.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/notifier.h> #include <linux/stat.h> #include <linux/module.h> #include <linux/lapb.h> #include <linux/init.h> #include <net/x25device.h> static const u8 bcast_addr[6] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; /* If this number is made larger, check that the temporary string buffer * in lapbeth_new_device is large enough to store the probe device name. */ #define MAXLAPBDEV 100 struct lapbethdev { struct list_head node; struct net_device *ethdev; /* link to ethernet device */ struct net_device *axdev; /* lapbeth device (lapb#) */ bool up; spinlock_t up_lock; /* Protects "up" */ struct sk_buff_head rx_queue; struct napi_struct napi; }; static LIST_HEAD(lapbeth_devices); static void lapbeth_connected(struct net_device *dev, int reason); static void lapbeth_disconnected(struct net_device *dev, int reason); /* ------------------------------------------------------------------------ */ /* Get the LAPB device for the ethernet device */ static struct lapbethdev *lapbeth_get_x25_dev(struct net_device *dev) { struct lapbethdev *lapbeth; list_for_each_entry_rcu(lapbeth, &lapbeth_devices, node, lockdep_rtnl_is_held()) { if (lapbeth->ethdev == dev) return lapbeth; } return NULL; } static __inline__ int dev_is_ethdev(struct net_device *dev) { return dev->type == ARPHRD_ETHER && strncmp(dev->name, "dummy", 5); } /* ------------------------------------------------------------------------ */ static int lapbeth_napi_poll(struct napi_struct *napi, int budget) { struct lapbethdev *lapbeth = container_of(napi, struct lapbethdev, napi); struct sk_buff *skb; int processed = 0; for (; processed < budget; ++processed) { skb = skb_dequeue(&lapbeth->rx_queue); if (!skb) break; netif_receive_skb_core(skb); } if (processed < budget) napi_complete(napi); return processed; } /* Receive a LAPB frame via an ethernet interface. */ static int lapbeth_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype, struct net_device *orig_dev) { int len, err; struct lapbethdev *lapbeth; if (dev_net(dev) != &init_net) goto drop; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) return NET_RX_DROP; if (!pskb_may_pull(skb, 2)) goto drop; rcu_read_lock(); lapbeth = lapbeth_get_x25_dev(dev); if (!lapbeth) goto drop_unlock_rcu; spin_lock_bh(&lapbeth->up_lock); if (!lapbeth->up) goto drop_unlock; len = skb->data[0] + skb->data[1] * 256; dev->stats.rx_packets++; dev->stats.rx_bytes += len; skb_pull(skb, 2); /* Remove the length bytes */ skb_trim(skb, len); /* Set the length of the data */ err = lapb_data_received(lapbeth->axdev, skb); if (err != LAPB_OK) { printk(KERN_DEBUG "lapbether: lapb_data_received err - %d\n", err); goto drop_unlock; } out: spin_unlock_bh(&lapbeth->up_lock); rcu_read_unlock(); return 0; drop_unlock: kfree_skb(skb); goto out; drop_unlock_rcu: rcu_read_unlock(); drop: kfree_skb(skb); return 0; } static int lapbeth_data_indication(struct net_device *dev, struct sk_buff *skb) { struct lapbethdev *lapbeth = netdev_priv(dev); unsigned char *ptr; if (skb_cow(skb, 1)) { kfree_skb(skb); return NET_RX_DROP; } skb_push(skb, 1); ptr = skb->data; *ptr = X25_IFACE_DATA; skb->protocol = x25_type_trans(skb, dev); skb_queue_tail(&lapbeth->rx_queue, skb); napi_schedule(&lapbeth->napi); return NET_RX_SUCCESS; } /* Send a LAPB frame via an ethernet interface */ static netdev_tx_t lapbeth_xmit(struct sk_buff *skb, struct net_device *dev) { struct lapbethdev *lapbeth = netdev_priv(dev); int err; spin_lock_bh(&lapbeth->up_lock); if (!lapbeth->up) goto drop; /* There should be a pseudo header of 1 byte added by upper layers. * Check to make sure it is there before reading it. */ if (skb->len < 1) goto drop; switch (skb->data[0]) { case X25_IFACE_DATA: break; case X25_IFACE_CONNECT: err = lapb_connect_request(dev); if (err == LAPB_CONNECTED) lapbeth_connected(dev, LAPB_OK); else if (err != LAPB_OK) pr_err("lapb_connect_request error: %d\n", err); goto drop; case X25_IFACE_DISCONNECT: err = lapb_disconnect_request(dev); if (err == LAPB_NOTCONNECTED) lapbeth_disconnected(dev, LAPB_OK); else if (err != LAPB_OK) pr_err("lapb_disconnect_request err: %d\n", err); fallthrough; default: goto drop; } skb_pull(skb, 1); err = lapb_data_request(dev, skb); if (err != LAPB_OK) { pr_err("lapb_data_request error - %d\n", err); goto drop; } out: spin_unlock_bh(&lapbeth->up_lock); return NETDEV_TX_OK; drop: kfree_skb(skb); goto out; } static void lapbeth_data_transmit(struct net_device *ndev, struct sk_buff *skb) { struct lapbethdev *lapbeth = netdev_priv(ndev); unsigned char *ptr; struct net_device *dev; int size = skb->len; ptr = skb_push(skb, 2); *ptr++ = size % 256; *ptr++ = size / 256; ndev->stats.tx_packets++; ndev->stats.tx_bytes += size; skb->dev = dev = lapbeth->ethdev; skb->protocol = htons(ETH_P_DEC); skb_reset_network_header(skb); dev_hard_header(skb, dev, ETH_P_DEC, bcast_addr, NULL, 0); dev_queue_xmit(skb); } static void lapbeth_connected(struct net_device *dev, int reason) { struct lapbethdev *lapbeth = netdev_priv(dev); unsigned char *ptr; struct sk_buff *skb = __dev_alloc_skb(1, GFP_ATOMIC | __GFP_NOMEMALLOC); if (!skb) return; ptr = skb_put(skb, 1); *ptr = X25_IFACE_CONNECT; skb->protocol = x25_type_trans(skb, dev); skb_queue_tail(&lapbeth->rx_queue, skb); napi_schedule(&lapbeth->napi); } static void lapbeth_disconnected(struct net_device *dev, int reason) { struct lapbethdev *lapbeth = netdev_priv(dev); unsigned char *ptr; struct sk_buff *skb = __dev_alloc_skb(1, GFP_ATOMIC | __GFP_NOMEMALLOC); if (!skb) return; ptr = skb_put(skb, 1); *ptr = X25_IFACE_DISCONNECT; skb->protocol = x25_type_trans(skb, dev); skb_queue_tail(&lapbeth->rx_queue, skb); napi_schedule(&lapbeth->napi); } /* Set AX.25 callsign */ static int lapbeth_set_mac_address(struct net_device *dev, void *addr) { struct sockaddr *sa = addr; dev_addr_set(dev, sa->sa_data); return 0; } static const struct lapb_register_struct lapbeth_callbacks = { .connect_confirmation = lapbeth_connected, .connect_indication = lapbeth_connected, .disconnect_confirmation = lapbeth_disconnected, .disconnect_indication = lapbeth_disconnected, .data_indication = lapbeth_data_indication, .data_transmit = lapbeth_data_transmit, }; /* open/close a device */ static int lapbeth_open(struct net_device *dev) { struct lapbethdev *lapbeth = netdev_priv(dev); int err; napi_enable(&lapbeth->napi); err = lapb_register(dev, &lapbeth_callbacks); if (err != LAPB_OK) { napi_disable(&lapbeth->napi); pr_err("lapb_register error: %d\n", err); return -ENODEV; } spin_lock_bh(&lapbeth->up_lock); lapbeth->up = true; spin_unlock_bh(&lapbeth->up_lock); return 0; } static int lapbeth_close(struct net_device *dev) { struct lapbethdev *lapbeth = netdev_priv(dev); int err; spin_lock_bh(&lapbeth->up_lock); lapbeth->up = false; spin_unlock_bh(&lapbeth->up_lock); err = lapb_unregister(dev); if (err != LAPB_OK) pr_err("lapb_unregister error: %d\n", err); napi_disable(&lapbeth->napi); return 0; } /* ------------------------------------------------------------------------ */ static const struct net_device_ops lapbeth_netdev_ops = { .ndo_open = lapbeth_open, .ndo_stop = lapbeth_close, .ndo_start_xmit = lapbeth_xmit, .ndo_set_mac_address = lapbeth_set_mac_address, }; static void lapbeth_setup(struct net_device *dev) { dev->netdev_ops = &lapbeth_netdev_ops; dev->needs_free_netdev = true; dev->type = ARPHRD_X25; dev->hard_header_len = 0; dev->mtu = 1000; dev->addr_len = 0; } /* Setup a new device. */ static int lapbeth_new_device(struct net_device *dev) { struct net_device *ndev; struct lapbethdev *lapbeth; int rc = -ENOMEM; ASSERT_RTNL(); if (dev->type != ARPHRD_ETHER) return -EINVAL; ndev = alloc_netdev(sizeof(*lapbeth), "lapb%d", NET_NAME_UNKNOWN, lapbeth_setup); if (!ndev) goto out; /* When transmitting data: * first this driver removes a pseudo header of 1 byte, * then the lapb module prepends an LAPB header of at most 3 bytes, * then this driver prepends a length field of 2 bytes, * then the underlying Ethernet device prepends its own header. */ ndev->needed_headroom = -1 + 3 + 2 + dev->hard_header_len + dev->needed_headroom; ndev->needed_tailroom = dev->needed_tailroom; lapbeth = netdev_priv(ndev); lapbeth->axdev = ndev; dev_hold(dev); lapbeth->ethdev = dev; lapbeth->up = false; spin_lock_init(&lapbeth->up_lock); skb_queue_head_init(&lapbeth->rx_queue); netif_napi_add_weight(ndev, &lapbeth->napi, lapbeth_napi_poll, 16); rc = -EIO; if (register_netdevice(ndev)) goto fail; list_add_rcu(&lapbeth->node, &lapbeth_devices); rc = 0; out: return rc; fail: dev_put(dev); free_netdev(ndev); goto out; } /* Free a lapb network device. */ static void lapbeth_free_device(struct lapbethdev *lapbeth) { dev_put(lapbeth->ethdev); list_del_rcu(&lapbeth->node); unregister_netdevice(lapbeth->axdev); } /* Handle device status changes. * * Called from notifier with RTNL held. */ static int lapbeth_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct lapbethdev *lapbeth; struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (dev_net(dev) != &init_net) return NOTIFY_DONE; if (!dev_is_ethdev(dev) && !lapbeth_get_x25_dev(dev)) return NOTIFY_DONE; switch (event) { case NETDEV_UP: /* New ethernet device -> new LAPB interface */ if (!lapbeth_get_x25_dev(dev)) lapbeth_new_device(dev); break; case NETDEV_GOING_DOWN: /* ethernet device closes -> close LAPB interface */ lapbeth = lapbeth_get_x25_dev(dev); if (lapbeth) dev_close(lapbeth->axdev); break; case NETDEV_UNREGISTER: /* ethernet device disappears -> remove LAPB interface */ lapbeth = lapbeth_get_x25_dev(dev); if (lapbeth) lapbeth_free_device(lapbeth); break; } return NOTIFY_DONE; } /* ------------------------------------------------------------------------ */ static struct packet_type lapbeth_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_DEC), .func = lapbeth_rcv, }; static struct notifier_block lapbeth_dev_notifier = { .notifier_call = lapbeth_device_event, }; static const char banner[] __initconst = KERN_INFO "LAPB Ethernet driver version 0.02\n"; static int __init lapbeth_init_driver(void) { dev_add_pack(&lapbeth_packet_type); register_netdevice_notifier(&lapbeth_dev_notifier); printk(banner); return 0; } module_init(lapbeth_init_driver); static void __exit lapbeth_cleanup_driver(void) { struct lapbethdev *lapbeth; struct list_head *entry, *tmp; dev_remove_pack(&lapbeth_packet_type); unregister_netdevice_notifier(&lapbeth_dev_notifier); rtnl_lock(); list_for_each_safe(entry, tmp, &lapbeth_devices) { lapbeth = list_entry(entry, struct lapbethdev, node); dev_put(lapbeth->ethdev); unregister_netdevice(lapbeth->axdev); } rtnl_unlock(); } module_exit(lapbeth_cleanup_driver); MODULE_AUTHOR("Jonathan Naylor <g4klx@g4klx.demon.co.uk>"); MODULE_DESCRIPTION("The unofficial LAPB over Ethernet driver"); MODULE_LICENSE("GPL");
2 2 1 4 4 3 3 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C)2003,2004 USAGI/WIDE Project * * Authors Mitsuru KANDA <mk@linux-ipv6.org> * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org> */ #define pr_fmt(fmt) "IPv6: " fmt #include <linux/icmpv6.h> #include <linux/init.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/xfrm.h> static struct xfrm6_tunnel __rcu *tunnel6_handlers __read_mostly; static struct xfrm6_tunnel __rcu *tunnel46_handlers __read_mostly; static struct xfrm6_tunnel __rcu *tunnelmpls6_handlers __read_mostly; static DEFINE_MUTEX(tunnel6_mutex); static inline int xfrm6_tunnel_mpls_supported(void) { return IS_ENABLED(CONFIG_MPLS); } int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family) { struct xfrm6_tunnel __rcu **pprev; struct xfrm6_tunnel *t; int ret = -EEXIST; int priority = handler->priority; mutex_lock(&tunnel6_mutex); switch (family) { case AF_INET6: pprev = &tunnel6_handlers; break; case AF_INET: pprev = &tunnel46_handlers; break; case AF_MPLS: pprev = &tunnelmpls6_handlers; break; default: goto err; } for (; (t = rcu_dereference_protected(*pprev, lockdep_is_held(&tunnel6_mutex))) != NULL; pprev = &t->next) { if (t->priority > priority) break; if (t->priority == priority) goto err; } handler->next = *pprev; rcu_assign_pointer(*pprev, handler); ret = 0; err: mutex_unlock(&tunnel6_mutex); return ret; } EXPORT_SYMBOL(xfrm6_tunnel_register); int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family) { struct xfrm6_tunnel __rcu **pprev; struct xfrm6_tunnel *t; int ret = -ENOENT; mutex_lock(&tunnel6_mutex); switch (family) { case AF_INET6: pprev = &tunnel6_handlers; break; case AF_INET: pprev = &tunnel46_handlers; break; case AF_MPLS: pprev = &tunnelmpls6_handlers; break; default: goto err; } for (; (t = rcu_dereference_protected(*pprev, lockdep_is_held(&tunnel6_mutex))) != NULL; pprev = &t->next) { if (t == handler) { *pprev = handler->next; ret = 0; break; } } err: mutex_unlock(&tunnel6_mutex); synchronize_net(); return ret; } EXPORT_SYMBOL(xfrm6_tunnel_deregister); #define for_each_tunnel_rcu(head, handler) \ for (handler = rcu_dereference(head); \ handler != NULL; \ handler = rcu_dereference(handler->next)) \ static int tunnelmpls6_rcv(struct sk_buff *skb) { struct xfrm6_tunnel *handler; if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto drop; for_each_tunnel_rcu(tunnelmpls6_handlers, handler) if (!handler->handler(skb)) return 0; icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); drop: kfree_skb(skb); return 0; } static int tunnel6_rcv(struct sk_buff *skb) { struct xfrm6_tunnel *handler; if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto drop; for_each_tunnel_rcu(tunnel6_handlers, handler) if (!handler->handler(skb)) return 0; icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); drop: kfree_skb(skb); return 0; } #if IS_ENABLED(CONFIG_INET6_XFRM_TUNNEL) static int tunnel6_rcv_cb(struct sk_buff *skb, u8 proto, int err) { struct xfrm6_tunnel __rcu *head; struct xfrm6_tunnel *handler; int ret; head = (proto == IPPROTO_IPV6) ? tunnel6_handlers : tunnel46_handlers; for_each_tunnel_rcu(head, handler) { if (handler->cb_handler) { ret = handler->cb_handler(skb, err); if (ret <= 0) return ret; } } return 0; } static const struct xfrm_input_afinfo tunnel6_input_afinfo = { .family = AF_INET6, .is_ipip = true, .callback = tunnel6_rcv_cb, }; #endif static int tunnel46_rcv(struct sk_buff *skb) { struct xfrm6_tunnel *handler; if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto drop; for_each_tunnel_rcu(tunnel46_handlers, handler) if (!handler->handler(skb)) return 0; icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); drop: kfree_skb(skb); return 0; } static int tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct xfrm6_tunnel *handler; for_each_tunnel_rcu(tunnel6_handlers, handler) if (!handler->err_handler(skb, opt, type, code, offset, info)) return 0; return -ENOENT; } static int tunnel46_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct xfrm6_tunnel *handler; for_each_tunnel_rcu(tunnel46_handlers, handler) if (!handler->err_handler(skb, opt, type, code, offset, info)) return 0; return -ENOENT; } static int tunnelmpls6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct xfrm6_tunnel *handler; for_each_tunnel_rcu(tunnelmpls6_handlers, handler) if (!handler->err_handler(skb, opt, type, code, offset, info)) return 0; return -ENOENT; } static const struct inet6_protocol tunnel6_protocol = { .handler = tunnel6_rcv, .err_handler = tunnel6_err, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; static const struct inet6_protocol tunnel46_protocol = { .handler = tunnel46_rcv, .err_handler = tunnel46_err, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; static const struct inet6_protocol tunnelmpls6_protocol = { .handler = tunnelmpls6_rcv, .err_handler = tunnelmpls6_err, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; static int __init tunnel6_init(void) { if (inet6_add_protocol(&tunnel6_protocol, IPPROTO_IPV6)) { pr_err("%s: can't add protocol\n", __func__); return -EAGAIN; } if (inet6_add_protocol(&tunnel46_protocol, IPPROTO_IPIP)) { pr_err("%s: can't add protocol\n", __func__); inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6); return -EAGAIN; } if (xfrm6_tunnel_mpls_supported() && inet6_add_protocol(&tunnelmpls6_protocol, IPPROTO_MPLS)) { pr_err("%s: can't add protocol\n", __func__); inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6); inet6_del_protocol(&tunnel46_protocol, IPPROTO_IPIP); return -EAGAIN; } #if IS_ENABLED(CONFIG_INET6_XFRM_TUNNEL) if (xfrm_input_register_afinfo(&tunnel6_input_afinfo)) { pr_err("%s: can't add input afinfo\n", __func__); inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6); inet6_del_protocol(&tunnel46_protocol, IPPROTO_IPIP); if (xfrm6_tunnel_mpls_supported()) inet6_del_protocol(&tunnelmpls6_protocol, IPPROTO_MPLS); return -EAGAIN; } #endif return 0; } static void __exit tunnel6_fini(void) { #if IS_ENABLED(CONFIG_INET6_XFRM_TUNNEL) if (xfrm_input_unregister_afinfo(&tunnel6_input_afinfo)) pr_err("%s: can't remove input afinfo\n", __func__); #endif if (inet6_del_protocol(&tunnel46_protocol, IPPROTO_IPIP)) pr_err("%s: can't remove protocol\n", __func__); if (inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6)) pr_err("%s: can't remove protocol\n", __func__); if (xfrm6_tunnel_mpls_supported() && inet6_del_protocol(&tunnelmpls6_protocol, IPPROTO_MPLS)) pr_err("%s: can't remove protocol\n", __func__); } module_init(tunnel6_init); module_exit(tunnel6_fini); MODULE_DESCRIPTION("IP-in-IPv6 tunnel driver"); MODULE_LICENSE("GPL");
16 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 /* SPDX-License-Identifier: GPL-2.0-only */ /* * fence-chain: chain fences together in a timeline * * Copyright (C) 2018 Advanced Micro Devices, Inc. * Authors: * Christian König <christian.koenig@amd.com> */ #ifndef __LINUX_DMA_FENCE_CHAIN_H #define __LINUX_DMA_FENCE_CHAIN_H #include <linux/dma-fence.h> #include <linux/irq_work.h> #include <linux/slab.h> /** * struct dma_fence_chain - fence to represent an node of a fence chain * @base: fence base class * @prev: previous fence of the chain * @prev_seqno: original previous seqno before garbage collection * @fence: encapsulated fence * @lock: spinlock for fence handling */ struct dma_fence_chain { struct dma_fence base; struct dma_fence __rcu *prev; u64 prev_seqno; struct dma_fence *fence; union { /** * @cb: callback for signaling * * This is used to add the callback for signaling the * complection of the fence chain. Never used at the same time * as the irq work. */ struct dma_fence_cb cb; /** * @work: irq work item for signaling * * Irq work structure to allow us to add the callback without * running into lock inversion. Never used at the same time as * the callback. */ struct irq_work work; }; spinlock_t lock; }; /** * to_dma_fence_chain - cast a fence to a dma_fence_chain * @fence: fence to cast to a dma_fence_array * * Returns NULL if the fence is not a dma_fence_chain, * or the dma_fence_chain otherwise. */ static inline struct dma_fence_chain * to_dma_fence_chain(struct dma_fence *fence) { if (!fence || !dma_fence_is_chain(fence)) return NULL; return container_of(fence, struct dma_fence_chain, base); } /** * dma_fence_chain_contained - return the contained fence * @fence: the fence to test * * If the fence is a dma_fence_chain the function returns the fence contained * inside the chain object, otherwise it returns the fence itself. */ static inline struct dma_fence * dma_fence_chain_contained(struct dma_fence *fence) { struct dma_fence_chain *chain = to_dma_fence_chain(fence); return chain ? chain->fence : fence; } /** * dma_fence_chain_alloc * * Returns a new struct dma_fence_chain object or NULL on failure. */ static inline struct dma_fence_chain *dma_fence_chain_alloc(void) { return kmalloc(sizeof(struct dma_fence_chain), GFP_KERNEL); }; /** * dma_fence_chain_free * @chain: chain node to free * * Frees up an allocated but not used struct dma_fence_chain object. This * doesn't need an RCU grace period since the fence was never initialized nor * published. After dma_fence_chain_init() has been called the fence must be * released by calling dma_fence_put(), and not through this function. */ static inline void dma_fence_chain_free(struct dma_fence_chain *chain) { kfree(chain); }; /** * dma_fence_chain_for_each - iterate over all fences in chain * @iter: current fence * @head: starting point * * Iterate over all fences in the chain. We keep a reference to the current * fence while inside the loop which must be dropped when breaking out. * * For a deep dive iterator see dma_fence_unwrap_for_each(). */ #define dma_fence_chain_for_each(iter, head) \ for (iter = dma_fence_get(head); iter; \ iter = dma_fence_chain_walk(iter)) struct dma_fence *dma_fence_chain_walk(struct dma_fence *fence); int dma_fence_chain_find_seqno(struct dma_fence **pfence, uint64_t seqno); void dma_fence_chain_init(struct dma_fence_chain *chain, struct dma_fence *prev, struct dma_fence *fence, uint64_t seqno); #endif /* __LINUX_DMA_FENCE_CHAIN_H */
6 1 1 1 3 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 // SPDX-License-Identifier: GPL-2.0-only /* IP tables module for matching IPsec policy * * Copyright (c) 2004,2005 Patrick McHardy, <kaber@trash.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/init.h> #include <net/xfrm.h> #include <linux/netfilter.h> #include <linux/netfilter/xt_policy.h> #include <linux/netfilter/x_tables.h> MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("Xtables: IPsec policy match"); MODULE_LICENSE("GPL"); static inline bool xt_addr_cmp(const union nf_inet_addr *a1, const union nf_inet_addr *m, const union nf_inet_addr *a2, unsigned short family) { switch (family) { case NFPROTO_IPV4: return ((a1->ip ^ a2->ip) & m->ip) == 0; case NFPROTO_IPV6: return ipv6_masked_addr_cmp(&a1->in6, &m->in6, &a2->in6) == 0; } return false; } static bool match_xfrm_state(const struct xfrm_state *x, const struct xt_policy_elem *e, unsigned short family) { #define MATCH_ADDR(x,y,z) (!e->match.x || \ (xt_addr_cmp(&e->x, &e->y, (const union nf_inet_addr *)(z), family) \ ^ e->invert.x)) #define MATCH(x,y) (!e->match.x || ((e->x == (y)) ^ e->invert.x)) return MATCH_ADDR(saddr, smask, &x->props.saddr) && MATCH_ADDR(daddr, dmask, &x->id.daddr) && MATCH(proto, x->id.proto) && MATCH(mode, x->props.mode) && MATCH(spi, x->id.spi) && MATCH(reqid, x->props.reqid); } static int match_policy_in(const struct sk_buff *skb, const struct xt_policy_info *info, unsigned short family) { const struct xt_policy_elem *e; const struct sec_path *sp = skb_sec_path(skb); int strict = info->flags & XT_POLICY_MATCH_STRICT; int i, pos; if (sp == NULL) return -1; if (strict && info->len != sp->len) return 0; for (i = sp->len - 1; i >= 0; i--) { pos = strict ? i - sp->len + 1 : 0; if (pos >= info->len) return 0; e = &info->pol[pos]; if (match_xfrm_state(sp->xvec[i], e, family)) { if (!strict) return 1; } else if (strict) return 0; } return strict ? 1 : 0; } static int match_policy_out(const struct sk_buff *skb, const struct xt_policy_info *info, unsigned short family) { const struct xt_policy_elem *e; const struct dst_entry *dst = skb_dst(skb); int strict = info->flags & XT_POLICY_MATCH_STRICT; int i, pos; if (dst->xfrm == NULL) return -1; for (i = 0; dst && dst->xfrm; dst = ((struct xfrm_dst *)dst)->child, i++) { pos = strict ? i : 0; if (pos >= info->len) return 0; e = &info->pol[pos]; if (match_xfrm_state(dst->xfrm, e, family)) { if (!strict) return 1; } else if (strict) return 0; } return strict ? i == info->len : 0; } static bool policy_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_policy_info *info = par->matchinfo; int ret; if (info->flags & XT_POLICY_MATCH_IN) ret = match_policy_in(skb, info, xt_family(par)); else ret = match_policy_out(skb, info, xt_family(par)); if (ret < 0) ret = info->flags & XT_POLICY_MATCH_NONE ? true : false; else if (info->flags & XT_POLICY_MATCH_NONE) ret = false; return ret; } static int policy_mt_check(const struct xt_mtchk_param *par) { const struct xt_policy_info *info = par->matchinfo; const char *errmsg = "neither incoming nor outgoing policy selected"; if (!(info->flags & (XT_POLICY_MATCH_IN|XT_POLICY_MATCH_OUT))) goto err; if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN)) && info->flags & XT_POLICY_MATCH_OUT) { errmsg = "output policy not valid in PREROUTING and INPUT"; goto err; } if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_OUT)) && info->flags & XT_POLICY_MATCH_IN) { errmsg = "input policy not valid in POSTROUTING and OUTPUT"; goto err; } if (info->len > XT_POLICY_MAX_ELEM) { errmsg = "too many policy elements"; goto err; } return 0; err: pr_info_ratelimited("%s\n", errmsg); return -EINVAL; } static struct xt_match policy_mt_reg[] __read_mostly = { { .name = "policy", .family = NFPROTO_IPV4, .checkentry = policy_mt_check, .match = policy_mt, .matchsize = sizeof(struct xt_policy_info), .me = THIS_MODULE, }, { .name = "policy", .family = NFPROTO_IPV6, .checkentry = policy_mt_check, .match = policy_mt, .matchsize = sizeof(struct xt_policy_info), .me = THIS_MODULE, }, }; static int __init policy_mt_init(void) { return xt_register_matches(policy_mt_reg, ARRAY_SIZE(policy_mt_reg)); } static void __exit policy_mt_exit(void) { xt_unregister_matches(policy_mt_reg, ARRAY_SIZE(policy_mt_reg)); } module_init(policy_mt_init); module_exit(policy_mt_exit); MODULE_ALIAS("ipt_policy"); MODULE_ALIAS("ip6t_policy");
4 20 17 49 49 4 4 4 3 4 4 4 3 4 119 119 119 41 41 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 2 4 4 4 13 13 2 2 1 1 1 1 2 1 1 3 2 1 1 1 2 1 1 1 1 1 1 1 1 13 13 13 13 13 13 13 13 13 13 13 13 1871 1850 16 13 13 119 119 119 1895 1895 1872 119 1895 1874 119 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2016 Mellanox Technologies. All rights reserved. * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ #include "devl_internal.h" #define DEVLINK_PORT_FN_CAPS_VALID_MASK \ (_BITUL(__DEVLINK_PORT_FN_ATTR_CAPS_MAX) - 1) static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1] = { [DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] = { .type = NLA_BINARY }, [DEVLINK_PORT_FN_ATTR_STATE] = NLA_POLICY_RANGE(NLA_U8, DEVLINK_PORT_FN_STATE_INACTIVE, DEVLINK_PORT_FN_STATE_ACTIVE), [DEVLINK_PORT_FN_ATTR_CAPS] = NLA_POLICY_BITFIELD32(DEVLINK_PORT_FN_CAPS_VALID_MASK), }; #define ASSERT_DEVLINK_PORT_REGISTERED(devlink_port) \ WARN_ON_ONCE(!(devlink_port)->registered) #define ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port) \ WARN_ON_ONCE((devlink_port)->registered) struct devlink_port *devlink_port_get_by_index(struct devlink *devlink, unsigned int port_index) { return xa_load(&devlink->ports, port_index); } struct devlink_port *devlink_port_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) { if (attrs[DEVLINK_ATTR_PORT_INDEX]) { u32 port_index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]); struct devlink_port *devlink_port; devlink_port = devlink_port_get_by_index(devlink, port_index); if (!devlink_port) return ERR_PTR(-ENODEV); return devlink_port; } return ERR_PTR(-EINVAL); } struct devlink_port *devlink_port_get_from_info(struct devlink *devlink, struct genl_info *info) { return devlink_port_get_from_attrs(devlink, info->attrs); } static void devlink_port_fn_cap_fill(struct nla_bitfield32 *caps, u32 cap, bool is_enable) { caps->selector |= cap; if (is_enable) caps->value |= cap; } static int devlink_port_fn_roce_fill(struct devlink_port *devlink_port, struct nla_bitfield32 *caps, struct netlink_ext_ack *extack) { bool is_enable; int err; if (!devlink_port->ops->port_fn_roce_get) return 0; err = devlink_port->ops->port_fn_roce_get(devlink_port, &is_enable, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_ROCE, is_enable); return 0; } static int devlink_port_fn_migratable_fill(struct devlink_port *devlink_port, struct nla_bitfield32 *caps, struct netlink_ext_ack *extack) { bool is_enable; int err; if (!devlink_port->ops->port_fn_migratable_get || devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) return 0; err = devlink_port->ops->port_fn_migratable_get(devlink_port, &is_enable, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_MIGRATABLE, is_enable); return 0; } static int devlink_port_fn_ipsec_crypto_fill(struct devlink_port *devlink_port, struct nla_bitfield32 *caps, struct netlink_ext_ack *extack) { bool is_enable; int err; if (!devlink_port->ops->port_fn_ipsec_crypto_get || devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) return 0; err = devlink_port->ops->port_fn_ipsec_crypto_get(devlink_port, &is_enable, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO, is_enable); return 0; } static int devlink_port_fn_ipsec_packet_fill(struct devlink_port *devlink_port, struct nla_bitfield32 *caps, struct netlink_ext_ack *extack) { bool is_enable; int err; if (!devlink_port->ops->port_fn_ipsec_packet_get || devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) return 0; err = devlink_port->ops->port_fn_ipsec_packet_get(devlink_port, &is_enable, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_IPSEC_PACKET, is_enable); return 0; } static int devlink_port_fn_caps_fill(struct devlink_port *devlink_port, struct sk_buff *msg, struct netlink_ext_ack *extack, bool *msg_updated) { struct nla_bitfield32 caps = {}; int err; err = devlink_port_fn_roce_fill(devlink_port, &caps, extack); if (err) return err; err = devlink_port_fn_migratable_fill(devlink_port, &caps, extack); if (err) return err; err = devlink_port_fn_ipsec_crypto_fill(devlink_port, &caps, extack); if (err) return err; err = devlink_port_fn_ipsec_packet_fill(devlink_port, &caps, extack); if (err) return err; if (!caps.selector) return 0; err = nla_put_bitfield32(msg, DEVLINK_PORT_FN_ATTR_CAPS, caps.value, caps.selector); if (err) return err; *msg_updated = true; return 0; } int devlink_nl_port_handle_fill(struct sk_buff *msg, struct devlink_port *devlink_port) { if (devlink_nl_put_handle(msg, devlink_port->devlink)) return -EMSGSIZE; if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) return -EMSGSIZE; return 0; } size_t devlink_nl_port_handle_size(struct devlink_port *devlink_port) { struct devlink *devlink = devlink_port->devlink; return nla_total_size(strlen(devlink->dev->bus->name) + 1) /* DEVLINK_ATTR_BUS_NAME */ + nla_total_size(strlen(dev_name(devlink->dev)) + 1) /* DEVLINK_ATTR_DEV_NAME */ + nla_total_size(4); /* DEVLINK_ATTR_PORT_INDEX */ } static int devlink_nl_port_attrs_put(struct sk_buff *msg, struct devlink_port *devlink_port) { struct devlink_port_attrs *attrs = &devlink_port->attrs; if (!devlink_port->attrs_set) return 0; if (attrs->lanes) { if (nla_put_u32(msg, DEVLINK_ATTR_PORT_LANES, attrs->lanes)) return -EMSGSIZE; } if (nla_put_u8(msg, DEVLINK_ATTR_PORT_SPLITTABLE, attrs->splittable)) return -EMSGSIZE; if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour)) return -EMSGSIZE; switch (devlink_port->attrs.flavour) { case DEVLINK_PORT_FLAVOUR_PCI_PF: if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, attrs->pci_pf.controller) || nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_pf.pf)) return -EMSGSIZE; if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_pf.external)) return -EMSGSIZE; break; case DEVLINK_PORT_FLAVOUR_PCI_VF: if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, attrs->pci_vf.controller) || nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_vf.pf) || nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_VF_NUMBER, attrs->pci_vf.vf)) return -EMSGSIZE; if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_vf.external)) return -EMSGSIZE; break; case DEVLINK_PORT_FLAVOUR_PCI_SF: if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, attrs->pci_sf.controller) || nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_sf.pf) || nla_put_u32(msg, DEVLINK_ATTR_PORT_PCI_SF_NUMBER, attrs->pci_sf.sf)) return -EMSGSIZE; break; case DEVLINK_PORT_FLAVOUR_PHYSICAL: case DEVLINK_PORT_FLAVOUR_CPU: case DEVLINK_PORT_FLAVOUR_DSA: if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER, attrs->phys.port_number)) return -EMSGSIZE; if (!attrs->split) return 0; if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP, attrs->phys.port_number)) return -EMSGSIZE; if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER, attrs->phys.split_subport_number)) return -EMSGSIZE; break; default: break; } return 0; } static int devlink_port_fn_hw_addr_fill(struct devlink_port *port, struct sk_buff *msg, struct netlink_ext_ack *extack, bool *msg_updated) { u8 hw_addr[MAX_ADDR_LEN]; int hw_addr_len; int err; if (!port->ops->port_fn_hw_addr_get) return 0; err = port->ops->port_fn_hw_addr_get(port, hw_addr, &hw_addr_len, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } err = nla_put(msg, DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, hw_addr_len, hw_addr); if (err) return err; *msg_updated = true; return 0; } static bool devlink_port_fn_state_valid(enum devlink_port_fn_state state) { return state == DEVLINK_PORT_FN_STATE_INACTIVE || state == DEVLINK_PORT_FN_STATE_ACTIVE; } static bool devlink_port_fn_opstate_valid(enum devlink_port_fn_opstate opstate) { return opstate == DEVLINK_PORT_FN_OPSTATE_DETACHED || opstate == DEVLINK_PORT_FN_OPSTATE_ATTACHED; } static int devlink_port_fn_state_fill(struct devlink_port *port, struct sk_buff *msg, struct netlink_ext_ack *extack, bool *msg_updated) { enum devlink_port_fn_opstate opstate; enum devlink_port_fn_state state; int err; if (!port->ops->port_fn_state_get) return 0; err = port->ops->port_fn_state_get(port, &state, &opstate, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } if (!devlink_port_fn_state_valid(state)) { WARN_ON_ONCE(1); NL_SET_ERR_MSG(extack, "Invalid state read from driver"); return -EINVAL; } if (!devlink_port_fn_opstate_valid(opstate)) { WARN_ON_ONCE(1); NL_SET_ERR_MSG(extack, "Invalid operational state read from driver"); return -EINVAL; } if (nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_STATE, state) || nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_OPSTATE, opstate)) return -EMSGSIZE; *msg_updated = true; return 0; } static int devlink_port_fn_mig_set(struct devlink_port *devlink_port, bool enable, struct netlink_ext_ack *extack) { return devlink_port->ops->port_fn_migratable_set(devlink_port, enable, extack); } static int devlink_port_fn_roce_set(struct devlink_port *devlink_port, bool enable, struct netlink_ext_ack *extack) { return devlink_port->ops->port_fn_roce_set(devlink_port, enable, extack); } static int devlink_port_fn_ipsec_crypto_set(struct devlink_port *devlink_port, bool enable, struct netlink_ext_ack *extack) { return devlink_port->ops->port_fn_ipsec_crypto_set(devlink_port, enable, extack); } static int devlink_port_fn_ipsec_packet_set(struct devlink_port *devlink_port, bool enable, struct netlink_ext_ack *extack) { return devlink_port->ops->port_fn_ipsec_packet_set(devlink_port, enable, extack); } static int devlink_port_fn_caps_set(struct devlink_port *devlink_port, const struct nlattr *attr, struct netlink_ext_ack *extack) { struct nla_bitfield32 caps; u32 caps_value; int err; caps = nla_get_bitfield32(attr); caps_value = caps.value & caps.selector; if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE) { err = devlink_port_fn_roce_set(devlink_port, caps_value & DEVLINK_PORT_FN_CAP_ROCE, extack); if (err) return err; } if (caps.selector & DEVLINK_PORT_FN_CAP_MIGRATABLE) { err = devlink_port_fn_mig_set(devlink_port, caps_value & DEVLINK_PORT_FN_CAP_MIGRATABLE, extack); if (err) return err; } if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO) { err = devlink_port_fn_ipsec_crypto_set(devlink_port, caps_value & DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO, extack); if (err) return err; } if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_PACKET) { err = devlink_port_fn_ipsec_packet_set(devlink_port, caps_value & DEVLINK_PORT_FN_CAP_IPSEC_PACKET, extack); if (err) return err; } return 0; } static int devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port, struct netlink_ext_ack *extack) { struct nlattr *function_attr; bool msg_updated = false; int err; function_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_PORT_FUNCTION); if (!function_attr) return -EMSGSIZE; err = devlink_port_fn_hw_addr_fill(port, msg, extack, &msg_updated); if (err) goto out; err = devlink_port_fn_caps_fill(port, msg, extack, &msg_updated); if (err) goto out; err = devlink_port_fn_state_fill(port, msg, extack, &msg_updated); if (err) goto out; err = devlink_rel_devlink_handle_put(msg, port->devlink, port->rel_index, DEVLINK_PORT_FN_ATTR_DEVLINK, &msg_updated); out: if (err || !msg_updated) nla_nest_cancel(msg, function_attr); else nla_nest_end(msg, function_attr); return err; } static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink_port *devlink_port, enum devlink_command cmd, u32 portid, u32 seq, int flags, struct netlink_ext_ack *extack) { struct devlink *devlink = devlink_port->devlink; void *hdr; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (devlink_nl_put_handle(msg, devlink)) goto nla_put_failure; if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) goto nla_put_failure; spin_lock_bh(&devlink_port->type_lock); if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type)) goto nla_put_failure_type_locked; if (devlink_port->desired_type != DEVLINK_PORT_TYPE_NOTSET && nla_put_u16(msg, DEVLINK_ATTR_PORT_DESIRED_TYPE, devlink_port->desired_type)) goto nla_put_failure_type_locked; if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) { if (devlink_port->type_eth.netdev && (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX, devlink_port->type_eth.ifindex) || nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME, devlink_port->type_eth.ifname))) goto nla_put_failure_type_locked; } if (devlink_port->type == DEVLINK_PORT_TYPE_IB) { struct ib_device *ibdev = devlink_port->type_ib.ibdev; if (ibdev && nla_put_string(msg, DEVLINK_ATTR_PORT_IBDEV_NAME, ibdev->name)) goto nla_put_failure_type_locked; } spin_unlock_bh(&devlink_port->type_lock); if (devlink_nl_port_attrs_put(msg, devlink_port)) goto nla_put_failure; if (devlink_nl_port_function_attrs_put(msg, devlink_port, extack)) goto nla_put_failure; if (devlink_port->linecard && nla_put_u32(msg, DEVLINK_ATTR_LINECARD_INDEX, devlink_linecard_index(devlink_port->linecard))) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure_type_locked: spin_unlock_bh(&devlink_port->type_lock); nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static void devlink_port_notify(struct devlink_port *devlink_port, enum devlink_command cmd) { struct devlink *devlink = devlink_port->devlink; struct devlink_obj_desc desc; struct sk_buff *msg; int err; WARN_ON(cmd != DEVLINK_CMD_PORT_NEW && cmd != DEVLINK_CMD_PORT_DEL); if (!__devl_is_registered(devlink) || !devlink_nl_notify_need(devlink)) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; err = devlink_nl_port_fill(msg, devlink_port, cmd, 0, 0, 0, NULL); if (err) { nlmsg_free(msg); return; } devlink_nl_obj_desc_init(&desc, devlink); devlink_nl_obj_desc_port_set(&desc, devlink_port); devlink_nl_notify_send_desc(devlink, msg, &desc); } static void devlink_ports_notify(struct devlink *devlink, enum devlink_command cmd) { struct devlink_port *devlink_port; unsigned long port_index; xa_for_each(&devlink->ports, port_index, devlink_port) devlink_port_notify(devlink_port, cmd); } void devlink_ports_notify_register(struct devlink *devlink) { devlink_ports_notify(devlink, DEVLINK_CMD_PORT_NEW); } void devlink_ports_notify_unregister(struct devlink *devlink) { devlink_ports_notify(devlink, DEVLINK_CMD_PORT_DEL); } int devlink_nl_port_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; struct sk_buff *msg; int err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_PORT_NEW, info->snd_portid, info->snd_seq, 0, info->extack); if (err) { nlmsg_free(msg); return err; } return genlmsg_reply(msg, info); } static int devlink_nl_port_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_port *devlink_port; unsigned long port_index; int err = 0; xa_for_each_start(&devlink->ports, port_index, devlink_port, state->idx) { err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_NEW, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags, cb->extack); if (err) { state->idx = port_index; break; } } return err; } int devlink_nl_port_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return devlink_nl_dumpit(skb, cb, devlink_nl_port_get_dump_one); } static int devlink_port_type_set(struct devlink_port *devlink_port, enum devlink_port_type port_type) { int err; if (!devlink_port->ops->port_type_set) return -EOPNOTSUPP; if (port_type == devlink_port->type) return 0; err = devlink_port->ops->port_type_set(devlink_port, port_type); if (err) return err; devlink_port->desired_type = port_type; devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); return 0; } static int devlink_port_function_hw_addr_set(struct devlink_port *port, const struct nlattr *attr, struct netlink_ext_ack *extack) { const u8 *hw_addr; int hw_addr_len; hw_addr = nla_data(attr); hw_addr_len = nla_len(attr); if (hw_addr_len > MAX_ADDR_LEN) { NL_SET_ERR_MSG(extack, "Port function hardware address too long"); return -EINVAL; } if (port->type == DEVLINK_PORT_TYPE_ETH) { if (hw_addr_len != ETH_ALEN) { NL_SET_ERR_MSG(extack, "Address must be 6 bytes for Ethernet device"); return -EINVAL; } if (!is_unicast_ether_addr(hw_addr)) { NL_SET_ERR_MSG(extack, "Non-unicast hardware address unsupported"); return -EINVAL; } } return port->ops->port_fn_hw_addr_set(port, hw_addr, hw_addr_len, extack); } static int devlink_port_fn_state_set(struct devlink_port *port, const struct nlattr *attr, struct netlink_ext_ack *extack) { enum devlink_port_fn_state state; state = nla_get_u8(attr); return port->ops->port_fn_state_set(port, state, extack); } static int devlink_port_function_validate(struct devlink_port *devlink_port, struct nlattr **tb, struct netlink_ext_ack *extack) { const struct devlink_port_ops *ops = devlink_port->ops; struct nlattr *attr; if (tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] && !ops->port_fn_hw_addr_set) { NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR], "Port doesn't support function attributes"); return -EOPNOTSUPP; } if (tb[DEVLINK_PORT_FN_ATTR_STATE] && !ops->port_fn_state_set) { NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FN_ATTR_STATE], "Function does not support state setting"); return -EOPNOTSUPP; } attr = tb[DEVLINK_PORT_FN_ATTR_CAPS]; if (attr) { struct nla_bitfield32 caps; caps = nla_get_bitfield32(attr); if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE && !ops->port_fn_roce_set) { NL_SET_ERR_MSG_ATTR(extack, attr, "Port doesn't support RoCE function attribute"); return -EOPNOTSUPP; } if (caps.selector & DEVLINK_PORT_FN_CAP_MIGRATABLE) { if (!ops->port_fn_migratable_set) { NL_SET_ERR_MSG_ATTR(extack, attr, "Port doesn't support migratable function attribute"); return -EOPNOTSUPP; } if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) { NL_SET_ERR_MSG_ATTR(extack, attr, "migratable function attribute supported for VFs only"); return -EOPNOTSUPP; } } if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO) { if (!ops->port_fn_ipsec_crypto_set) { NL_SET_ERR_MSG_ATTR(extack, attr, "Port doesn't support ipsec_crypto function attribute"); return -EOPNOTSUPP; } if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) { NL_SET_ERR_MSG_ATTR(extack, attr, "ipsec_crypto function attribute supported for VFs only"); return -EOPNOTSUPP; } } if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_PACKET) { if (!ops->port_fn_ipsec_packet_set) { NL_SET_ERR_MSG_ATTR(extack, attr, "Port doesn't support ipsec_packet function attribute"); return -EOPNOTSUPP; } if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) { NL_SET_ERR_MSG_ATTR(extack, attr, "ipsec_packet function attribute supported for VFs only"); return -EOPNOTSUPP; } } } return 0; } static int devlink_port_function_set(struct devlink_port *port, const struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1]; int err; err = nla_parse_nested(tb, DEVLINK_PORT_FUNCTION_ATTR_MAX, attr, devlink_function_nl_policy, extack); if (err < 0) { NL_SET_ERR_MSG(extack, "Fail to parse port function attributes"); return err; } err = devlink_port_function_validate(port, tb, extack); if (err) return err; attr = tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR]; if (attr) { err = devlink_port_function_hw_addr_set(port, attr, extack); if (err) return err; } attr = tb[DEVLINK_PORT_FN_ATTR_CAPS]; if (attr) { err = devlink_port_fn_caps_set(port, attr, extack); if (err) return err; } /* Keep this as the last function attribute set, so that when * multiple port function attributes are set along with state, * Those can be applied first before activating the state. */ attr = tb[DEVLINK_PORT_FN_ATTR_STATE]; if (attr) err = devlink_port_fn_state_set(port, attr, extack); if (!err) devlink_port_notify(port, DEVLINK_CMD_PORT_NEW); return err; } int devlink_nl_port_set_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; int err; if (info->attrs[DEVLINK_ATTR_PORT_TYPE]) { enum devlink_port_type port_type; port_type = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_TYPE]); err = devlink_port_type_set(devlink_port, port_type); if (err) return err; } if (info->attrs[DEVLINK_ATTR_PORT_FUNCTION]) { struct nlattr *attr = info->attrs[DEVLINK_ATTR_PORT_FUNCTION]; struct netlink_ext_ack *extack = info->extack; err = devlink_port_function_set(devlink_port, attr, extack); if (err) return err; } return 0; } int devlink_nl_port_split_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink *devlink = info->user_ptr[0]; u32 count; if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PORT_SPLIT_COUNT)) return -EINVAL; if (!devlink_port->ops->port_split) return -EOPNOTSUPP; count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]); if (!devlink_port->attrs.splittable) { /* Split ports cannot be split. */ if (devlink_port->attrs.split) NL_SET_ERR_MSG(info->extack, "Port cannot be split further"); else NL_SET_ERR_MSG(info->extack, "Port cannot be split"); return -EINVAL; } if (count < 2 || !is_power_of_2(count) || count > devlink_port->attrs.lanes) { NL_SET_ERR_MSG(info->extack, "Invalid split count"); return -EINVAL; } return devlink_port->ops->port_split(devlink, devlink_port, count, info->extack); } int devlink_nl_port_unsplit_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink *devlink = info->user_ptr[0]; if (!devlink_port->ops->port_unsplit) return -EOPNOTSUPP; return devlink_port->ops->port_unsplit(devlink, devlink_port, info->extack); } int devlink_nl_port_new_doit(struct sk_buff *skb, struct genl_info *info) { struct netlink_ext_ack *extack = info->extack; struct devlink_port_new_attrs new_attrs = {}; struct devlink *devlink = info->user_ptr[0]; struct devlink_port *devlink_port; struct sk_buff *msg; int err; if (!devlink->ops->port_new) return -EOPNOTSUPP; if (!info->attrs[DEVLINK_ATTR_PORT_FLAVOUR] || !info->attrs[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]) { NL_SET_ERR_MSG(extack, "Port flavour or PCI PF are not specified"); return -EINVAL; } new_attrs.flavour = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_FLAVOUR]); new_attrs.pfnum = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]); if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { /* Port index of the new port being created by driver. */ new_attrs.port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); new_attrs.port_index_valid = true; } if (info->attrs[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER]) { new_attrs.controller = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER]); new_attrs.controller_valid = true; } if (new_attrs.flavour == DEVLINK_PORT_FLAVOUR_PCI_SF && info->attrs[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]) { new_attrs.sfnum = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]); new_attrs.sfnum_valid = true; } err = devlink->ops->port_new(devlink, &new_attrs, extack, &devlink_port); if (err) return err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { err = -ENOMEM; goto err_out_port_del; } err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_NEW, info->snd_portid, info->snd_seq, 0, NULL); if (WARN_ON_ONCE(err)) goto err_out_msg_free; err = genlmsg_reply(msg, info); if (err) goto err_out_port_del; return 0; err_out_msg_free: nlmsg_free(msg); err_out_port_del: devlink_port->ops->port_del(devlink, devlink_port, NULL); return err; } int devlink_nl_port_del_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; struct netlink_ext_ack *extack = info->extack; struct devlink *devlink = info->user_ptr[0]; if (!devlink_port->ops->port_del) return -EOPNOTSUPP; return devlink_port->ops->port_del(devlink, devlink_port, extack); } static void devlink_port_type_warn(struct work_struct *work) { struct devlink_port *port = container_of(to_delayed_work(work), struct devlink_port, type_warn_dw); dev_warn(port->devlink->dev, "Type was not set for devlink port."); } static bool devlink_port_type_should_warn(struct devlink_port *devlink_port) { /* Ignore CPU and DSA flavours. */ return devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_CPU && devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA && devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_UNUSED; } #define DEVLINK_PORT_TYPE_WARN_TIMEOUT (HZ * 3600) static void devlink_port_type_warn_schedule(struct devlink_port *devlink_port) { if (!devlink_port_type_should_warn(devlink_port)) return; /* Schedule a work to WARN in case driver does not set port * type within timeout. */ schedule_delayed_work(&devlink_port->type_warn_dw, DEVLINK_PORT_TYPE_WARN_TIMEOUT); } static void devlink_port_type_warn_cancel(struct devlink_port *devlink_port) { if (!devlink_port_type_should_warn(devlink_port)) return; cancel_delayed_work_sync(&devlink_port->type_warn_dw); } /** * devlink_port_init() - Init devlink port * * @devlink: devlink * @devlink_port: devlink port * * Initialize essential stuff that is needed for functions * that may be called before devlink port registration. * Call to this function is optional and not needed * in case the driver does not use such functions. */ void devlink_port_init(struct devlink *devlink, struct devlink_port *devlink_port) { if (devlink_port->initialized) return; devlink_port->devlink = devlink; INIT_LIST_HEAD(&devlink_port->region_list); devlink_port->initialized = true; } EXPORT_SYMBOL_GPL(devlink_port_init); /** * devlink_port_fini() - Deinitialize devlink port * * @devlink_port: devlink port * * Deinitialize essential stuff that is in use for functions * that may be called after devlink port unregistration. * Call to this function is optional and not needed * in case the driver does not use such functions. */ void devlink_port_fini(struct devlink_port *devlink_port) { WARN_ON(!list_empty(&devlink_port->region_list)); } EXPORT_SYMBOL_GPL(devlink_port_fini); static const struct devlink_port_ops devlink_port_dummy_ops = {}; /** * devl_port_register_with_ops() - Register devlink port * * @devlink: devlink * @devlink_port: devlink port * @port_index: driver-specific numerical identifier of the port * @ops: port ops * * Register devlink port with provided port index. User can use * any indexing, even hw-related one. devlink_port structure * is convenient to be embedded inside user driver private structure. * Note that the caller should take care of zeroing the devlink_port * structure. */ int devl_port_register_with_ops(struct devlink *devlink, struct devlink_port *devlink_port, unsigned int port_index, const struct devlink_port_ops *ops) { int err; devl_assert_locked(devlink); ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); devlink_port_init(devlink, devlink_port); devlink_port->registered = true; devlink_port->index = port_index; devlink_port->ops = ops ? ops : &devlink_port_dummy_ops; spin_lock_init(&devlink_port->type_lock); INIT_LIST_HEAD(&devlink_port->reporter_list); err = xa_insert(&devlink->ports, port_index, devlink_port, GFP_KERNEL); if (err) { devlink_port->registered = false; return err; } INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn); devlink_port_type_warn_schedule(devlink_port); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); return 0; } EXPORT_SYMBOL_GPL(devl_port_register_with_ops); /** * devlink_port_register_with_ops - Register devlink port * * @devlink: devlink * @devlink_port: devlink port * @port_index: driver-specific numerical identifier of the port * @ops: port ops * * Register devlink port with provided port index. User can use * any indexing, even hw-related one. devlink_port structure * is convenient to be embedded inside user driver private structure. * Note that the caller should take care of zeroing the devlink_port * structure. * * Context: Takes and release devlink->lock <mutex>. */ int devlink_port_register_with_ops(struct devlink *devlink, struct devlink_port *devlink_port, unsigned int port_index, const struct devlink_port_ops *ops) { int err; devl_lock(devlink); err = devl_port_register_with_ops(devlink, devlink_port, port_index, ops); devl_unlock(devlink); return err; } EXPORT_SYMBOL_GPL(devlink_port_register_with_ops); /** * devl_port_unregister() - Unregister devlink port * * @devlink_port: devlink port */ void devl_port_unregister(struct devlink_port *devlink_port) { lockdep_assert_held(&devlink_port->devlink->lock); WARN_ON(devlink_port->type != DEVLINK_PORT_TYPE_NOTSET); devlink_port_type_warn_cancel(devlink_port); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); xa_erase(&devlink_port->devlink->ports, devlink_port->index); WARN_ON(!list_empty(&devlink_port->reporter_list)); devlink_port->registered = false; } EXPORT_SYMBOL_GPL(devl_port_unregister); /** * devlink_port_unregister - Unregister devlink port * * @devlink_port: devlink port * * Context: Takes and release devlink->lock <mutex>. */ void devlink_port_unregister(struct devlink_port *devlink_port) { struct devlink *devlink = devlink_port->devlink; devl_lock(devlink); devl_port_unregister(devlink_port); devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_port_unregister); static void devlink_port_type_netdev_checks(struct devlink_port *devlink_port, struct net_device *netdev) { const struct net_device_ops *ops = netdev->netdev_ops; /* If driver registers devlink port, it should set devlink port * attributes accordingly so the compat functions are called * and the original ops are not used. */ if (ops->ndo_get_phys_port_name) { /* Some drivers use the same set of ndos for netdevs * that have devlink_port registered and also for * those who don't. Make sure that ndo_get_phys_port_name * returns -EOPNOTSUPP here in case it is defined. * Warn if not. */ char name[IFNAMSIZ]; int err; err = ops->ndo_get_phys_port_name(netdev, name, sizeof(name)); WARN_ON(err != -EOPNOTSUPP); } if (ops->ndo_get_port_parent_id) { /* Some drivers use the same set of ndos for netdevs * that have devlink_port registered and also for * those who don't. Make sure that ndo_get_port_parent_id * returns -EOPNOTSUPP here in case it is defined. * Warn if not. */ struct netdev_phys_item_id ppid; int err; err = ops->ndo_get_port_parent_id(netdev, &ppid); WARN_ON(err != -EOPNOTSUPP); } } static void __devlink_port_type_set(struct devlink_port *devlink_port, enum devlink_port_type type, void *type_dev) { struct net_device *netdev = type_dev; ASSERT_DEVLINK_PORT_REGISTERED(devlink_port); if (type == DEVLINK_PORT_TYPE_NOTSET) { devlink_port_type_warn_schedule(devlink_port); } else { devlink_port_type_warn_cancel(devlink_port); if (type == DEVLINK_PORT_TYPE_ETH && netdev) devlink_port_type_netdev_checks(devlink_port, netdev); } spin_lock_bh(&devlink_port->type_lock); devlink_port->type = type; switch (type) { case DEVLINK_PORT_TYPE_ETH: devlink_port->type_eth.netdev = netdev; if (netdev) { ASSERT_RTNL(); devlink_port->type_eth.ifindex = netdev->ifindex; BUILD_BUG_ON(sizeof(devlink_port->type_eth.ifname) != sizeof(netdev->name)); strcpy(devlink_port->type_eth.ifname, netdev->name); } break; case DEVLINK_PORT_TYPE_IB: devlink_port->type_ib.ibdev = type_dev; break; default: break; } spin_unlock_bh(&devlink_port->type_lock); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); } /** * devlink_port_type_eth_set - Set port type to Ethernet * * @devlink_port: devlink port * * If driver is calling this, most likely it is doing something wrong. */ void devlink_port_type_eth_set(struct devlink_port *devlink_port) { dev_warn(devlink_port->devlink->dev, "devlink port type for port %d set to Ethernet without a software interface reference, device type not supported by the kernel?\n", devlink_port->index); __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, NULL); } EXPORT_SYMBOL_GPL(devlink_port_type_eth_set); /** * devlink_port_type_ib_set - Set port type to InfiniBand * * @devlink_port: devlink port * @ibdev: related IB device */ void devlink_port_type_ib_set(struct devlink_port *devlink_port, struct ib_device *ibdev) { __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_IB, ibdev); } EXPORT_SYMBOL_GPL(devlink_port_type_ib_set); /** * devlink_port_type_clear - Clear port type * * @devlink_port: devlink port * * If driver is calling this for clearing Ethernet type, most likely * it is doing something wrong. */ void devlink_port_type_clear(struct devlink_port *devlink_port) { if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) dev_warn(devlink_port->devlink->dev, "devlink port type for port %d cleared without a software interface reference, device type not supported by the kernel?\n", devlink_port->index); __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL); } EXPORT_SYMBOL_GPL(devlink_port_type_clear); int devlink_port_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); struct devlink_port *devlink_port = netdev->devlink_port; struct devlink *devlink; if (!devlink_port) return NOTIFY_OK; devlink = devlink_port->devlink; switch (event) { case NETDEV_POST_INIT: /* Set the type but not netdev pointer. It is going to be set * later on by NETDEV_REGISTER event. Happens once during * netdevice register */ __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, NULL); break; case NETDEV_REGISTER: case NETDEV_CHANGENAME: if (devlink_net(devlink) != dev_net(netdev)) return NOTIFY_OK; /* Set the netdev on top of previously set type. Note this * event happens also during net namespace change so here * we take into account netdev pointer appearing in this * namespace. */ __devlink_port_type_set(devlink_port, devlink_port->type, netdev); break; case NETDEV_UNREGISTER: if (devlink_net(devlink) != dev_net(netdev)) return NOTIFY_OK; /* Clear netdev pointer, but not the type. This event happens * also during net namespace change so we need to clear * pointer to netdev that is going to another net namespace. */ __devlink_port_type_set(devlink_port, devlink_port->type, NULL); break; case NETDEV_PRE_UNINIT: /* Clear the type and the netdev pointer. Happens one during * netdevice unregister. */ __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL); break; } return NOTIFY_OK; } static int __devlink_port_attrs_set(struct devlink_port *devlink_port, enum devlink_port_flavour flavour) { struct devlink_port_attrs *attrs = &devlink_port->attrs; devlink_port->attrs_set = true; attrs->flavour = flavour; if (attrs->switch_id.id_len) { devlink_port->switch_port = true; if (WARN_ON(attrs->switch_id.id_len > MAX_PHYS_ITEM_ID_LEN)) attrs->switch_id.id_len = MAX_PHYS_ITEM_ID_LEN; } else { devlink_port->switch_port = false; } return 0; } /** * devlink_port_attrs_set - Set port attributes * * @devlink_port: devlink port * @attrs: devlink port attrs */ void devlink_port_attrs_set(struct devlink_port *devlink_port, struct devlink_port_attrs *attrs) { int ret; ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); devlink_port->attrs = *attrs; ret = __devlink_port_attrs_set(devlink_port, attrs->flavour); if (ret) return; WARN_ON(attrs->splittable && attrs->split); } EXPORT_SYMBOL_GPL(devlink_port_attrs_set); /** * devlink_port_attrs_pci_pf_set - Set PCI PF port attributes * * @devlink_port: devlink port * @controller: associated controller number for the devlink port instance * @pf: associated PF for the devlink port instance * @external: indicates if the port is for an external controller */ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_PF); if (ret) return; attrs->pci_pf.controller = controller; attrs->pci_pf.pf = pf; attrs->pci_pf.external = external; } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set); /** * devlink_port_attrs_pci_vf_set - Set PCI VF port attributes * * @devlink_port: devlink port * @controller: associated controller number for the devlink port instance * @pf: associated PF for the devlink port instance * @vf: associated VF of a PF for the devlink port instance * @external: indicates if the port is for an external controller */ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, u16 vf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_VF); if (ret) return; attrs->pci_vf.controller = controller; attrs->pci_vf.pf = pf; attrs->pci_vf.vf = vf; attrs->pci_vf.external = external; } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set); /** * devlink_port_attrs_pci_sf_set - Set PCI SF port attributes * * @devlink_port: devlink port * @controller: associated controller number for the devlink port instance * @pf: associated PF for the devlink port instance * @sf: associated SF of a PF for the devlink port instance * @external: indicates if the port is for an external controller */ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, u32 sf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_SF); if (ret) return; attrs->pci_sf.controller = controller; attrs->pci_sf.pf = pf; attrs->pci_sf.sf = sf; attrs->pci_sf.external = external; } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set); static void devlink_port_rel_notify_cb(struct devlink *devlink, u32 port_index) { struct devlink_port *devlink_port; devlink_port = devlink_port_get_by_index(devlink, port_index); if (!devlink_port) return; devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); } static void devlink_port_rel_cleanup_cb(struct devlink *devlink, u32 port_index, u32 rel_index) { struct devlink_port *devlink_port; devlink_port = devlink_port_get_by_index(devlink, port_index); if (devlink_port && devlink_port->rel_index == rel_index) devlink_port->rel_index = 0; } /** * devl_port_fn_devlink_set - Attach peer devlink * instance to port function. * @devlink_port: devlink port * @fn_devlink: devlink instance to attach */ int devl_port_fn_devlink_set(struct devlink_port *devlink_port, struct devlink *fn_devlink) { ASSERT_DEVLINK_PORT_REGISTERED(devlink_port); if (WARN_ON(devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_SF || devlink_port->attrs.pci_sf.external)) return -EINVAL; return devlink_rel_nested_in_add(&devlink_port->rel_index, devlink_port->devlink->index, devlink_port->index, devlink_port_rel_notify_cb, devlink_port_rel_cleanup_cb, fn_devlink); } EXPORT_SYMBOL_GPL(devl_port_fn_devlink_set); /** * devlink_port_linecard_set - Link port with a linecard * * @devlink_port: devlink port * @linecard: devlink linecard */ void devlink_port_linecard_set(struct devlink_port *devlink_port, struct devlink_linecard *linecard) { ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); devlink_port->linecard = linecard; } EXPORT_SYMBOL_GPL(devlink_port_linecard_set); static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, char *name, size_t len) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int n = 0; if (!devlink_port->attrs_set) return -EOPNOTSUPP; switch (attrs->flavour) { case DEVLINK_PORT_FLAVOUR_PHYSICAL: if (devlink_port->linecard) n = snprintf(name, len, "l%u", devlink_linecard_index(devlink_port->linecard)); if (n < len) n += snprintf(name + n, len - n, "p%u", attrs->phys.port_number); if (n < len && attrs->split) n += snprintf(name + n, len - n, "s%u", attrs->phys.split_subport_number); break; case DEVLINK_PORT_FLAVOUR_CPU: case DEVLINK_PORT_FLAVOUR_DSA: case DEVLINK_PORT_FLAVOUR_UNUSED: /* As CPU and DSA ports do not have a netdevice associated * case should not ever happen. */ WARN_ON(1); return -EINVAL; case DEVLINK_PORT_FLAVOUR_PCI_PF: if (attrs->pci_pf.external) { n = snprintf(name, len, "c%u", attrs->pci_pf.controller); if (n >= len) return -EINVAL; len -= n; name += n; } n = snprintf(name, len, "pf%u", attrs->pci_pf.pf); break; case DEVLINK_PORT_FLAVOUR_PCI_VF: if (attrs->pci_vf.external) { n = snprintf(name, len, "c%u", attrs->pci_vf.controller); if (n >= len) return -EINVAL; len -= n; name += n; } n = snprintf(name, len, "pf%uvf%u", attrs->pci_vf.pf, attrs->pci_vf.vf); break; case DEVLINK_PORT_FLAVOUR_PCI_SF: if (attrs->pci_sf.external) { n = snprintf(name, len, "c%u", attrs->pci_sf.controller); if (n >= len) return -EINVAL; len -= n; name += n; } n = snprintf(name, len, "pf%usf%u", attrs->pci_sf.pf, attrs->pci_sf.sf); break; case DEVLINK_PORT_FLAVOUR_VIRTUAL: return -EOPNOTSUPP; } if (n >= len) return -EINVAL; return 0; } int devlink_compat_phys_port_name_get(struct net_device *dev, char *name, size_t len) { struct devlink_port *devlink_port; /* RTNL mutex is held here which ensures that devlink_port * instance cannot disappear in the middle. No need to take * any devlink lock as only permanent values are accessed. */ ASSERT_RTNL(); devlink_port = dev->devlink_port; if (!devlink_port) return -EOPNOTSUPP; return __devlink_port_phys_port_name_get(devlink_port, name, len); } int devlink_compat_switch_id_get(struct net_device *dev, struct netdev_phys_item_id *ppid) { struct devlink_port *devlink_port; /* Caller must hold RTNL mutex or reference to dev, which ensures that * devlink_port instance cannot disappear in the middle. No need to take * any devlink lock as only permanent values are accessed. */ devlink_port = dev->devlink_port; if (!devlink_port || !devlink_port->switch_port) return -EOPNOTSUPP; memcpy(ppid, &devlink_port->attrs.switch_id, sizeof(*ppid)); return 0; }
335 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_DAX_H #define _LINUX_DAX_H #include <linux/fs.h> #include <linux/mm.h> #include <linux/radix-tree.h> typedef unsigned long dax_entry_t; struct dax_device; struct gendisk; struct iomap_ops; struct iomap_iter; struct iomap; enum dax_access_mode { DAX_ACCESS, DAX_RECOVERY_WRITE, }; struct dax_operations { /* * direct_access: translate a device-relative * logical-page-offset into an absolute physical pfn. Return the * number of pages available for DAX at that pfn. */ long (*direct_access)(struct dax_device *, pgoff_t, long, enum dax_access_mode, void **, pfn_t *); /* * Validate whether this device is usable as an fsdax backing * device. */ bool (*dax_supported)(struct dax_device *, struct block_device *, int, sector_t, sector_t); /* zero_page_range: required operation. Zero page range */ int (*zero_page_range)(struct dax_device *, pgoff_t, size_t); /* * recovery_write: recover a poisoned range by DAX device driver * capable of clearing poison. */ size_t (*recovery_write)(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *iter); }; struct dax_holder_operations { /* * notify_failure - notify memory failure into inner holder device * @dax_dev: the dax device which contains the holder * @offset: offset on this dax device where memory failure occurs * @len: length of this memory failure event * @flags: action flags for memory failure handler */ int (*notify_failure)(struct dax_device *dax_dev, u64 offset, u64 len, int mf_flags); }; #if IS_ENABLED(CONFIG_DAX) struct dax_device *alloc_dax(void *private, const struct dax_operations *ops); void *dax_holder(struct dax_device *dax_dev); void put_dax(struct dax_device *dax_dev); void kill_dax(struct dax_device *dax_dev); void dax_write_cache(struct dax_device *dax_dev, bool wc); bool dax_write_cache_enabled(struct dax_device *dax_dev); bool dax_synchronous(struct dax_device *dax_dev); void set_dax_synchronous(struct dax_device *dax_dev); size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); /* * Check if given mapping is supported by the file / underlying device. */ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma, struct dax_device *dax_dev) { if (!(vma->vm_flags & VM_SYNC)) return true; if (!IS_DAX(file_inode(vma->vm_file))) return false; return dax_synchronous(dax_dev); } #else static inline void *dax_holder(struct dax_device *dax_dev) { return NULL; } static inline struct dax_device *alloc_dax(void *private, const struct dax_operations *ops) { return ERR_PTR(-EOPNOTSUPP); } static inline void put_dax(struct dax_device *dax_dev) { } static inline void kill_dax(struct dax_device *dax_dev) { } static inline void dax_write_cache(struct dax_device *dax_dev, bool wc) { } static inline bool dax_write_cache_enabled(struct dax_device *dax_dev) { return false; } static inline bool dax_synchronous(struct dax_device *dax_dev) { return true; } static inline void set_dax_synchronous(struct dax_device *dax_dev) { } static inline bool daxdev_mapping_supported(struct vm_area_struct *vma, struct dax_device *dax_dev) { return !(vma->vm_flags & VM_SYNC); } static inline size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) { return 0; } #endif void set_dax_nocache(struct dax_device *dax_dev); void set_dax_nomc(struct dax_device *dax_dev); struct writeback_control; #if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX) int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk); void dax_remove_host(struct gendisk *disk); struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off, void *holder, const struct dax_holder_operations *ops); void fs_put_dax(struct dax_device *dax_dev, void *holder); #else static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk) { return 0; } static inline void dax_remove_host(struct gendisk *disk) { } static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off, void *holder, const struct dax_holder_operations *ops) { return NULL; } static inline void fs_put_dax(struct dax_device *dax_dev, void *holder) { } #endif /* CONFIG_BLOCK && CONFIG_FS_DAX */ #if IS_ENABLED(CONFIG_FS_DAX) int dax_writeback_mapping_range(struct address_space *mapping, struct dax_device *dax_dev, struct writeback_control *wbc); struct page *dax_layout_busy_page(struct address_space *mapping); struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end); dax_entry_t dax_lock_folio(struct folio *folio); void dax_unlock_folio(struct folio *folio, dax_entry_t cookie); dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, unsigned long index, struct page **page); void dax_unlock_mapping_entry(struct address_space *mapping, unsigned long index, dax_entry_t cookie); #else static inline struct page *dax_layout_busy_page(struct address_space *mapping) { return NULL; } static inline struct page *dax_layout_busy_page_range(struct address_space *mapping, pgoff_t start, pgoff_t nr_pages) { return NULL; } static inline int dax_writeback_mapping_range(struct address_space *mapping, struct dax_device *dax_dev, struct writeback_control *wbc) { return -EOPNOTSUPP; } static inline dax_entry_t dax_lock_folio(struct folio *folio) { if (IS_DAX(folio->mapping->host)) return ~0UL; return 0; } static inline void dax_unlock_folio(struct folio *folio, dax_entry_t cookie) { } static inline dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, unsigned long index, struct page **page) { return 0; } static inline void dax_unlock_mapping_entry(struct address_space *mapping, unsigned long index, dax_entry_t cookie) { } #endif int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops); int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops); int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops); #if IS_ENABLED(CONFIG_DAX) int dax_read_lock(void); void dax_read_unlock(int id); #else static inline int dax_read_lock(void) { return 0; } static inline void dax_read_unlock(int id) { } #endif /* CONFIG_DAX */ bool dax_alive(struct dax_device *dax_dev); void *dax_get_private(struct dax_device *dax_dev); long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, enum dax_access_mode mode, void **kaddr, pfn_t *pfn); size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, size_t nr_pages); int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off, u64 len, int mf_flags); void dax_flush(struct dax_device *dax_dev, void *addr, size_t size); ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops); vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order, pfn_t *pfnp, int *errp, const struct iomap_ops *ops); vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order, pfn_t pfn); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, struct inode *dest, loff_t destoff, loff_t len, bool *is_same, const struct iomap_ops *ops); int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags, const struct iomap_ops *ops); static inline bool dax_mapping(struct address_space *mapping) { return mapping->host && IS_DAX(mapping->host); } /* * Due to dax's memory and block duo personalities, hwpoison reporting * takes into consideration which personality is presently visible. * When dax acts like a block device, such as in block IO, an encounter of * dax hwpoison is reported as -EIO. * When dax acts like memory, such as in page fault, a detection of hwpoison * is reported as -EHWPOISON which leads to VM_FAULT_HWPOISON. */ static inline int dax_mem2blk_err(int err) { return (err == -EHWPOISON) ? -EIO : err; } #ifdef CONFIG_DEV_DAX_HMEM_DEVICES void hmem_register_resource(int target_nid, struct resource *r); #else static inline void hmem_register_resource(int target_nid, struct resource *r) { } #endif typedef int (*walk_hmem_fn)(struct device *dev, int target_nid, const struct resource *res); int walk_hmem_resources(struct device *dev, walk_hmem_fn fn); #endif
2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 // SPDX-License-Identifier: GPL-2.0-only /* * pcrypt - Parallel crypto wrapper. * * Copyright (C) 2009 secunet Security Networks AG * Copyright (C) 2009 Steffen Klassert <steffen.klassert@secunet.com> */ #include <crypto/algapi.h> #include <crypto/internal/aead.h> #include <linux/atomic.h> #include <linux/err.h> #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/kobject.h> #include <linux/cpu.h> #include <crypto/pcrypt.h> static struct padata_instance *pencrypt; static struct padata_instance *pdecrypt; static struct kset *pcrypt_kset; struct pcrypt_instance_ctx { struct crypto_aead_spawn spawn; struct padata_shell *psenc; struct padata_shell *psdec; atomic_t tfm_count; }; struct pcrypt_aead_ctx { struct crypto_aead *child; unsigned int cb_cpu; }; static inline struct pcrypt_instance_ctx *pcrypt_tfm_ictx( struct crypto_aead *tfm) { return aead_instance_ctx(aead_alg_instance(tfm)); } static int pcrypt_aead_setkey(struct crypto_aead *parent, const u8 *key, unsigned int keylen) { struct pcrypt_aead_ctx *ctx = crypto_aead_ctx(parent); return crypto_aead_setkey(ctx->child, key, keylen); } static int pcrypt_aead_setauthsize(struct crypto_aead *parent, unsigned int authsize) { struct pcrypt_aead_ctx *ctx = crypto_aead_ctx(parent); return crypto_aead_setauthsize(ctx->child, authsize); } static void pcrypt_aead_serial(struct padata_priv *padata) { struct pcrypt_request *preq = pcrypt_padata_request(padata); struct aead_request *req = pcrypt_request_ctx(preq); aead_request_complete(req->base.data, padata->info); } static void pcrypt_aead_done(void *data, int err) { struct aead_request *req = data; struct pcrypt_request *preq = aead_request_ctx(req); struct padata_priv *padata = pcrypt_request_padata(preq); padata->info = err; padata_do_serial(padata); } static void pcrypt_aead_enc(struct padata_priv *padata) { struct pcrypt_request *preq = pcrypt_padata_request(padata); struct aead_request *req = pcrypt_request_ctx(preq); int ret; ret = crypto_aead_encrypt(req); if (ret == -EINPROGRESS) return; padata->info = ret; padata_do_serial(padata); } static int pcrypt_aead_encrypt(struct aead_request *req) { int err; struct pcrypt_request *preq = aead_request_ctx(req); struct aead_request *creq = pcrypt_request_ctx(preq); struct padata_priv *padata = pcrypt_request_padata(preq); struct crypto_aead *aead = crypto_aead_reqtfm(req); struct pcrypt_aead_ctx *ctx = crypto_aead_ctx(aead); u32 flags = aead_request_flags(req); struct pcrypt_instance_ctx *ictx; ictx = pcrypt_tfm_ictx(aead); memset(padata, 0, sizeof(struct padata_priv)); padata->parallel = pcrypt_aead_enc; padata->serial = pcrypt_aead_serial; aead_request_set_tfm(creq, ctx->child); aead_request_set_callback(creq, flags & ~CRYPTO_TFM_REQ_MAY_SLEEP, pcrypt_aead_done, req); aead_request_set_crypt(creq, req->src, req->dst, req->cryptlen, req->iv); aead_request_set_ad(creq, req->assoclen); err = padata_do_parallel(ictx->psenc, padata, &ctx->cb_cpu); if (!err) return -EINPROGRESS; if (err == -EBUSY) return -EAGAIN; return err; } static void pcrypt_aead_dec(struct padata_priv *padata) { struct pcrypt_request *preq = pcrypt_padata_request(padata); struct aead_request *req = pcrypt_request_ctx(preq); int ret; ret = crypto_aead_decrypt(req); if (ret == -EINPROGRESS) return; padata->info = ret; padata_do_serial(padata); } static int pcrypt_aead_decrypt(struct aead_request *req) { int err; struct pcrypt_request *preq = aead_request_ctx(req); struct aead_request *creq = pcrypt_request_ctx(preq); struct padata_priv *padata = pcrypt_request_padata(preq); struct crypto_aead *aead = crypto_aead_reqtfm(req); struct pcrypt_aead_ctx *ctx = crypto_aead_ctx(aead); u32 flags = aead_request_flags(req); struct pcrypt_instance_ctx *ictx; ictx = pcrypt_tfm_ictx(aead); memset(padata, 0, sizeof(struct padata_priv)); padata->parallel = pcrypt_aead_dec; padata->serial = pcrypt_aead_serial; aead_request_set_tfm(creq, ctx->child); aead_request_set_callback(creq, flags & ~CRYPTO_TFM_REQ_MAY_SLEEP, pcrypt_aead_done, req); aead_request_set_crypt(creq, req->src, req->dst, req->cryptlen, req->iv); aead_request_set_ad(creq, req->assoclen); err = padata_do_parallel(ictx->psdec, padata, &ctx->cb_cpu); if (!err) return -EINPROGRESS; if (err == -EBUSY) return -EAGAIN; return err; } static int pcrypt_aead_init_tfm(struct crypto_aead *tfm) { int cpu, cpu_index; struct aead_instance *inst = aead_alg_instance(tfm); struct pcrypt_instance_ctx *ictx = aead_instance_ctx(inst); struct pcrypt_aead_ctx *ctx = crypto_aead_ctx(tfm); struct crypto_aead *cipher; cpu_index = (unsigned int)atomic_inc_return(&ictx->tfm_count) % cpumask_weight(cpu_online_mask); ctx->cb_cpu = cpumask_first(cpu_online_mask); for (cpu = 0; cpu < cpu_index; cpu++) ctx->cb_cpu = cpumask_next(ctx->cb_cpu, cpu_online_mask); cipher = crypto_spawn_aead(&ictx->spawn); if (IS_ERR(cipher)) return PTR_ERR(cipher); ctx->child = cipher; crypto_aead_set_reqsize(tfm, sizeof(struct pcrypt_request) + sizeof(struct aead_request) + crypto_aead_reqsize(cipher)); return 0; } static void pcrypt_aead_exit_tfm(struct crypto_aead *tfm) { struct pcrypt_aead_ctx *ctx = crypto_aead_ctx(tfm); crypto_free_aead(ctx->child); } static void pcrypt_free(struct aead_instance *inst) { struct pcrypt_instance_ctx *ctx = aead_instance_ctx(inst); crypto_drop_aead(&ctx->spawn); padata_free_shell(ctx->psdec); padata_free_shell(ctx->psenc); kfree(inst); } static int pcrypt_init_instance(struct crypto_instance *inst, struct crypto_alg *alg) { if (snprintf(inst->alg.cra_driver_name, CRYPTO_MAX_ALG_NAME, "pcrypt(%s)", alg->cra_driver_name) >= CRYPTO_MAX_ALG_NAME) return -ENAMETOOLONG; memcpy(inst->alg.cra_name, alg->cra_name, CRYPTO_MAX_ALG_NAME); inst->alg.cra_priority = alg->cra_priority + 100; inst->alg.cra_blocksize = alg->cra_blocksize; inst->alg.cra_alignmask = alg->cra_alignmask; return 0; } static int pcrypt_create_aead(struct crypto_template *tmpl, struct rtattr **tb, struct crypto_attr_type *algt) { struct pcrypt_instance_ctx *ctx; struct aead_instance *inst; struct aead_alg *alg; u32 mask = crypto_algt_inherited_mask(algt); int err; inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL); if (!inst) return -ENOMEM; err = -ENOMEM; ctx = aead_instance_ctx(inst); ctx->psenc = padata_alloc_shell(pencrypt); if (!ctx->psenc) goto err_free_inst; ctx->psdec = padata_alloc_shell(pdecrypt); if (!ctx->psdec) goto err_free_inst; err = crypto_grab_aead(&ctx->spawn, aead_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; alg = crypto_spawn_aead_alg(&ctx->spawn); err = pcrypt_init_instance(aead_crypto_instance(inst), &alg->base); if (err) goto err_free_inst; inst->alg.base.cra_flags |= CRYPTO_ALG_ASYNC; inst->alg.ivsize = crypto_aead_alg_ivsize(alg); inst->alg.maxauthsize = crypto_aead_alg_maxauthsize(alg); inst->alg.base.cra_ctxsize = sizeof(struct pcrypt_aead_ctx); inst->alg.init = pcrypt_aead_init_tfm; inst->alg.exit = pcrypt_aead_exit_tfm; inst->alg.setkey = pcrypt_aead_setkey; inst->alg.setauthsize = pcrypt_aead_setauthsize; inst->alg.encrypt = pcrypt_aead_encrypt; inst->alg.decrypt = pcrypt_aead_decrypt; inst->free = pcrypt_free; err = aead_register_instance(tmpl, inst); if (err) { err_free_inst: pcrypt_free(inst); } return err; } static int pcrypt_create(struct crypto_template *tmpl, struct rtattr **tb) { struct crypto_attr_type *algt; algt = crypto_get_attr_type(tb); if (IS_ERR(algt)) return PTR_ERR(algt); switch (algt->type & algt->mask & CRYPTO_ALG_TYPE_MASK) { case CRYPTO_ALG_TYPE_AEAD: return pcrypt_create_aead(tmpl, tb, algt); } return -EINVAL; } static int pcrypt_sysfs_add(struct padata_instance *pinst, const char *name) { int ret; pinst->kobj.kset = pcrypt_kset; ret = kobject_add(&pinst->kobj, NULL, "%s", name); if (!ret) kobject_uevent(&pinst->kobj, KOBJ_ADD); return ret; } static int pcrypt_init_padata(struct padata_instance **pinst, const char *name) { int ret = -ENOMEM; *pinst = padata_alloc(name); if (!*pinst) return ret; ret = pcrypt_sysfs_add(*pinst, name); if (ret) padata_free(*pinst); return ret; } static struct crypto_template pcrypt_tmpl = { .name = "pcrypt", .create = pcrypt_create, .module = THIS_MODULE, }; static int __init pcrypt_init(void) { int err = -ENOMEM; pcrypt_kset = kset_create_and_add("pcrypt", NULL, kernel_kobj); if (!pcrypt_kset) goto err; err = pcrypt_init_padata(&pencrypt, "pencrypt"); if (err) goto err_unreg_kset; err = pcrypt_init_padata(&pdecrypt, "pdecrypt"); if (err) goto err_deinit_pencrypt; return crypto_register_template(&pcrypt_tmpl); err_deinit_pencrypt: padata_free(pencrypt); err_unreg_kset: kset_unregister(pcrypt_kset); err: return err; } static void __exit pcrypt_exit(void) { crypto_unregister_template(&pcrypt_tmpl); padata_free(pencrypt); padata_free(pdecrypt); kset_unregister(pcrypt_kset); } subsys_initcall(pcrypt_init); module_exit(pcrypt_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>"); MODULE_DESCRIPTION("Parallel crypto wrapper"); MODULE_ALIAS_CRYPTO("pcrypt");
145 5 5 5 5 16 9 5 14 15 15 15 15 15 16 16 16 582 581 15 245 232 16 584 118 5 116 245 559 553 9 245 242 5 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 // SPDX-License-Identifier: GPL-2.0 /* * Ldisc rw semaphore * * The ldisc semaphore is semantically a rw_semaphore but which enforces * an alternate policy, namely: * 1) Supports lock wait timeouts * 2) Write waiter has priority * 3) Downgrading is not supported * * Implementation notes: * 1) Upper half of semaphore count is a wait count (differs from rwsem * in that rwsem normalizes the upper half to the wait bias) * 2) Lacks overflow checking * * The generic counting was copied and modified from include/asm-generic/rwsem.h * by Paul Mackerras <paulus@samba.org>. * * The scheduling policy was copied and modified from lib/rwsem.c * Written by David Howells (dhowells@redhat.com). * * This implementation incorporates the write lock stealing work of * Michel Lespinasse <walken@google.com>. * * Copyright (C) 2013 Peter Hurley <peter@hurleysoftware.com> */ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/atomic.h> #include <linux/tty.h> #include <linux/sched.h> #include <linux/sched/debug.h> #include <linux/sched/task.h> #if BITS_PER_LONG == 64 # define LDSEM_ACTIVE_MASK 0xffffffffL #else # define LDSEM_ACTIVE_MASK 0x0000ffffL #endif #define LDSEM_UNLOCKED 0L #define LDSEM_ACTIVE_BIAS 1L #define LDSEM_WAIT_BIAS (-LDSEM_ACTIVE_MASK-1) #define LDSEM_READ_BIAS LDSEM_ACTIVE_BIAS #define LDSEM_WRITE_BIAS (LDSEM_WAIT_BIAS + LDSEM_ACTIVE_BIAS) struct ldsem_waiter { struct list_head list; struct task_struct *task; }; /* * Initialize an ldsem: */ void __init_ldsem(struct ld_semaphore *sem, const char *name, struct lock_class_key *key) { #ifdef CONFIG_DEBUG_LOCK_ALLOC /* * Make sure we are not reinitializing a held semaphore: */ debug_check_no_locks_freed((void *)sem, sizeof(*sem)); lockdep_init_map(&sem->dep_map, name, key, 0); #endif atomic_long_set(&sem->count, LDSEM_UNLOCKED); sem->wait_readers = 0; raw_spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->read_wait); INIT_LIST_HEAD(&sem->write_wait); } static void __ldsem_wake_readers(struct ld_semaphore *sem) { struct ldsem_waiter *waiter, *next; struct task_struct *tsk; long adjust, count; /* * Try to grant read locks to all readers on the read wait list. * Note the 'active part' of the count is incremented by * the number of readers before waking any processes up. */ adjust = sem->wait_readers * (LDSEM_ACTIVE_BIAS - LDSEM_WAIT_BIAS); count = atomic_long_add_return(adjust, &sem->count); do { if (count > 0) break; if (atomic_long_try_cmpxchg(&sem->count, &count, count - adjust)) return; } while (1); list_for_each_entry_safe(waiter, next, &sem->read_wait, list) { tsk = waiter->task; smp_store_release(&waiter->task, NULL); wake_up_process(tsk); put_task_struct(tsk); } INIT_LIST_HEAD(&sem->read_wait); sem->wait_readers = 0; } static inline int writer_trylock(struct ld_semaphore *sem) { /* * Only wake this writer if the active part of the count can be * transitioned from 0 -> 1 */ long count = atomic_long_add_return(LDSEM_ACTIVE_BIAS, &sem->count); do { if ((count & LDSEM_ACTIVE_MASK) == LDSEM_ACTIVE_BIAS) return 1; if (atomic_long_try_cmpxchg(&sem->count, &count, count - LDSEM_ACTIVE_BIAS)) return 0; } while (1); } static void __ldsem_wake_writer(struct ld_semaphore *sem) { struct ldsem_waiter *waiter; waiter = list_entry(sem->write_wait.next, struct ldsem_waiter, list); wake_up_process(waiter->task); } /* * handle the lock release when processes blocked on it that can now run * - if we come here from up_xxxx(), then: * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed) * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so) * - the spinlock must be held by the caller * - woken process blocks are discarded from the list after having task zeroed */ static void __ldsem_wake(struct ld_semaphore *sem) { if (!list_empty(&sem->write_wait)) __ldsem_wake_writer(sem); else if (!list_empty(&sem->read_wait)) __ldsem_wake_readers(sem); } static void ldsem_wake(struct ld_semaphore *sem) { unsigned long flags; raw_spin_lock_irqsave(&sem->wait_lock, flags); __ldsem_wake(sem); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); } /* * wait for the read lock to be granted */ static struct ld_semaphore __sched * down_read_failed(struct ld_semaphore *sem, long count, long timeout) { struct ldsem_waiter waiter; long adjust = -LDSEM_ACTIVE_BIAS + LDSEM_WAIT_BIAS; /* set up my own style of waitqueue */ raw_spin_lock_irq(&sem->wait_lock); /* * Try to reverse the lock attempt but if the count has changed * so that reversing fails, check if there are no waiters, * and early-out if not */ do { if (atomic_long_try_cmpxchg(&sem->count, &count, count + adjust)) { count += adjust; break; } if (count > 0) { raw_spin_unlock_irq(&sem->wait_lock); return sem; } } while (1); list_add_tail(&waiter.list, &sem->read_wait); sem->wait_readers++; waiter.task = current; get_task_struct(current); /* if there are no active locks, wake the new lock owner(s) */ if ((count & LDSEM_ACTIVE_MASK) == 0) __ldsem_wake(sem); raw_spin_unlock_irq(&sem->wait_lock); /* wait to be given the lock */ for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (!smp_load_acquire(&waiter.task)) break; if (!timeout) break; timeout = schedule_timeout(timeout); } __set_current_state(TASK_RUNNING); if (!timeout) { /* * Lock timed out but check if this task was just * granted lock ownership - if so, pretend there * was no timeout; otherwise, cleanup lock wait. */ raw_spin_lock_irq(&sem->wait_lock); if (waiter.task) { atomic_long_add_return(-LDSEM_WAIT_BIAS, &sem->count); sem->wait_readers--; list_del(&waiter.list); raw_spin_unlock_irq(&sem->wait_lock); put_task_struct(waiter.task); return NULL; } raw_spin_unlock_irq(&sem->wait_lock); } return sem; } /* * wait for the write lock to be granted */ static struct ld_semaphore __sched * down_write_failed(struct ld_semaphore *sem, long count, long timeout) { struct ldsem_waiter waiter; long adjust = -LDSEM_ACTIVE_BIAS; int locked = 0; /* set up my own style of waitqueue */ raw_spin_lock_irq(&sem->wait_lock); /* * Try to reverse the lock attempt but if the count has changed * so that reversing fails, check if the lock is now owned, * and early-out if so. */ do { if (atomic_long_try_cmpxchg(&sem->count, &count, count + adjust)) break; if ((count & LDSEM_ACTIVE_MASK) == LDSEM_ACTIVE_BIAS) { raw_spin_unlock_irq(&sem->wait_lock); return sem; } } while (1); list_add_tail(&waiter.list, &sem->write_wait); waiter.task = current; set_current_state(TASK_UNINTERRUPTIBLE); for (;;) { if (!timeout) break; raw_spin_unlock_irq(&sem->wait_lock); timeout = schedule_timeout(timeout); raw_spin_lock_irq(&sem->wait_lock); set_current_state(TASK_UNINTERRUPTIBLE); locked = writer_trylock(sem); if (locked) break; } if (!locked) atomic_long_add_return(-LDSEM_WAIT_BIAS, &sem->count); list_del(&waiter.list); /* * In case of timeout, wake up every reader who gave the right of way * to writer. Prevent separation readers into two groups: * one that helds semaphore and another that sleeps. * (in case of no contention with a writer) */