| 2484 7562 3209 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM maple_tree #if !defined(_TRACE_MM_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_MM_H #include <linux/tracepoint.h> struct ma_state; TRACE_EVENT(ma_op, TP_PROTO(const char *fn, struct ma_state *mas), TP_ARGS(fn, mas), TP_STRUCT__entry( __field(const char *, fn) __field(unsigned long, min) __field(unsigned long, max) __field(unsigned long, index) __field(unsigned long, last) __field(void *, node) ), TP_fast_assign( __entry->fn = fn; __entry->min = mas->min; __entry->max = mas->max; __entry->index = mas->index; __entry->last = mas->last; __entry->node = mas->node; ), TP_printk("%s\tNode: %p (%lu %lu) range: %lu-%lu", __entry->fn, (void *) __entry->node, (unsigned long) __entry->min, (unsigned long) __entry->max, (unsigned long) __entry->index, (unsigned long) __entry->last ) ) TRACE_EVENT(ma_read, TP_PROTO(const char *fn, struct ma_state *mas), TP_ARGS(fn, mas), TP_STRUCT__entry( __field(const char *, fn) __field(unsigned long, min) __field(unsigned long, max) __field(unsigned long, index) __field(unsigned long, last) __field(void *, node) ), TP_fast_assign( __entry->fn = fn; __entry->min = mas->min; __entry->max = mas->max; __entry->index = mas->index; __entry->last = mas->last; __entry->node = mas->node; ), TP_printk("%s\tNode: %p (%lu %lu) range: %lu-%lu", __entry->fn, (void *) __entry->node, (unsigned long) __entry->min, (unsigned long) __entry->max, (unsigned long) __entry->index, (unsigned long) __entry->last ) ) TRACE_EVENT(ma_write, TP_PROTO(const char *fn, struct ma_state *mas, unsigned long piv, void *val), TP_ARGS(fn, mas, piv, val), TP_STRUCT__entry( __field(const char *, fn) __field(unsigned long, min) __field(unsigned long, max) __field(unsigned long, index) __field(unsigned long, last) __field(unsigned long, piv) __field(void *, val) __field(void *, node) ), TP_fast_assign( __entry->fn = fn; __entry->min = mas->min; __entry->max = mas->max; __entry->index = mas->index; __entry->last = mas->last; __entry->piv = piv; __entry->val = val; __entry->node = mas->node; ), TP_printk("%s\tNode %p (%lu %lu) range:%lu-%lu piv (%lu) val %p", __entry->fn, (void *) __entry->node, (unsigned long) __entry->min, (unsigned long) __entry->max, (unsigned long) __entry->index, (unsigned long) __entry->last, (unsigned long) __entry->piv, (void *) __entry->val ) ) #endif /* _TRACE_MM_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
| 440 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C)2003,2004 USAGI/WIDE Project * * Author: * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> */ #include <linux/types.h> #include <linux/timer.h> #include <linux/module.h> #include <linux/netfilter.h> #include <linux/in6.h> #include <linux/icmpv6.h> #include <linux/ipv6.h> #include <net/ipv6.h> #include <net/ip6_checksum.h> #include <linux/seq_file.h> #include <linux/netfilter_ipv6.h> #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_log.h> #include "nf_internals.h" static const unsigned int nf_ct_icmpv6_timeout = 30*HZ; bool icmpv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, struct nf_conntrack_tuple *tuple) { const struct icmp6hdr *hp; struct icmp6hdr _hdr; hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (hp == NULL) return false; tuple->dst.u.icmp.type = hp->icmp6_type; tuple->src.u.icmp.id = hp->icmp6_identifier; tuple->dst.u.icmp.code = hp->icmp6_code; return true; } /* Add 1; spaces filled with 0. */ static const u_int8_t invmap[] = { [ICMPV6_ECHO_REQUEST - 128] = ICMPV6_ECHO_REPLY + 1, [ICMPV6_ECHO_REPLY - 128] = ICMPV6_ECHO_REQUEST + 1, [ICMPV6_NI_QUERY - 128] = ICMPV6_NI_REPLY + 1, [ICMPV6_NI_REPLY - 128] = ICMPV6_NI_QUERY + 1 }; static const u_int8_t noct_valid_new[] = { [ICMPV6_MGM_QUERY - 130] = 1, [ICMPV6_MGM_REPORT - 130] = 1, [ICMPV6_MGM_REDUCTION - 130] = 1, [NDISC_ROUTER_SOLICITATION - 130] = 1, [NDISC_ROUTER_ADVERTISEMENT - 130] = 1, [NDISC_NEIGHBOUR_SOLICITATION - 130] = 1, [NDISC_NEIGHBOUR_ADVERTISEMENT - 130] = 1, [ICMPV6_MLD2_REPORT - 130] = 1, [ICMPV6_MRDISC_ADV - 130] = 1, [ICMPV6_MRDISC_SOL - 130] = 1 }; bool nf_conntrack_invert_icmpv6_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig) { int type = orig->dst.u.icmp.type - 128; if (type < 0 || type >= sizeof(invmap) || !invmap[type]) return false; tuple->src.u.icmp.id = orig->src.u.icmp.id; tuple->dst.u.icmp.type = invmap[type] - 1; tuple->dst.u.icmp.code = orig->dst.u.icmp.code; return true; } static unsigned int *icmpv6_get_timeouts(struct net *net) { return &nf_icmpv6_pernet(net)->timeout; } /* Returns verdict for packet, or -1 for invalid. */ int nf_conntrack_icmpv6_packet(struct nf_conn *ct, struct sk_buff *skb, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { unsigned int *timeout = nf_ct_timeout_lookup(ct); static const u8 valid_new[] = { [ICMPV6_ECHO_REQUEST - 128] = 1, [ICMPV6_NI_QUERY - 128] = 1 }; if (state->pf != NFPROTO_IPV6) return -NF_ACCEPT; if (!nf_ct_is_confirmed(ct)) { int type = ct->tuplehash[0].tuple.dst.u.icmp.type - 128; if (type < 0 || type >= sizeof(valid_new) || !valid_new[type]) { /* Can't create a new ICMPv6 `conn' with this. */ pr_debug("icmpv6: can't create new conn with type %u\n", type + 128); nf_ct_dump_tuple_ipv6(&ct->tuplehash[0].tuple); return -NF_ACCEPT; } } if (!timeout) timeout = icmpv6_get_timeouts(nf_ct_net(ct)); /* Do not immediately delete the connection after the first successful reply to avoid excessive conntrackd traffic and also to handle correctly ICMP echo reply duplicates. */ nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); return NF_ACCEPT; } static void icmpv6_error_log(const struct sk_buff *skb, const struct nf_hook_state *state, const char *msg) { nf_l4proto_log_invalid(skb, state, IPPROTO_ICMPV6, "%s", msg); } static noinline_for_stack int nf_conntrack_icmpv6_redirect(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state) { u8 hl = ipv6_hdr(skb)->hop_limit; union nf_inet_addr outer_daddr; union { struct nd_opt_hdr nd_opt; struct rd_msg rd_msg; } tmp; const struct nd_opt_hdr *nd_opt; const struct rd_msg *rd_msg; rd_msg = skb_header_pointer(skb, dataoff, sizeof(*rd_msg), &tmp.rd_msg); if (!rd_msg) { icmpv6_error_log(skb, state, "short redirect"); return -NF_ACCEPT; } if (rd_msg->icmph.icmp6_code != 0) return NF_ACCEPT; if (hl != 255 || !(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { icmpv6_error_log(skb, state, "invalid saddr or hoplimit for redirect"); return -NF_ACCEPT; } dataoff += sizeof(*rd_msg); /* warning: rd_msg no longer usable after this call */ nd_opt = skb_header_pointer(skb, dataoff, sizeof(*nd_opt), &tmp.nd_opt); if (!nd_opt || nd_opt->nd_opt_len == 0) { icmpv6_error_log(skb, state, "redirect without options"); return -NF_ACCEPT; } /* We could call ndisc_parse_options(), but it would need * skb_linearize() and a bit more work. */ if (nd_opt->nd_opt_type != ND_OPT_REDIRECT_HDR) return NF_ACCEPT; memcpy(&outer_daddr.ip6, &ipv6_hdr(skb)->daddr, sizeof(outer_daddr.ip6)); dataoff += 8; return nf_conntrack_inet_error(tmpl, skb, dataoff, state, IPPROTO_ICMPV6, &outer_daddr); } int nf_conntrack_icmpv6_error(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state) { union nf_inet_addr outer_daddr; const struct icmp6hdr *icmp6h; struct icmp6hdr _ih; int type; icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih); if (icmp6h == NULL) { icmpv6_error_log(skb, state, "short packet"); return -NF_ACCEPT; } if (state->hook == NF_INET_PRE_ROUTING && state->net->ct.sysctl_checksum && nf_ip6_checksum(skb, state->hook, dataoff, IPPROTO_ICMPV6)) { icmpv6_error_log(skb, state, "ICMPv6 checksum failed"); return -NF_ACCEPT; } type = icmp6h->icmp6_type - 130; if (type >= 0 && type < sizeof(noct_valid_new) && noct_valid_new[type]) { nf_ct_set(skb, NULL, IP_CT_UNTRACKED); return NF_ACCEPT; } if (icmp6h->icmp6_type == NDISC_REDIRECT) return nf_conntrack_icmpv6_redirect(tmpl, skb, dataoff, state); /* is not error message ? */ if (icmp6h->icmp6_type >= 128) return NF_ACCEPT; memcpy(&outer_daddr.ip6, &ipv6_hdr(skb)->daddr, sizeof(outer_daddr.ip6)); dataoff += sizeof(*icmp6h); return nf_conntrack_inet_error(tmpl, skb, dataoff, state, IPPROTO_ICMPV6, &outer_daddr); } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> static int icmpv6_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *t) { if (nla_put_be16(skb, CTA_PROTO_ICMPV6_ID, t->src.u.icmp.id) || nla_put_u8(skb, CTA_PROTO_ICMPV6_TYPE, t->dst.u.icmp.type) || nla_put_u8(skb, CTA_PROTO_ICMPV6_CODE, t->dst.u.icmp.code)) goto nla_put_failure; return 0; nla_put_failure: return -1; } static const struct nla_policy icmpv6_nla_policy[CTA_PROTO_MAX+1] = { [CTA_PROTO_ICMPV6_TYPE] = { .type = NLA_U8 }, [CTA_PROTO_ICMPV6_CODE] = { .type = NLA_U8 }, [CTA_PROTO_ICMPV6_ID] = { .type = NLA_U16 }, }; static int icmpv6_nlattr_to_tuple(struct nlattr *tb[], struct nf_conntrack_tuple *tuple, u_int32_t flags) { if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_TYPE)) { if (!tb[CTA_PROTO_ICMPV6_TYPE]) return -EINVAL; tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMPV6_TYPE]); if (tuple->dst.u.icmp.type < 128 || tuple->dst.u.icmp.type - 128 >= sizeof(invmap) || !invmap[tuple->dst.u.icmp.type - 128]) return -EINVAL; } if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_CODE)) { if (!tb[CTA_PROTO_ICMPV6_CODE]) return -EINVAL; tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMPV6_CODE]); } if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_ID)) { if (!tb[CTA_PROTO_ICMPV6_ID]) return -EINVAL; tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMPV6_ID]); } return 0; } static unsigned int icmpv6_nlattr_tuple_size(void) { static unsigned int size __read_mostly; if (!size) size = nla_policy_len(icmpv6_nla_policy, CTA_PROTO_MAX + 1); return size; } #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> static int icmpv6_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeout = data; struct nf_icmp_net *in = nf_icmpv6_pernet(net); if (!timeout) timeout = icmpv6_get_timeouts(net); if (tb[CTA_TIMEOUT_ICMPV6_TIMEOUT]) { *timeout = ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMPV6_TIMEOUT])) * HZ; } else { /* Set default ICMPv6 timeout. */ *timeout = in->timeout; } return 0; } static int icmpv6_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) { const unsigned int *timeout = data; if (nla_put_be32(skb, CTA_TIMEOUT_ICMPV6_TIMEOUT, htonl(*timeout / HZ))) goto nla_put_failure; return 0; nla_put_failure: return -ENOSPC; } static const struct nla_policy icmpv6_timeout_nla_policy[CTA_TIMEOUT_ICMPV6_MAX+1] = { [CTA_TIMEOUT_ICMPV6_TIMEOUT] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ void nf_conntrack_icmpv6_init_net(struct net *net) { struct nf_icmp_net *in = nf_icmpv6_pernet(net); in->timeout = nf_ct_icmpv6_timeout; } const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 = { .l4proto = IPPROTO_ICMPV6, .allow_clash = true, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = icmpv6_tuple_to_nlattr, .nlattr_tuple_size = icmpv6_nlattr_tuple_size, .nlattr_to_tuple = icmpv6_nlattr_to_tuple, .nla_policy = icmpv6_nla_policy, #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = icmpv6_timeout_nlattr_to_obj, .obj_to_nlattr = icmpv6_timeout_obj_to_nlattr, .nlattr_max = CTA_TIMEOUT_ICMP_MAX, .obj_size = sizeof(unsigned int), .nla_policy = icmpv6_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ }; |
| 2 1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 | // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ #include <linux/rculist.h> #include <linux/list.h> #include <linux/hash.h> #include <linux/types.h> #include <linux/spinlock.h> #include <linux/bpf.h> #include <linux/btf_ids.h> #include <linux/bpf_local_storage.h> #include <net/sock.h> #include <uapi/linux/sock_diag.h> #include <uapi/linux/btf.h> #include <linux/rcupdate.h> #include <linux/rcupdate_trace.h> #include <linux/rcupdate_wait.h> #define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE) static struct bpf_local_storage_map_bucket * select_bucket(struct bpf_local_storage_map *smap, struct bpf_local_storage *local_storage) { return &smap->buckets[hash_ptr(local_storage, smap->bucket_log)]; } static int mem_charge(struct bpf_local_storage_map *smap, void *owner, u32 size) { struct bpf_map *map = &smap->map; if (!map->ops->map_local_storage_charge) return 0; return map->ops->map_local_storage_charge(smap, owner, size); } static void mem_uncharge(struct bpf_local_storage_map *smap, void *owner, u32 size) { struct bpf_map *map = &smap->map; if (map->ops->map_local_storage_uncharge) map->ops->map_local_storage_uncharge(smap, owner, size); } static struct bpf_local_storage __rcu ** owner_storage(struct bpf_local_storage_map *smap, void *owner) { struct bpf_map *map = &smap->map; return map->ops->map_owner_storage_ptr(owner); } static bool selem_linked_to_storage_lockless(const struct bpf_local_storage_elem *selem) { return !hlist_unhashed_lockless(&selem->snode); } static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem) { return !hlist_unhashed(&selem->snode); } static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem) { return !hlist_unhashed(&selem->map_node); } struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value, bool swap_uptrs) { struct bpf_local_storage_elem *selem; if (mem_charge(smap, owner, smap->elem_size)) return NULL; selem = bpf_map_kmalloc_nolock(&smap->map, smap->elem_size, __GFP_ZERO, NUMA_NO_NODE); if (selem) { RCU_INIT_POINTER(SDATA(selem)->smap, smap); atomic_set(&selem->state, 0); if (value) { /* No need to call check_and_init_map_value as memory is zero init */ copy_map_value(&smap->map, SDATA(selem)->data, value); if (swap_uptrs) bpf_obj_swap_uptrs(smap->map.record, SDATA(selem)->data, value); } return selem; } mem_uncharge(smap, owner, smap->elem_size); return NULL; } static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) { struct bpf_local_storage *local_storage; /* * RCU Tasks Trace grace period implies RCU grace period, do * kfree() directly. */ local_storage = container_of(rcu, struct bpf_local_storage, rcu); kfree(local_storage); } static void bpf_local_storage_free(struct bpf_local_storage *local_storage, bool reuse_now) { if (!local_storage) return; if (reuse_now) { kfree_rcu(local_storage, rcu); return; } call_rcu_tasks_trace(&local_storage->rcu, bpf_local_storage_free_trace_rcu); } static void bpf_selem_free_trace_rcu(struct rcu_head *rcu) { struct bpf_local_storage_elem *selem; struct bpf_local_storage_map *smap; selem = container_of(rcu, struct bpf_local_storage_elem, rcu); /* The bpf_local_storage_map_free will wait for rcu_barrier */ smap = rcu_dereference_check(SDATA(selem)->smap, 1); if (smap) bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); /* * RCU Tasks Trace grace period implies RCU grace period, do * kfree() directly. */ kfree(selem); } void bpf_selem_free(struct bpf_local_storage_elem *selem, bool reuse_now) { struct bpf_local_storage_map *smap; smap = rcu_dereference_check(SDATA(selem)->smap, 1); if (reuse_now) { if (smap) bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); kfree_rcu(selem, rcu); return; } call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu); } static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now) { struct bpf_local_storage_elem *selem; struct hlist_node *n; /* The "_safe" iteration is needed. * The loop is not removing the selem from the list * but bpf_selem_free will use the selem->rcu_head * which is union-ized with the selem->free_node. */ hlist_for_each_entry_safe(selem, n, list, free_node) bpf_selem_free(selem, reuse_now); } static void bpf_selem_unlink_storage_nolock_misc(struct bpf_local_storage_elem *selem, struct bpf_local_storage_map *smap, struct bpf_local_storage *local_storage, bool free_local_storage, bool pin_owner) { void *owner = local_storage->owner; u32 uncharge = smap->elem_size; if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) == SDATA(selem)) RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); if (pin_owner && !refcount_inc_not_zero(&local_storage->owner_refcnt)) return; uncharge += free_local_storage ? sizeof(*local_storage) : 0; mem_uncharge(smap, local_storage->owner, uncharge); local_storage->mem_charge -= uncharge; if (free_local_storage) { local_storage->owner = NULL; /* After this RCU_INIT, owner may be freed and cannot be used */ RCU_INIT_POINTER(*owner_storage(smap, owner), NULL); } if (pin_owner) refcount_dec(&local_storage->owner_refcnt); } /* local_storage->lock must be held and selem->local_storage == local_storage. * The caller must ensure selem->smap is still valid to be * dereferenced for its smap->elem_size and smap->cache_idx. */ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem, struct hlist_head *free_selem_list) { struct bpf_local_storage_map *smap; bool free_local_storage; smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); free_local_storage = hlist_is_singular_node(&selem->snode, &local_storage->list); bpf_selem_unlink_storage_nolock_misc(selem, smap, local_storage, free_local_storage, false); hlist_del_init_rcu(&selem->snode); hlist_add_head(&selem->free_node, free_selem_list); return free_local_storage; } void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem) { struct bpf_local_storage_map *smap; smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); local_storage->mem_charge += smap->elem_size; RCU_INIT_POINTER(selem->local_storage, local_storage); hlist_add_head_rcu(&selem->snode, &local_storage->list); } static int bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) { struct bpf_local_storage *local_storage; struct bpf_local_storage_map *smap; struct bpf_local_storage_map_bucket *b; unsigned long flags; int err; local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held()); smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); b = select_bucket(smap, local_storage); err = raw_res_spin_lock_irqsave(&b->lock, flags); if (err) return err; hlist_del_init_rcu(&selem->map_node); raw_res_spin_unlock_irqrestore(&b->lock, flags); return 0; } static void bpf_selem_unlink_map_nolock(struct bpf_local_storage_elem *selem) { hlist_del_init_rcu(&selem->map_node); } int bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem) { struct bpf_local_storage_map_bucket *b; unsigned long flags; int err; b = select_bucket(smap, local_storage); err = raw_res_spin_lock_irqsave(&b->lock, flags); if (err) return err; hlist_add_head_rcu(&selem->map_node, &b->list); raw_res_spin_unlock_irqrestore(&b->lock, flags); return 0; } static void bpf_selem_link_map_nolock(struct bpf_local_storage_map_bucket *b, struct bpf_local_storage_elem *selem) { hlist_add_head_rcu(&selem->map_node, &b->list); } /* * Unlink an selem from map and local storage with lock held. * This is the common path used by local storages to delete an selem. */ int bpf_selem_unlink(struct bpf_local_storage_elem *selem) { struct bpf_local_storage *local_storage; bool free_local_storage = false; HLIST_HEAD(selem_free_list); unsigned long flags; int err; if (in_nmi()) return -EOPNOTSUPP; if (unlikely(!selem_linked_to_storage_lockless(selem))) /* selem has already been unlinked from sk */ return 0; local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held()); err = raw_res_spin_lock_irqsave(&local_storage->lock, flags); if (err) return err; if (likely(selem_linked_to_storage(selem))) { /* Always unlink from map before unlinking from local_storage * because selem will be freed after successfully unlinked from * the local_storage. */ err = bpf_selem_unlink_map(selem); if (err) goto out; free_local_storage = bpf_selem_unlink_storage_nolock( local_storage, selem, &selem_free_list); } out: raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); bpf_selem_free_list(&selem_free_list, false); if (free_local_storage) bpf_local_storage_free(local_storage, false); return err; } /* * Unlink an selem from map and local storage with lockless fallback if callers * are racing or rqspinlock returns error. It should only be called by * bpf_local_storage_destroy() or bpf_local_storage_map_free(). */ static void bpf_selem_unlink_nofail(struct bpf_local_storage_elem *selem, struct bpf_local_storage_map_bucket *b) { bool in_map_free = !!b, free_storage = false; struct bpf_local_storage *local_storage; struct bpf_local_storage_map *smap; unsigned long flags; int err, unlink = 0; local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held()); smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); if (smap) { b = b ? : select_bucket(smap, local_storage); err = raw_res_spin_lock_irqsave(&b->lock, flags); if (!err) { /* * Call bpf_obj_free_fields() under b->lock to make sure it is done * exactly once for an selem. Safe to free special fields immediately * as no BPF program should be referencing the selem. */ if (likely(selem_linked_to_map(selem))) { hlist_del_init_rcu(&selem->map_node); bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); unlink++; } raw_res_spin_unlock_irqrestore(&b->lock, flags); } /* * Highly unlikely scenario: resource leak * * When map_free(selem1), destroy(selem1) and destroy(selem2) are racing * and both selem belong to the same bucket, if destroy(selem2) acquired * b->lock and block for too long, neither map_free(selem1) and * destroy(selem1) will be able to free the special field associated * with selem1 as raw_res_spin_lock_irqsave() returns -ETIMEDOUT. */ WARN_ON_ONCE(err && in_map_free); if (!err || in_map_free) RCU_INIT_POINTER(SDATA(selem)->smap, NULL); } if (local_storage) { err = raw_res_spin_lock_irqsave(&local_storage->lock, flags); if (!err) { if (likely(selem_linked_to_storage(selem))) { free_storage = hlist_is_singular_node(&selem->snode, &local_storage->list); /* * Okay to skip clearing owner_storage and storage->owner in * destroy() since the owner is going away. No user or bpf * programs should be able to reference it. */ if (smap && in_map_free) bpf_selem_unlink_storage_nolock_misc( selem, smap, local_storage, free_storage, true); hlist_del_init_rcu(&selem->snode); unlink++; } raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); } /* * Highly unlikely scenario: memory leak * * When destroy() fails to acqurire local_storage->lock and initializes * selem->local_storage to NULL before any racing map_free() sees the same * selem, no one will free the local storage. */ WARN_ON_ONCE(err && !in_map_free); if (!err || !in_map_free) RCU_INIT_POINTER(selem->local_storage, NULL); } if (unlink != 2) atomic_or(in_map_free ? SELEM_MAP_UNLINKED : SELEM_STORAGE_UNLINKED, &selem->state); /* * Normally, an selem can be unlinked under local_storage->lock and b->lock, and * then freed after an RCU grace period. However, if destroy() and map_free() are * racing or rqspinlock returns errors in unlikely situations (unlink != 2), free * the selem only after both map_free() and destroy() see the selem. */ if (unlink == 2 || atomic_cmpxchg(&selem->state, SELEM_UNLINKED, SELEM_TOFREE) == SELEM_UNLINKED) bpf_selem_free(selem, true); if (free_storage) bpf_local_storage_free(local_storage, true); } void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *selem) { unsigned long flags; int err; /* spinlock is needed to avoid racing with the * parallel delete. Otherwise, publishing an already * deleted sdata to the cache will become a use-after-free * problem in the next bpf_local_storage_lookup(). */ err = raw_res_spin_lock_irqsave(&local_storage->lock, flags); if (err) return; if (selem_linked_to_storage(selem)) rcu_assign_pointer(local_storage->cache[smap->cache_idx], SDATA(selem)); raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); } static int check_flags(const struct bpf_local_storage_data *old_sdata, u64 map_flags) { if (old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST) /* elem already exists */ return -EEXIST; if (!old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_EXIST) /* elem doesn't exist, cannot update it */ return -ENOENT; return 0; } int bpf_local_storage_alloc(void *owner, struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *first_selem) { struct bpf_local_storage *prev_storage, *storage; struct bpf_local_storage **owner_storage_ptr; struct bpf_local_storage_map_bucket *b; unsigned long flags; int err; err = mem_charge(smap, owner, sizeof(*storage)); if (err) return err; storage = bpf_map_kmalloc_nolock(&smap->map, sizeof(*storage), __GFP_ZERO, NUMA_NO_NODE); if (!storage) { err = -ENOMEM; goto uncharge; } INIT_HLIST_HEAD(&storage->list); raw_res_spin_lock_init(&storage->lock); storage->owner = owner; storage->mem_charge = sizeof(*storage); refcount_set(&storage->owner_refcnt, 1); bpf_selem_link_storage_nolock(storage, first_selem); b = select_bucket(smap, storage); err = raw_res_spin_lock_irqsave(&b->lock, flags); if (err) goto uncharge; bpf_selem_link_map_nolock(b, first_selem); owner_storage_ptr = (struct bpf_local_storage **)owner_storage(smap, owner); /* Publish storage to the owner. * Instead of using any lock of the kernel object (i.e. owner), * cmpxchg will work with any kernel object regardless what * the running context is, bh, irq...etc. * * From now on, the owner->storage pointer (e.g. sk->sk_bpf_storage) * is protected by the storage->lock. Hence, when freeing * the owner->storage, the storage->lock must be held before * setting owner->storage ptr to NULL. */ prev_storage = cmpxchg(owner_storage_ptr, NULL, storage); if (unlikely(prev_storage)) { bpf_selem_unlink_map_nolock(first_selem); raw_res_spin_unlock_irqrestore(&b->lock, flags); err = -EAGAIN; goto uncharge; } raw_res_spin_unlock_irqrestore(&b->lock, flags); return 0; uncharge: bpf_local_storage_free(storage, true); mem_uncharge(smap, owner, sizeof(*storage)); return err; } /* sk cannot be going away because it is linking new elem * to sk->sk_bpf_storage. (i.e. sk->sk_refcnt cannot be 0). * Otherwise, it will become a leak (and other memory issues * during map destruction). */ struct bpf_local_storage_data * bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, void *value, u64 map_flags, bool swap_uptrs) { struct bpf_local_storage_data *old_sdata = NULL; struct bpf_local_storage_elem *alloc_selem, *selem = NULL; struct bpf_local_storage *local_storage; struct bpf_local_storage_map_bucket *b; HLIST_HEAD(old_selem_free_list); unsigned long flags, b_flags; int err; /* BPF_EXIST and BPF_NOEXIST cannot be both set */ if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) || /* BPF_F_LOCK can only be used in a value with spin_lock */ unlikely((map_flags & BPF_F_LOCK) && !btf_record_has_field(smap->map.record, BPF_SPIN_LOCK))) return ERR_PTR(-EINVAL); local_storage = rcu_dereference_check(*owner_storage(smap, owner), bpf_rcu_lock_held()); if (!local_storage || hlist_empty(&local_storage->list)) { /* Very first elem for the owner */ err = check_flags(NULL, map_flags); if (err) return ERR_PTR(err); selem = bpf_selem_alloc(smap, owner, value, swap_uptrs); if (!selem) return ERR_PTR(-ENOMEM); err = bpf_local_storage_alloc(owner, smap, selem); if (err) { bpf_selem_free(selem, true); mem_uncharge(smap, owner, smap->elem_size); return ERR_PTR(err); } return SDATA(selem); } if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) { /* Hoping to find an old_sdata to do inline update * such that it can avoid taking the local_storage->lock * and changing the lists. */ old_sdata = bpf_local_storage_lookup(local_storage, smap, false); err = check_flags(old_sdata, map_flags); if (err) return ERR_PTR(err); if (old_sdata && selem_linked_to_storage_lockless(SELEM(old_sdata))) { copy_map_value_locked(&smap->map, old_sdata->data, value, false); return old_sdata; } } /* A lookup has just been done before and concluded a new selem is * needed. The chance of an unnecessary alloc is unlikely. */ alloc_selem = selem = bpf_selem_alloc(smap, owner, value, swap_uptrs); if (!alloc_selem) return ERR_PTR(-ENOMEM); err = raw_res_spin_lock_irqsave(&local_storage->lock, flags); if (err) goto free_selem; /* Recheck local_storage->list under local_storage->lock */ if (unlikely(hlist_empty(&local_storage->list))) { /* A parallel del is happening and local_storage is going * away. It has just been checked before, so very * unlikely. Return instead of retry to keep things * simple. */ err = -EAGAIN; goto unlock; } old_sdata = bpf_local_storage_lookup(local_storage, smap, false); err = check_flags(old_sdata, map_flags); if (err) goto unlock; if (old_sdata && (map_flags & BPF_F_LOCK)) { copy_map_value_locked(&smap->map, old_sdata->data, value, false); selem = SELEM(old_sdata); goto unlock; } b = select_bucket(smap, local_storage); err = raw_res_spin_lock_irqsave(&b->lock, b_flags); if (err) goto unlock; alloc_selem = NULL; /* First, link the new selem to the map */ bpf_selem_link_map_nolock(b, selem); /* Second, link (and publish) the new selem to local_storage */ bpf_selem_link_storage_nolock(local_storage, selem); /* Third, remove old selem, SELEM(old_sdata) */ if (old_sdata) { bpf_selem_unlink_map_nolock(SELEM(old_sdata)); bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata), &old_selem_free_list); } raw_res_spin_unlock_irqrestore(&b->lock, b_flags); unlock: raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); free_selem: bpf_selem_free_list(&old_selem_free_list, false); if (alloc_selem) { mem_uncharge(smap, owner, smap->elem_size); bpf_selem_free(alloc_selem, true); } return err ? ERR_PTR(err) : SDATA(selem); } static u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache) { u64 min_usage = U64_MAX; u16 i, res = 0; spin_lock(&cache->idx_lock); for (i = 0; i < BPF_LOCAL_STORAGE_CACHE_SIZE; i++) { if (cache->idx_usage_counts[i] < min_usage) { min_usage = cache->idx_usage_counts[i]; res = i; /* Found a free cache_idx */ if (!min_usage) break; } } cache->idx_usage_counts[res]++; spin_unlock(&cache->idx_lock); return res; } static void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, u16 idx) { spin_lock(&cache->idx_lock); cache->idx_usage_counts[idx]--; spin_unlock(&cache->idx_lock); } int bpf_local_storage_map_alloc_check(union bpf_attr *attr) { if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK || !(attr->map_flags & BPF_F_NO_PREALLOC) || attr->max_entries || attr->key_size != sizeof(int) || !attr->value_size || /* Enforce BTF for userspace sk dumping */ !attr->btf_key_type_id || !attr->btf_value_type_id) return -EINVAL; if (attr->value_size > BPF_LOCAL_STORAGE_MAX_VALUE_SIZE) return -E2BIG; return 0; } int bpf_local_storage_map_check_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) { if (!btf_type_is_i32(key_type)) return -EINVAL; return 0; } /* * Destroy local storage when the owner is going away. Caller must uncharge memory * if memory charging is used. */ u32 bpf_local_storage_destroy(struct bpf_local_storage *local_storage) { struct bpf_local_storage_elem *selem; /* Neither the bpf_prog nor the bpf_map's syscall * could be modifying the local_storage->list now. * Thus, no elem can be added to or deleted from the * local_storage->list by the bpf_prog or by the bpf_map's syscall. * * It is racing with bpf_local_storage_map_free() alone * when unlinking elem from the local_storage->list and * the map's bucket->list. */ hlist_for_each_entry_rcu(selem, &local_storage->list, snode) bpf_selem_unlink_nofail(selem, NULL); if (!refcount_dec_and_test(&local_storage->owner_refcnt)) { while (refcount_read(&local_storage->owner_refcnt)) cpu_relax(); /* * Paired with refcount_dec() in bpf_selem_unlink_nofail() * to make sure destroy() sees the correct local_storage->mem_charge. */ smp_mb(); } return local_storage->mem_charge; } u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map) { struct bpf_local_storage_map *smap = (struct bpf_local_storage_map *)map; u64 usage = sizeof(*smap); /* The dynamically callocated selems are not counted currently. */ usage += sizeof(*smap->buckets) * (1ULL << smap->bucket_log); return usage; } struct bpf_map * bpf_local_storage_map_alloc(union bpf_attr *attr, struct bpf_local_storage_cache *cache) { struct bpf_local_storage_map *smap; unsigned int i; u32 nbuckets; int err; smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE); if (!smap) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&smap->map, attr); nbuckets = roundup_pow_of_two(num_possible_cpus()); /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */ nbuckets = max_t(u32, 2, nbuckets); smap->bucket_log = ilog2(nbuckets); smap->buckets = bpf_map_kvcalloc(&smap->map, nbuckets, sizeof(*smap->buckets), GFP_USER | __GFP_NOWARN); if (!smap->buckets) { err = -ENOMEM; goto free_smap; } for (i = 0; i < nbuckets; i++) { INIT_HLIST_HEAD(&smap->buckets[i].list); raw_res_spin_lock_init(&smap->buckets[i].lock); } smap->elem_size = offsetof(struct bpf_local_storage_elem, sdata.data[attr->value_size]); smap->cache_idx = bpf_local_storage_cache_idx_get(cache); return &smap->map; free_smap: kvfree(smap->buckets); bpf_map_area_free(smap); return ERR_PTR(err); } void bpf_local_storage_map_free(struct bpf_map *map, struct bpf_local_storage_cache *cache) { struct bpf_local_storage_map_bucket *b; struct bpf_local_storage_elem *selem; struct bpf_local_storage_map *smap; unsigned int i; smap = (struct bpf_local_storage_map *)map; bpf_local_storage_cache_idx_free(cache, smap->cache_idx); /* Note that this map might be concurrently cloned from * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone * RCU read section to finish before proceeding. New RCU * read sections should be prevented via bpf_map_inc_not_zero. */ synchronize_rcu(); /* bpf prog and the userspace can no longer access this map * now. No new selem (of this map) can be added * to the owner->storage or to the map bucket's list. * * The elem of this map can be cleaned up here * or when the storage is freed e.g. * by bpf_sk_storage_free() during __sk_destruct(). */ for (i = 0; i < (1U << smap->bucket_log); i++) { b = &smap->buckets[i]; rcu_read_lock(); /* No one is adding to b->list now */ restart: hlist_for_each_entry_rcu(selem, &b->list, map_node) { bpf_selem_unlink_nofail(selem, b); if (need_resched()) { cond_resched_rcu(); goto restart; } } rcu_read_unlock(); } /* While freeing the storage we may still need to access the map. * * e.g. when bpf_sk_storage_free() has unlinked selem from the map * which then made the above while((selem = ...)) loop * exit immediately. * * However, while freeing the storage one still needs to access the * smap->elem_size to do the uncharging in * bpf_selem_unlink_storage_nolock(). * * Hence, wait another rcu grace period for the storage to be freed. */ synchronize_rcu(); /* smap remains in use regardless of kmalloc_nolock, so wait unconditionally. */ rcu_barrier_tasks_trace(); rcu_barrier(); kvfree(smap->buckets); bpf_map_area_free(smap); } |
| 17 17 17 17 17 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 | /* SPDX-License-Identifier: GPL-2.0 */ /* * linux/include/linux/sunrpc/xprt.h * * Declarations for the RPC transport interface. * * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */ #ifndef _LINUX_SUNRPC_XPRT_H #define _LINUX_SUNRPC_XPRT_H #include <linux/uio.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/ktime.h> #include <linux/kref.h> #include <linux/sunrpc/sched.h> #include <linux/sunrpc/xdr.h> #include <linux/sunrpc/msg_prot.h> #define RPC_MIN_SLOT_TABLE (2U) #define RPC_DEF_SLOT_TABLE (16U) #define RPC_MAX_SLOT_TABLE_LIMIT (65536U) #define RPC_MAX_SLOT_TABLE RPC_MAX_SLOT_TABLE_LIMIT #define RPC_CWNDSHIFT (8U) #define RPC_CWNDSCALE (1U << RPC_CWNDSHIFT) #define RPC_INITCWND RPC_CWNDSCALE #define RPC_MAXCWND(xprt) ((xprt)->max_reqs << RPC_CWNDSHIFT) #define RPCXPRT_CONGESTED(xprt) ((xprt)->cong >= (xprt)->cwnd) #define RPC_GSS_SEQNO_ARRAY_SIZE 3U enum rpc_display_format_t { RPC_DISPLAY_ADDR = 0, RPC_DISPLAY_PORT, RPC_DISPLAY_PROTO, RPC_DISPLAY_HEX_ADDR, RPC_DISPLAY_HEX_PORT, RPC_DISPLAY_NETID, RPC_DISPLAY_MAX, }; struct rpc_task; struct rpc_xprt; struct xprt_class; struct seq_file; struct svc_serv; struct net; #include <linux/lwq.h> /* * This describes a complete RPC request */ struct rpc_rqst { /* * This is the user-visible part */ struct rpc_xprt * rq_xprt; /* RPC client */ struct xdr_buf rq_snd_buf; /* send buffer */ struct xdr_buf rq_rcv_buf; /* recv buffer */ /* * This is the private part */ struct rpc_task * rq_task; /* RPC task data */ struct rpc_cred * rq_cred; /* Bound cred */ __be32 rq_xid; /* request XID */ int rq_cong; /* has incremented xprt->cong */ u32 rq_seqnos[RPC_GSS_SEQNO_ARRAY_SIZE]; /* past gss req seq nos. */ unsigned int rq_seqno_count; /* number of entries in rq_seqnos */ int rq_enc_pages_num; struct page **rq_enc_pages; /* scratch pages for use by gss privacy code */ void (*rq_release_snd_buf)(struct rpc_rqst *); /* release rq_enc_pages */ union { struct list_head rq_list; /* Slot allocation list */ struct rb_node rq_recv; /* Receive queue */ }; struct list_head rq_xmit; /* Send queue */ struct list_head rq_xmit2; /* Send queue */ void *rq_buffer; /* Call XDR encode buffer */ size_t rq_callsize; void *rq_rbuffer; /* Reply XDR decode buffer */ size_t rq_rcvsize; size_t rq_xmit_bytes_sent; /* total bytes sent */ size_t rq_reply_bytes_recvd; /* total reply bytes */ /* received */ struct xdr_buf rq_private_buf; /* The receive buffer * used in the softirq. */ unsigned long rq_majortimeo; /* major timeout alarm */ unsigned long rq_minortimeo; /* minor timeout alarm */ unsigned long rq_timeout; /* Current timeout value */ ktime_t rq_rtt; /* round-trip time */ unsigned int rq_retries; /* # of retries */ unsigned int rq_connect_cookie; /* A cookie used to track the state of the transport connection */ atomic_t rq_pin; /* * Partial send handling */ u32 rq_bytes_sent; /* Bytes we have sent */ ktime_t rq_xtime; /* transmit time stamp */ int rq_ntrans; #if defined(CONFIG_SUNRPC_BACKCHANNEL) struct lwq_node rq_bc_list; /* Callback service list */ unsigned long rq_bc_pa_state; /* Backchannel prealloc state */ struct list_head rq_bc_pa_list; /* Backchannel prealloc list */ #endif /* CONFIG_SUNRPC_BACKCHANEL */ }; #define rq_svec rq_snd_buf.head #define rq_slen rq_snd_buf.len static inline int xprt_rqst_add_seqno(struct rpc_rqst *req, u32 seqno) { if (likely(req->rq_seqno_count < RPC_GSS_SEQNO_ARRAY_SIZE)) req->rq_seqno_count++; /* Shift array to make room for the newest element at the beginning */ memmove(&req->rq_seqnos[1], &req->rq_seqnos[0], (RPC_GSS_SEQNO_ARRAY_SIZE - 1) * sizeof(req->rq_seqnos[0])); req->rq_seqnos[0] = seqno; return 0; } /* RPC transport layer security policies */ enum xprtsec_policies { RPC_XPRTSEC_NONE = 0, RPC_XPRTSEC_TLS_ANON, RPC_XPRTSEC_TLS_X509, }; struct xprtsec_parms { enum xprtsec_policies policy; /* authentication material */ key_serial_t cert_serial; key_serial_t privkey_serial; }; struct rpc_xprt_ops { void (*set_buffer_size)(struct rpc_xprt *xprt, size_t sndsize, size_t rcvsize); int (*reserve_xprt)(struct rpc_xprt *xprt, struct rpc_task *task); void (*release_xprt)(struct rpc_xprt *xprt, struct rpc_task *task); void (*alloc_slot)(struct rpc_xprt *xprt, struct rpc_task *task); void (*free_slot)(struct rpc_xprt *xprt, struct rpc_rqst *req); void (*rpcbind)(struct rpc_task *task); void (*set_port)(struct rpc_xprt *xprt, unsigned short port); void (*connect)(struct rpc_xprt *xprt, struct rpc_task *task); int (*get_srcaddr)(struct rpc_xprt *xprt, char *buf, size_t buflen); unsigned short (*get_srcport)(struct rpc_xprt *xprt); int (*buf_alloc)(struct rpc_task *task); void (*buf_free)(struct rpc_task *task); int (*prepare_request)(struct rpc_rqst *req, struct xdr_buf *buf); int (*send_request)(struct rpc_rqst *req); void (*abort_send_request)(struct rpc_rqst *req); void (*wait_for_reply_request)(struct rpc_task *task); void (*timer)(struct rpc_xprt *xprt, struct rpc_task *task); void (*release_request)(struct rpc_task *task); void (*close)(struct rpc_xprt *xprt); void (*destroy)(struct rpc_xprt *xprt); void (*set_connect_timeout)(struct rpc_xprt *xprt, unsigned long connect_timeout, unsigned long reconnect_timeout); void (*print_stats)(struct rpc_xprt *xprt, struct seq_file *seq); int (*enable_swap)(struct rpc_xprt *xprt); void (*disable_swap)(struct rpc_xprt *xprt); void (*inject_disconnect)(struct rpc_xprt *xprt); int (*bc_setup)(struct rpc_xprt *xprt, unsigned int min_reqs); size_t (*bc_maxpayload)(struct rpc_xprt *xprt); unsigned int (*bc_num_slots)(struct rpc_xprt *xprt); void (*bc_free_rqst)(struct rpc_rqst *rqst); void (*bc_destroy)(struct rpc_xprt *xprt, unsigned int max_reqs); }; /* * RPC transport identifiers * * To preserve compatibility with the historical use of raw IP protocol * id's for transport selection, UDP and TCP identifiers are specified * with the previous values. No such restriction exists for new transports, * except that they may not collide with these values (17 and 6, * respectively). */ #define XPRT_TRANSPORT_BC (1 << 31) enum xprt_transports { XPRT_TRANSPORT_UDP = IPPROTO_UDP, XPRT_TRANSPORT_TCP = IPPROTO_TCP, XPRT_TRANSPORT_BC_TCP = IPPROTO_TCP | XPRT_TRANSPORT_BC, XPRT_TRANSPORT_RDMA = 256, XPRT_TRANSPORT_BC_RDMA = XPRT_TRANSPORT_RDMA | XPRT_TRANSPORT_BC, XPRT_TRANSPORT_LOCAL = 257, XPRT_TRANSPORT_TCP_TLS = 258, }; struct rpc_sysfs_xprt; struct rpc_xprt { struct kref kref; /* Reference count */ const struct rpc_xprt_ops *ops; /* transport methods */ unsigned int id; /* transport id */ const struct rpc_timeout *timeout; /* timeout parms */ struct sockaddr_storage addr; /* server address */ size_t addrlen; /* size of server address */ int prot; /* IP protocol */ unsigned long cong; /* current congestion */ unsigned long cwnd; /* congestion window */ size_t max_payload; /* largest RPC payload size, in bytes */ struct rpc_wait_queue binding; /* requests waiting on rpcbind */ struct rpc_wait_queue sending; /* requests waiting to send */ struct rpc_wait_queue pending; /* requests in flight */ struct rpc_wait_queue backlog; /* waiting for slot */ struct list_head free; /* free slots */ unsigned int max_reqs; /* max number of slots */ unsigned int min_reqs; /* min number of slots */ unsigned int num_reqs; /* total slots */ unsigned long state; /* transport state */ unsigned char resvport : 1, /* use a reserved port */ reuseport : 1; /* reuse port on reconnect */ atomic_t swapper; /* we're swapping over this transport */ unsigned int bind_index; /* bind function index */ /* * Multipath */ struct list_head xprt_switch; /* * Connection of transports */ unsigned long bind_timeout, reestablish_timeout; struct xprtsec_parms xprtsec; unsigned int connect_cookie; /* A cookie that gets bumped every time the transport is reconnected */ /* * Disconnection of idle transports */ struct work_struct task_cleanup; struct timer_list timer; unsigned long last_used, idle_timeout, connect_timeout, max_reconnect_timeout; /* * Send stuff */ atomic_long_t queuelen; spinlock_t transport_lock; /* lock transport info */ spinlock_t reserve_lock; /* lock slot table */ spinlock_t queue_lock; /* send/receive queue lock */ u32 xid; /* Next XID value to use */ struct rpc_task * snd_task; /* Task blocked in send */ struct list_head xmit_queue; /* Send queue */ atomic_long_t xmit_queuelen; struct svc_xprt *bc_xprt; /* NFSv4.1 backchannel */ #if defined(CONFIG_SUNRPC_BACKCHANNEL) struct svc_serv *bc_serv; /* The RPC service which will */ /* process the callback */ unsigned int bc_alloc_max; unsigned int bc_alloc_count; /* Total number of preallocs */ atomic_t bc_slot_count; /* Number of allocated slots */ spinlock_t bc_pa_lock; /* Protects the preallocated * items */ struct list_head bc_pa_list; /* List of preallocated * backchannel rpc_rqst's */ #endif /* CONFIG_SUNRPC_BACKCHANNEL */ struct rb_root recv_queue; /* Receive queue */ struct { unsigned long bind_count, /* total number of binds */ connect_count, /* total number of connects */ connect_start, /* connect start timestamp */ connect_time, /* jiffies waiting for connect */ sends, /* how many complete requests */ recvs, /* how many complete requests */ bad_xids, /* lookup_rqst didn't find XID */ max_slots; /* max rpc_slots used */ unsigned long long req_u, /* average requests on the wire */ bklog_u, /* backlog queue utilization */ sending_u, /* send q utilization */ pending_u; /* pend q utilization */ } stat; struct net *xprt_net; netns_tracker ns_tracker; const char *servername; const char *address_strings[RPC_DISPLAY_MAX]; #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) struct dentry *debugfs; /* debugfs directory */ #endif struct rcu_head rcu; const struct xprt_class *xprt_class; struct rpc_sysfs_xprt *xprt_sysfs; bool main; /*mark if this is the 1st transport */ }; #if defined(CONFIG_SUNRPC_BACKCHANNEL) /* * Backchannel flags */ #define RPC_BC_PA_IN_USE 0x0001 /* Preallocated backchannel */ /* buffer in use */ #endif /* CONFIG_SUNRPC_BACKCHANNEL */ #if defined(CONFIG_SUNRPC_BACKCHANNEL) static inline int bc_prealloc(struct rpc_rqst *req) { return test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state); } #else static inline int bc_prealloc(struct rpc_rqst *req) { return 0; } #endif /* CONFIG_SUNRPC_BACKCHANNEL */ #define XPRT_CREATE_INFINITE_SLOTS (1U) #define XPRT_CREATE_NO_IDLE_TIMEOUT (1U << 1) struct xprt_create { int ident; /* XPRT_TRANSPORT identifier */ struct net * net; struct sockaddr * srcaddr; /* optional local address */ struct sockaddr * dstaddr; /* remote peer address */ size_t addrlen; const char *servername; struct svc_xprt *bc_xprt; /* NFSv4.1 backchannel */ struct rpc_xprt_switch *bc_xps; unsigned int flags; struct xprtsec_parms xprtsec; unsigned long connect_timeout; unsigned long reconnect_timeout; }; struct xprt_class { struct list_head list; int ident; /* XPRT_TRANSPORT identifier */ struct rpc_xprt * (*setup)(struct xprt_create *); struct module *owner; char name[32]; const char * netid[]; }; /* * Generic internal transport functions */ struct rpc_xprt *xprt_create_transport(struct xprt_create *args); void xprt_connect(struct rpc_task *task); unsigned long xprt_reconnect_delay(const struct rpc_xprt *xprt); void xprt_reconnect_backoff(struct rpc_xprt *xprt, unsigned long init_to); void xprt_reserve(struct rpc_task *task); void xprt_retry_reserve(struct rpc_task *task); int xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task); int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task); void xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task); void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req); bool xprt_prepare_transmit(struct rpc_task *task); void xprt_request_enqueue_transmit(struct rpc_task *task); int xprt_request_enqueue_receive(struct rpc_task *task); void xprt_request_wait_receive(struct rpc_task *task); void xprt_request_dequeue_xprt(struct rpc_task *task); bool xprt_request_need_retransmit(struct rpc_task *task); void xprt_transmit(struct rpc_task *task); void xprt_end_transmit(struct rpc_task *task); int xprt_adjust_timeout(struct rpc_rqst *req); void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task); void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task); void xprt_release(struct rpc_task *task); struct rpc_xprt * xprt_get(struct rpc_xprt *xprt); void xprt_put(struct rpc_xprt *xprt); struct rpc_xprt * xprt_alloc(struct net *net, size_t size, unsigned int num_prealloc, unsigned int max_req); void xprt_free(struct rpc_xprt *); void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task); void xprt_add_backlog_noncongested(struct rpc_xprt *xprt, struct rpc_task *task); bool xprt_wake_up_backlog(struct rpc_xprt *xprt, struct rpc_rqst *req); void xprt_cleanup_ids(void); static inline int xprt_enable_swap(struct rpc_xprt *xprt) { return xprt->ops->enable_swap(xprt); } static inline void xprt_disable_swap(struct rpc_xprt *xprt) { xprt->ops->disable_swap(xprt); } /* * Transport switch helper functions */ int xprt_register_transport(struct xprt_class *type); int xprt_unregister_transport(struct xprt_class *type); int xprt_find_transport_ident(const char *); void xprt_wait_for_reply_request_def(struct rpc_task *task); void xprt_wait_for_reply_request_rtt(struct rpc_task *task); void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status); void xprt_wait_for_buffer_space(struct rpc_xprt *xprt); bool xprt_write_space(struct rpc_xprt *xprt); void xprt_adjust_cwnd(struct rpc_xprt *xprt, struct rpc_task *task, int result); struct rpc_rqst * xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid); void xprt_update_rtt(struct rpc_task *task); void xprt_complete_rqst(struct rpc_task *task, int copied); void xprt_pin_rqst(struct rpc_rqst *req); void xprt_unpin_rqst(struct rpc_rqst *req); void xprt_release_rqst_cong(struct rpc_task *task); bool xprt_request_get_cong(struct rpc_xprt *xprt, struct rpc_rqst *req); void xprt_disconnect_done(struct rpc_xprt *xprt); void xprt_force_disconnect(struct rpc_xprt *xprt); void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie); bool xprt_lock_connect(struct rpc_xprt *, struct rpc_task *, void *); void xprt_unlock_connect(struct rpc_xprt *, void *); void xprt_release_write(struct rpc_xprt *, struct rpc_task *); /* * Reserved bit positions in xprt->state */ #define XPRT_LOCKED (0) #define XPRT_CONNECTED (1) #define XPRT_CONNECTING (2) #define XPRT_CLOSE_WAIT (3) #define XPRT_BOUND (4) #define XPRT_BINDING (5) #define XPRT_CLOSING (6) #define XPRT_OFFLINE (7) #define XPRT_REMOVE (8) #define XPRT_CONGESTED (9) #define XPRT_CWND_WAIT (10) #define XPRT_WRITE_SPACE (11) #define XPRT_SND_IS_COOKIE (12) static inline void xprt_set_connected(struct rpc_xprt *xprt) { set_bit(XPRT_CONNECTED, &xprt->state); } static inline void xprt_clear_connected(struct rpc_xprt *xprt) { clear_bit(XPRT_CONNECTED, &xprt->state); } static inline int xprt_connected(struct rpc_xprt *xprt) { return test_bit(XPRT_CONNECTED, &xprt->state); } static inline int xprt_test_and_set_connected(struct rpc_xprt *xprt) { return test_and_set_bit(XPRT_CONNECTED, &xprt->state); } static inline int xprt_test_and_clear_connected(struct rpc_xprt *xprt) { return test_and_clear_bit(XPRT_CONNECTED, &xprt->state); } static inline void xprt_clear_connecting(struct rpc_xprt *xprt) { smp_mb__before_atomic(); clear_bit(XPRT_CONNECTING, &xprt->state); smp_mb__after_atomic(); } static inline int xprt_connecting(struct rpc_xprt *xprt) { return test_bit(XPRT_CONNECTING, &xprt->state); } static inline int xprt_test_and_set_connecting(struct rpc_xprt *xprt) { return test_and_set_bit(XPRT_CONNECTING, &xprt->state); } static inline void xprt_set_bound(struct rpc_xprt *xprt) { test_and_set_bit(XPRT_BOUND, &xprt->state); } static inline int xprt_bound(struct rpc_xprt *xprt) { return test_bit(XPRT_BOUND, &xprt->state); } static inline void xprt_clear_bound(struct rpc_xprt *xprt) { clear_bit(XPRT_BOUND, &xprt->state); } static inline void xprt_clear_binding(struct rpc_xprt *xprt) { smp_mb__before_atomic(); clear_bit(XPRT_BINDING, &xprt->state); smp_mb__after_atomic(); } static inline int xprt_test_and_set_binding(struct rpc_xprt *xprt) { return test_and_set_bit(XPRT_BINDING, &xprt->state); } void xprt_set_offline_locked(struct rpc_xprt *xprt, struct rpc_xprt_switch *xps); void xprt_set_online_locked(struct rpc_xprt *xprt, struct rpc_xprt_switch *xps); void xprt_delete_locked(struct rpc_xprt *xprt, struct rpc_xprt_switch *xps); #endif /* _LINUX_SUNRPC_XPRT_H */ |
| 14052 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_CLOCK_INLINED_H #define _ASM_X86_CLOCK_INLINED_H #include <asm/tsc.h> struct clocksource; static __always_inline u64 arch_inlined_clocksource_read(struct clocksource *cs) { return (u64)rdtsc_ordered(); } struct clock_event_device; static __always_inline void arch_inlined_clockevent_set_next_coupled(u64 cycles, struct clock_event_device *evt) { native_wrmsrq(MSR_IA32_TSC_DEADLINE, cycles); } #endif |
| 6 5 5 5 5 5 4 4 5 5 5 2 5 4 5 5 5 5 5 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 | /* * Copyright (c) 2006 Oracle. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/highmem.h> #include <linux/gfp.h> #include <linux/cpu.h> #include <linux/export.h> #include "rds.h" struct rds_page_remainder { struct page *r_page; unsigned long r_offset; local_lock_t bh_lock; }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders) = { .bh_lock = INIT_LOCAL_LOCK(bh_lock), }; /** * rds_page_remainder_alloc - build up regions of a message. * * @scat: Scatter list for message * @bytes: the number of bytes needed. * @gfp: the waiting behaviour of the allocation * * @gfp is always ored with __GFP_HIGHMEM. Callers must be prepared to * kmap the pages, etc. * * If @bytes is at least a full page then this just returns a page from * alloc_page(). * * If @bytes is a partial page then this stores the unused region of the * page in a per-cpu structure. Future partial-page allocations may be * satisfied from that cached region. This lets us waste less memory on * small allocations with minimal complexity. It works because the transmit * path passes read-only page regions down to devices. They hold a page * reference until they are done with the region. */ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, gfp_t gfp) { struct rds_page_remainder *rem; struct page *page; int ret; gfp |= __GFP_HIGHMEM; /* jump straight to allocation if we're trying for a huge page */ if (bytes >= PAGE_SIZE) { page = alloc_page(gfp); if (!page) { ret = -ENOMEM; } else { sg_set_page(scat, page, PAGE_SIZE, 0); ret = 0; } goto out; } local_bh_disable(); local_lock_nested_bh(&rds_page_remainders.bh_lock); rem = this_cpu_ptr(&rds_page_remainders); while (1) { /* avoid a tiny region getting stuck by tossing it */ if (rem->r_page && bytes > (PAGE_SIZE - rem->r_offset)) { rds_stats_inc(s_page_remainder_miss); __free_page(rem->r_page); rem->r_page = NULL; } /* hand out a fragment from the cached page */ if (rem->r_page && bytes <= (PAGE_SIZE - rem->r_offset)) { sg_set_page(scat, rem->r_page, bytes, rem->r_offset); get_page(sg_page(scat)); if (rem->r_offset != 0) rds_stats_inc(s_page_remainder_hit); rem->r_offset += ALIGN(bytes, 8); if (rem->r_offset >= PAGE_SIZE) { __free_page(rem->r_page); rem->r_page = NULL; } ret = 0; break; } /* alloc if there is nothing for us to use */ local_unlock_nested_bh(&rds_page_remainders.bh_lock); local_bh_enable(); page = alloc_page(gfp); local_bh_disable(); local_lock_nested_bh(&rds_page_remainders.bh_lock); rem = this_cpu_ptr(&rds_page_remainders); if (!page) { ret = -ENOMEM; break; } /* did someone race to fill the remainder before us? */ if (rem->r_page) { __free_page(page); continue; } /* otherwise install our page and loop around to alloc */ rem->r_page = page; rem->r_offset = 0; } local_unlock_nested_bh(&rds_page_remainders.bh_lock); local_bh_enable(); out: rdsdebug("bytes %lu ret %d %p %u %u\n", bytes, ret, ret ? NULL : sg_page(scat), ret ? 0 : scat->offset, ret ? 0 : scat->length); return ret; } EXPORT_SYMBOL_GPL(rds_page_remainder_alloc); void rds_page_exit(void) { unsigned int cpu; for_each_possible_cpu(cpu) { struct rds_page_remainder *rem; rem = &per_cpu(rds_page_remainders, cpu); rdsdebug("cpu %u\n", cpu); if (rem->r_page) __free_page(rem->r_page); rem->r_page = NULL; } } |
| 36 36 36 36 36 36 36 36 36 36 36 36 39 2 2 39 39 39 39 39 37 37 37 3 39 42 42 42 42 30 30 9 9 9 9 9 9 9 9 28 28 28 22 21 22 22 22 22 10 13 13 13 16 16 13 13 16 5 5 5 5 5 5 9 24 23 24 16 16 16 16 10 3 3 3 3 3 1 3 3 16 17 16 16 16 4 4 16 16 16 16 16 16 12 3 3 3 3 3 3 3 3 3 3 3 5 5 4 3 3 3 8 76 70 69 60 10 4 10 70 1 1 3 3 3 3 3 3 3 3 3 5 5 3 2 2 1 1 1 1 6 5 5 25 25 7 24 10 10 10 10 10 10 10 10 11 10 11 10 10 10 10 17 16 16 15 20 17 1 17 17 17 17 15 15 2 13 13 12 1 12 1 11 8 6 15 15 16 1 15 15 15 14 11 2 2 1 1 1 1 3 1 1 1 1 1 1 1 1 1 2 2 2 1 1 1 1 1 1 1 1 1 1 2 2 2 5 5 5 4 4 2 3 3 3 2 2 13 13 30 24 29 27 1 27 20 25 25 18 7 26 23 22 17 22 21 21 5 5 17 17 4 17 16 17 15 17 10 10 10 31 30 26 10 4 4 7 5 1 4 2 2 3 3 2 2 2 3 3 2 2 13 13 13 13 11 9 8 8 7 7 3 4 3 140 141 1 3 6 11 21 16 2 3 29 2 4 7 4 4 3 3 6 9 13 138 1 31 27 31 30 1 8 8 8 7 6 6 6 5 1 5 5 8 5 4 4 4 1 3 4 5 16 16 3 3 8 192 31 36 7 192 1 8 2 2 1 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Routines for driver control interface * Copyright (c) by Jaroslav Kysela <perex@perex.cz> */ #include <linux/threads.h> #include <linux/interrupt.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/time.h> #include <linux/mm.h> #include <linux/math64.h> #include <linux/sched/signal.h> #include <sound/core.h> #include <sound/minors.h> #include <sound/info.h> #include <sound/control.h> #ifdef CONFIG_SND_CTL_DEBUG #define CREATE_TRACE_POINTS #include "control_trace.h" #else #define trace_snd_ctl_put(card, kctl, iname, expected, actual) #endif // Max allocation size for user controls. static int max_user_ctl_alloc_size = 8 * 1024 * 1024; module_param_named(max_user_ctl_alloc_size, max_user_ctl_alloc_size, int, 0444); MODULE_PARM_DESC(max_user_ctl_alloc_size, "Max allocation size for user controls"); #define MAX_CONTROL_COUNT 1028 struct snd_kctl_ioctl { struct list_head list; /* list of all ioctls */ snd_kctl_ioctl_func_t fioctl; }; static DECLARE_RWSEM(snd_ioctl_rwsem); static DECLARE_RWSEM(snd_ctl_layer_rwsem); static LIST_HEAD(snd_control_ioctls); #ifdef CONFIG_COMPAT static LIST_HEAD(snd_control_compat_ioctls); #endif static struct snd_ctl_layer_ops *snd_ctl_layer; static int snd_ctl_remove_locked(struct snd_card *card, struct snd_kcontrol *kcontrol); static int snd_ctl_open(struct inode *inode, struct file *file) { struct snd_card *card; struct snd_ctl_file *ctl; int i, err; err = stream_open(inode, file); if (err < 0) return err; card = snd_lookup_minor_data(iminor(inode), SNDRV_DEVICE_TYPE_CONTROL); if (!card) { err = -ENODEV; goto __error1; } err = snd_card_file_add(card, file); if (err < 0) { err = -ENODEV; goto __error1; } if (!try_module_get(card->module)) { err = -EFAULT; goto __error2; } ctl = kzalloc(sizeof(*ctl), GFP_KERNEL); if (ctl == NULL) { err = -ENOMEM; goto __error; } INIT_LIST_HEAD(&ctl->events); init_waitqueue_head(&ctl->change_sleep); spin_lock_init(&ctl->read_lock); ctl->card = card; for (i = 0; i < SND_CTL_SUBDEV_ITEMS; i++) ctl->preferred_subdevice[i] = -1; ctl->pid = get_pid(task_pid(current)); file->private_data = ctl; scoped_guard(write_lock_irqsave, &card->controls_rwlock) list_add_tail(&ctl->list, &card->ctl_files); snd_card_unref(card); return 0; __error: module_put(card->module); __error2: snd_card_file_remove(card, file); __error1: if (card) snd_card_unref(card); return err; } static void snd_ctl_empty_read_queue(struct snd_ctl_file * ctl) { struct snd_kctl_event *cread; guard(spinlock_irqsave)(&ctl->read_lock); while (!list_empty(&ctl->events)) { cread = snd_kctl_event(ctl->events.next); list_del(&cread->list); kfree(cread); } } static int snd_ctl_release(struct inode *inode, struct file *file) { struct snd_card *card; struct snd_ctl_file *ctl; struct snd_kcontrol *control; unsigned int idx; ctl = file->private_data; file->private_data = NULL; card = ctl->card; scoped_guard(write_lock_irqsave, &card->controls_rwlock) list_del(&ctl->list); scoped_guard(rwsem_write, &card->controls_rwsem) { list_for_each_entry(control, &card->controls, list) for (idx = 0; idx < control->count; idx++) if (control->vd[idx].owner == ctl) control->vd[idx].owner = NULL; } snd_fasync_free(ctl->fasync); snd_ctl_empty_read_queue(ctl); put_pid(ctl->pid); kfree(ctl); module_put(card->module); snd_card_file_remove(card, file); return 0; } /** * snd_ctl_notify - Send notification to user-space for a control change * @card: the card to send notification * @mask: the event mask, SNDRV_CTL_EVENT_* * @id: the ctl element id to send notification * * This function adds an event record with the given id and mask, appends * to the list and wakes up the user-space for notification. This can be * called in the atomic context. */ void snd_ctl_notify(struct snd_card *card, unsigned int mask, struct snd_ctl_elem_id *id) { struct snd_ctl_file *ctl; struct snd_kctl_event *ev; if (snd_BUG_ON(!card || !id)) return; if (card->shutdown) return; guard(read_lock_irqsave)(&card->controls_rwlock); #if IS_ENABLED(CONFIG_SND_MIXER_OSS) card->mixer_oss_change_count++; #endif list_for_each_entry(ctl, &card->ctl_files, list) { if (!ctl->subscribed) continue; scoped_guard(spinlock, &ctl->read_lock) { list_for_each_entry(ev, &ctl->events, list) { if (ev->id.numid == id->numid) { ev->mask |= mask; goto _found; } } ev = kzalloc(sizeof(*ev), GFP_ATOMIC); if (ev) { ev->id = *id; ev->mask = mask; list_add_tail(&ev->list, &ctl->events); } else { dev_err(card->dev, "No memory available to allocate event\n"); } _found: wake_up(&ctl->change_sleep); } snd_kill_fasync(ctl->fasync, SIGIO, POLL_IN); } } EXPORT_SYMBOL(snd_ctl_notify); /** * snd_ctl_notify_one - Send notification to user-space for a control change * @card: the card to send notification * @mask: the event mask, SNDRV_CTL_EVENT_* * @kctl: the pointer with the control instance * @ioff: the additional offset to the control index * * This function calls snd_ctl_notify() and does additional jobs * like LED state changes. */ void snd_ctl_notify_one(struct snd_card *card, unsigned int mask, struct snd_kcontrol *kctl, unsigned int ioff) { struct snd_ctl_elem_id id = kctl->id; struct snd_ctl_layer_ops *lops; id.index += ioff; id.numid += ioff; snd_ctl_notify(card, mask, &id); guard(rwsem_read)(&snd_ctl_layer_rwsem); for (lops = snd_ctl_layer; lops; lops = lops->next) lops->lnotify(card, mask, kctl, ioff); } EXPORT_SYMBOL(snd_ctl_notify_one); /** * snd_ctl_new - create a new control instance with some elements * @kctl: the pointer to store new control instance * @count: the number of elements in this control * @access: the default access flags for elements in this control * @file: given when locking these elements * * Allocates a memory object for a new control instance. The instance has * elements as many as the given number (@count). Each element has given * access permissions (@access). Each element is locked when @file is given. * * Return: 0 on success, error code on failure */ static int snd_ctl_new(struct snd_kcontrol **kctl, unsigned int count, unsigned int access, struct snd_ctl_file *file) { unsigned int idx; if (count == 0 || count > MAX_CONTROL_COUNT) return -EINVAL; *kctl = kzalloc_flex(**kctl, vd, count); if (!*kctl) return -ENOMEM; (*kctl)->count = count; for (idx = 0; idx < count; idx++) { (*kctl)->vd[idx].access = access; (*kctl)->vd[idx].owner = file; } return 0; } /** * snd_ctl_new1 - create a control instance from the template * @ncontrol: the initialization record * @private_data: the private data to set * * Allocates a new struct snd_kcontrol instance and initialize from the given * template. When the access field of ncontrol is 0, it's assumed as * READWRITE access. When the count field is 0, it's assumes as one. * * Return: The pointer of the newly generated instance, or %NULL on failure. */ struct snd_kcontrol *snd_ctl_new1(const struct snd_kcontrol_new *ncontrol, void *private_data) { struct snd_kcontrol *kctl; unsigned int count; unsigned int access; int err; if (snd_BUG_ON(!ncontrol || !ncontrol->info)) return NULL; count = ncontrol->count; if (count == 0) count = 1; access = ncontrol->access; if (access == 0) access = SNDRV_CTL_ELEM_ACCESS_READWRITE; access &= (SNDRV_CTL_ELEM_ACCESS_READWRITE | SNDRV_CTL_ELEM_ACCESS_VOLATILE | SNDRV_CTL_ELEM_ACCESS_INACTIVE | SNDRV_CTL_ELEM_ACCESS_TLV_READWRITE | SNDRV_CTL_ELEM_ACCESS_TLV_COMMAND | SNDRV_CTL_ELEM_ACCESS_TLV_CALLBACK | SNDRV_CTL_ELEM_ACCESS_LED_MASK | SNDRV_CTL_ELEM_ACCESS_SKIP_CHECK); err = snd_ctl_new(&kctl, count, access, NULL); if (err < 0) return NULL; /* The 'numid' member is decided when calling snd_ctl_add(). */ kctl->id.iface = ncontrol->iface; kctl->id.device = ncontrol->device; kctl->id.subdevice = ncontrol->subdevice; if (ncontrol->name) { strscpy(kctl->id.name, ncontrol->name, sizeof(kctl->id.name)); if (strcmp(ncontrol->name, kctl->id.name) != 0) pr_warn("ALSA: Control name '%s' truncated to '%s'\n", ncontrol->name, kctl->id.name); } kctl->id.index = ncontrol->index; kctl->info = ncontrol->info; kctl->get = ncontrol->get; kctl->put = ncontrol->put; kctl->tlv.p = ncontrol->tlv.p; kctl->private_value = ncontrol->private_value; kctl->private_data = private_data; return kctl; } EXPORT_SYMBOL(snd_ctl_new1); /** * snd_ctl_free_one - release the control instance * @kcontrol: the control instance * * Releases the control instance created via snd_ctl_new() * or snd_ctl_new1(). * Don't call this after the control was added to the card. */ void snd_ctl_free_one(struct snd_kcontrol *kcontrol) { if (kcontrol) { if (kcontrol->private_free) kcontrol->private_free(kcontrol); kfree(kcontrol); } } EXPORT_SYMBOL(snd_ctl_free_one); static bool snd_ctl_remove_numid_conflict(struct snd_card *card, unsigned int count) { struct snd_kcontrol *kctl; /* Make sure that the ids assigned to the control do not wrap around */ if (card->last_numid >= UINT_MAX - count) card->last_numid = 0; list_for_each_entry(kctl, &card->controls, list) { if (kctl->id.numid < card->last_numid + 1 + count && kctl->id.numid + kctl->count > card->last_numid + 1) { card->last_numid = kctl->id.numid + kctl->count - 1; return true; } } return false; } static int snd_ctl_find_hole(struct snd_card *card, unsigned int count) { unsigned int iter = 100000; while (snd_ctl_remove_numid_conflict(card, count)) { if (--iter == 0) { /* this situation is very unlikely */ dev_err(card->dev, "unable to allocate new control numid\n"); return -ENOMEM; } } return 0; } /* check whether the given id is contained in the given kctl */ static bool elem_id_matches(const struct snd_kcontrol *kctl, const struct snd_ctl_elem_id *id) { return kctl->id.iface == id->iface && kctl->id.device == id->device && kctl->id.subdevice == id->subdevice && !strncmp(kctl->id.name, id->name, sizeof(kctl->id.name)) && kctl->id.index <= id->index && kctl->id.index + kctl->count > id->index; } #ifdef CONFIG_SND_CTL_FAST_LOOKUP /* Compute a hash key for the corresponding ctl id * It's for the name lookup, hence the numid is excluded. * The hash key is bound in LONG_MAX to be used for Xarray key. */ #define MULTIPLIER 37 static unsigned long get_ctl_id_hash(const struct snd_ctl_elem_id *id) { int i; unsigned long h; h = id->iface; h = MULTIPLIER * h + id->device; h = MULTIPLIER * h + id->subdevice; for (i = 0; i < SNDRV_CTL_ELEM_ID_NAME_MAXLEN && id->name[i]; i++) h = MULTIPLIER * h + id->name[i]; h = MULTIPLIER * h + id->index; h &= LONG_MAX; return h; } /* add hash entries to numid and ctl xarray tables */ static void add_hash_entries(struct snd_card *card, struct snd_kcontrol *kcontrol) { struct snd_ctl_elem_id id = kcontrol->id; int i; xa_store_range(&card->ctl_numids, kcontrol->id.numid, kcontrol->id.numid + kcontrol->count - 1, kcontrol, GFP_KERNEL); for (i = 0; i < kcontrol->count; i++) { id.index = kcontrol->id.index + i; if (xa_insert(&card->ctl_hash, get_ctl_id_hash(&id), kcontrol, GFP_KERNEL)) { /* skip hash for this entry, noting we had collision */ card->ctl_hash_collision = true; dev_dbg(card->dev, "ctl_hash collision %d:%s:%d\n", id.iface, id.name, id.index); } } } /* remove hash entries that have been added */ static void remove_hash_entries(struct snd_card *card, struct snd_kcontrol *kcontrol) { struct snd_ctl_elem_id id = kcontrol->id; struct snd_kcontrol *matched; unsigned long h; int i; for (i = 0; i < kcontrol->count; i++) { xa_erase(&card->ctl_numids, id.numid); h = get_ctl_id_hash(&id); matched = xa_load(&card->ctl_hash, h); if (matched && (matched == kcontrol || elem_id_matches(matched, &id))) xa_erase(&card->ctl_hash, h); id.index++; id.numid++; } } #else /* CONFIG_SND_CTL_FAST_LOOKUP */ static inline void add_hash_entries(struct snd_card *card, struct snd_kcontrol *kcontrol) { } static inline void remove_hash_entries(struct snd_card *card, struct snd_kcontrol *kcontrol) { } #endif /* CONFIG_SND_CTL_FAST_LOOKUP */ enum snd_ctl_add_mode { CTL_ADD_EXCLUSIVE, CTL_REPLACE, CTL_ADD_ON_REPLACE, }; /* add/replace a new kcontrol object; call with card->controls_rwsem locked */ static int __snd_ctl_add_replace(struct snd_card *card, struct snd_kcontrol *kcontrol, enum snd_ctl_add_mode mode) { struct snd_ctl_elem_id id; unsigned int idx; struct snd_kcontrol *old; int err; lockdep_assert_held_write(&card->controls_rwsem); id = kcontrol->id; if (id.index > UINT_MAX - kcontrol->count) return -EINVAL; old = snd_ctl_find_id(card, &id); if (!old) { if (mode == CTL_REPLACE) return -EINVAL; } else { if (mode == CTL_ADD_EXCLUSIVE) { dev_err(card->dev, "control %i:%i:%i:%s:%i is already present\n", id.iface, id.device, id.subdevice, id.name, id.index); return -EBUSY; } err = snd_ctl_remove_locked(card, old); if (err < 0) return err; } if (snd_ctl_find_hole(card, kcontrol->count) < 0) return -ENOMEM; scoped_guard(write_lock_irq, &card->controls_rwlock) { list_add_tail(&kcontrol->list, &card->controls); card->controls_count += kcontrol->count; kcontrol->id.numid = card->last_numid + 1; card->last_numid += kcontrol->count; } add_hash_entries(card, kcontrol); for (idx = 0; idx < kcontrol->count; idx++) snd_ctl_notify_one(card, SNDRV_CTL_EVENT_MASK_ADD, kcontrol, idx); return 0; } static int snd_ctl_add_replace(struct snd_card *card, struct snd_kcontrol *kcontrol, enum snd_ctl_add_mode mode) { int err = -EINVAL; if (! kcontrol) return err; if (snd_BUG_ON(!card || !kcontrol->info)) goto error; scoped_guard(rwsem_write, &card->controls_rwsem) err = __snd_ctl_add_replace(card, kcontrol, mode); if (err < 0) goto error; return 0; error: snd_ctl_free_one(kcontrol); return err; } /** * snd_ctl_add - add the control instance to the card * @card: the card instance * @kcontrol: the control instance to add * * Adds the control instance created via snd_ctl_new() or * snd_ctl_new1() to the given card. Assigns also an unique * numid used for fast search. * * It frees automatically the control which cannot be added. * * Return: Zero if successful, or a negative error code on failure. * */ int snd_ctl_add(struct snd_card *card, struct snd_kcontrol *kcontrol) { return snd_ctl_add_replace(card, kcontrol, CTL_ADD_EXCLUSIVE); } EXPORT_SYMBOL(snd_ctl_add); /** * snd_ctl_replace - replace the control instance of the card * @card: the card instance * @kcontrol: the control instance to replace * @add_on_replace: add the control if not already added * * Replaces the given control. If the given control does not exist * and the add_on_replace flag is set, the control is added. If the * control exists, it is destroyed first. * * It frees automatically the control which cannot be added or replaced. * * Return: Zero if successful, or a negative error code on failure. */ int snd_ctl_replace(struct snd_card *card, struct snd_kcontrol *kcontrol, bool add_on_replace) { return snd_ctl_add_replace(card, kcontrol, add_on_replace ? CTL_ADD_ON_REPLACE : CTL_REPLACE); } EXPORT_SYMBOL(snd_ctl_replace); static int __snd_ctl_remove(struct snd_card *card, struct snd_kcontrol *kcontrol, bool remove_hash) { unsigned int idx; lockdep_assert_held_write(&card->controls_rwsem); if (snd_BUG_ON(!card || !kcontrol)) return -EINVAL; if (remove_hash) remove_hash_entries(card, kcontrol); scoped_guard(write_lock_irq, &card->controls_rwlock) { list_del(&kcontrol->list); card->controls_count -= kcontrol->count; } for (idx = 0; idx < kcontrol->count; idx++) snd_ctl_notify_one(card, SNDRV_CTL_EVENT_MASK_REMOVE, kcontrol, idx); snd_ctl_free_one(kcontrol); return 0; } static inline int snd_ctl_remove_locked(struct snd_card *card, struct snd_kcontrol *kcontrol) { return __snd_ctl_remove(card, kcontrol, true); } /** * snd_ctl_remove - remove the control from the card and release it * @card: the card instance * @kcontrol: the control instance to remove * * Removes the control from the card and then releases the instance. * You don't need to call snd_ctl_free_one(). * Passing NULL to @kcontrol argument is allowed as noop. * * Return: 0 if successful, or a negative error code on failure. * * Note that this function takes card->controls_rwsem lock internally. */ int snd_ctl_remove(struct snd_card *card, struct snd_kcontrol *kcontrol) { if (!kcontrol) return 0; guard(rwsem_write)(&card->controls_rwsem); return snd_ctl_remove_locked(card, kcontrol); } EXPORT_SYMBOL(snd_ctl_remove); /** * snd_ctl_remove_id - remove the control of the given id and release it * @card: the card instance * @id: the control id to remove * * Finds the control instance with the given id, removes it from the * card list and releases it. * * Return: 0 if successful, or a negative error code on failure. */ int snd_ctl_remove_id(struct snd_card *card, struct snd_ctl_elem_id *id) { struct snd_kcontrol *kctl; guard(rwsem_write)(&card->controls_rwsem); kctl = snd_ctl_find_id(card, id); if (kctl == NULL) return -ENOENT; return snd_ctl_remove_locked(card, kctl); } EXPORT_SYMBOL(snd_ctl_remove_id); /** * snd_ctl_remove_user_ctl - remove and release the unlocked user control * @file: active control handle * @id: the control id to remove * * Finds the control instance with the given id, removes it from the * card list and releases it. * * Return: 0 if successful, or a negative error code on failure. */ static int snd_ctl_remove_user_ctl(struct snd_ctl_file * file, struct snd_ctl_elem_id *id) { struct snd_card *card = file->card; struct snd_kcontrol *kctl; int idx; guard(rwsem_write)(&card->controls_rwsem); kctl = snd_ctl_find_id(card, id); if (kctl == NULL) return -ENOENT; if (!(kctl->vd[0].access & SNDRV_CTL_ELEM_ACCESS_USER)) return -EINVAL; for (idx = 0; idx < kctl->count; idx++) if (kctl->vd[idx].owner != NULL && kctl->vd[idx].owner != file) return -EBUSY; return snd_ctl_remove_locked(card, kctl); } /** * snd_ctl_activate_id - activate/inactivate the control of the given id * @card: the card instance * @id: the control id to activate/inactivate * @active: non-zero to activate * * Finds the control instance with the given id, and activate or * inactivate the control together with notification, if changed. * The given ID data is filled with full information. * * Return: 0 if unchanged, 1 if changed, or a negative error code on failure. */ int snd_ctl_activate_id(struct snd_card *card, struct snd_ctl_elem_id *id, int active) { struct snd_kcontrol *kctl; struct snd_kcontrol_volatile *vd; unsigned int index_offset; int ret; down_write(&card->controls_rwsem); kctl = snd_ctl_find_id(card, id); if (kctl == NULL) { ret = -ENOENT; goto unlock; } index_offset = snd_ctl_get_ioff(kctl, id); vd = &kctl->vd[index_offset]; ret = 0; if (active) { if (!(vd->access & SNDRV_CTL_ELEM_ACCESS_INACTIVE)) goto unlock; vd->access &= ~SNDRV_CTL_ELEM_ACCESS_INACTIVE; } else { if (vd->access & SNDRV_CTL_ELEM_ACCESS_INACTIVE) goto unlock; vd->access |= SNDRV_CTL_ELEM_ACCESS_INACTIVE; } snd_ctl_build_ioff(id, kctl, index_offset); downgrade_write(&card->controls_rwsem); snd_ctl_notify_one(card, SNDRV_CTL_EVENT_MASK_INFO, kctl, index_offset); up_read(&card->controls_rwsem); return 1; unlock: up_write(&card->controls_rwsem); return ret; } EXPORT_SYMBOL_GPL(snd_ctl_activate_id); /** * snd_ctl_rename_id - replace the id of a control on the card * @card: the card instance * @src_id: the old id * @dst_id: the new id * * Finds the control with the old id from the card, and replaces the * id with the new one. * * The function tries to keep the already assigned numid while replacing * the rest. * * Note that this function should be used only in the card initialization * phase. Calling after the card instantiation may cause issues with * user-space expecting persistent numids. * * Return: Zero if successful, or a negative error code on failure. */ int snd_ctl_rename_id(struct snd_card *card, struct snd_ctl_elem_id *src_id, struct snd_ctl_elem_id *dst_id) { struct snd_kcontrol *kctl; int saved_numid; guard(rwsem_write)(&card->controls_rwsem); kctl = snd_ctl_find_id(card, src_id); if (kctl == NULL) return -ENOENT; saved_numid = kctl->id.numid; remove_hash_entries(card, kctl); kctl->id = *dst_id; kctl->id.numid = saved_numid; add_hash_entries(card, kctl); return 0; } EXPORT_SYMBOL(snd_ctl_rename_id); /** * snd_ctl_rename - rename the control on the card * @card: the card instance * @kctl: the control to rename * @name: the new name * * Renames the specified control on the card to the new name. * * Note that this function takes card->controls_rwsem lock internally. */ void snd_ctl_rename(struct snd_card *card, struct snd_kcontrol *kctl, const char *name) { guard(rwsem_write)(&card->controls_rwsem); remove_hash_entries(card, kctl); if (strscpy(kctl->id.name, name, sizeof(kctl->id.name)) < 0) pr_warn("ALSA: Renamed control new name '%s' truncated to '%s'\n", name, kctl->id.name); add_hash_entries(card, kctl); } EXPORT_SYMBOL(snd_ctl_rename); #ifndef CONFIG_SND_CTL_FAST_LOOKUP static struct snd_kcontrol * snd_ctl_find_numid_slow(struct snd_card *card, unsigned int numid) { struct snd_kcontrol *kctl; guard(read_lock_irqsave)(&card->controls_rwlock); list_for_each_entry(kctl, &card->controls, list) { if (kctl->id.numid <= numid && kctl->id.numid + kctl->count > numid) return kctl; } return NULL; } #endif /* !CONFIG_SND_CTL_FAST_LOOKUP */ /** * snd_ctl_find_numid - find the control instance with the given number-id * @card: the card instance * @numid: the number-id to search * * Finds the control instance with the given number-id from the card. * * Return: The pointer of the instance if found, or %NULL if not. * * Note that this function takes card->controls_rwlock lock internally. */ struct snd_kcontrol *snd_ctl_find_numid(struct snd_card *card, unsigned int numid) { if (snd_BUG_ON(!card || !numid)) return NULL; #ifdef CONFIG_SND_CTL_FAST_LOOKUP return xa_load(&card->ctl_numids, numid); #else return snd_ctl_find_numid_slow(card, numid); #endif } EXPORT_SYMBOL(snd_ctl_find_numid); /** * snd_ctl_find_id - find the control instance with the given id * @card: the card instance * @id: the id to search * * Finds the control instance with the given id from the card. * * Return: The pointer of the instance if found, or %NULL if not. * * Note that this function takes card->controls_rwlock lock internally. */ struct snd_kcontrol *snd_ctl_find_id(struct snd_card *card, const struct snd_ctl_elem_id *id) { struct snd_kcontrol *kctl; if (snd_BUG_ON(!card || !id)) return NULL; if (id->numid != 0) return snd_ctl_find_numid(card, id->numid); #ifdef CONFIG_SND_CTL_FAST_LOOKUP kctl = xa_load(&card->ctl_hash, get_ctl_id_hash(id)); if (kctl && elem_id_matches(kctl, id)) return kctl; if (!card->ctl_hash_collision) return NULL; /* we can rely on only hash table */ #endif /* no matching in hash table - try all as the last resort */ guard(read_lock_irqsave)(&card->controls_rwlock); list_for_each_entry(kctl, &card->controls, list) if (elem_id_matches(kctl, id)) return kctl; return NULL; } EXPORT_SYMBOL(snd_ctl_find_id); static int snd_ctl_card_info(struct snd_card *card, struct snd_ctl_file * ctl, unsigned int cmd, void __user *arg) { struct snd_ctl_card_info *info __free(kfree) = kzalloc(sizeof(*info), GFP_KERNEL); if (! info) return -ENOMEM; scoped_guard(rwsem_read, &snd_ioctl_rwsem) { info->card = card->number; strscpy(info->id, card->id, sizeof(info->id)); strscpy(info->driver, card->driver, sizeof(info->driver)); strscpy(info->name, card->shortname, sizeof(info->name)); strscpy(info->longname, card->longname, sizeof(info->longname)); strscpy(info->mixername, card->mixername, sizeof(info->mixername)); strscpy(info->components, card->components, sizeof(info->components)); } if (copy_to_user(arg, info, sizeof(struct snd_ctl_card_info))) return -EFAULT; return 0; } static int snd_ctl_elem_list(struct snd_card *card, struct snd_ctl_elem_list *list) { struct snd_kcontrol *kctl; struct snd_ctl_elem_id id; unsigned int offset, space, jidx; offset = list->offset; space = list->space; guard(rwsem_read)(&card->controls_rwsem); list->count = card->controls_count; list->used = 0; if (!space) return 0; list_for_each_entry(kctl, &card->controls, list) { if (offset >= kctl->count) { offset -= kctl->count; continue; } for (jidx = offset; jidx < kctl->count; jidx++) { snd_ctl_build_ioff(&id, kctl, jidx); if (copy_to_user(list->pids + list->used, &id, sizeof(id))) return -EFAULT; list->used++; if (!--space) return 0; } offset = 0; } return 0; } static int snd_ctl_elem_list_user(struct snd_card *card, struct snd_ctl_elem_list __user *_list) { struct snd_ctl_elem_list list; int err; if (copy_from_user(&list, _list, sizeof(list))) return -EFAULT; err = snd_ctl_elem_list(card, &list); if (err) return err; if (copy_to_user(_list, &list, sizeof(list))) return -EFAULT; return 0; } /* Check whether the given kctl info is valid */ static int snd_ctl_check_elem_info(struct snd_card *card, const struct snd_ctl_elem_info *info) { static const unsigned int max_value_counts[] = { [SNDRV_CTL_ELEM_TYPE_BOOLEAN] = 128, [SNDRV_CTL_ELEM_TYPE_INTEGER] = 128, [SNDRV_CTL_ELEM_TYPE_ENUMERATED] = 128, [SNDRV_CTL_ELEM_TYPE_BYTES] = 512, [SNDRV_CTL_ELEM_TYPE_IEC958] = 1, [SNDRV_CTL_ELEM_TYPE_INTEGER64] = 64, }; if (info->type < SNDRV_CTL_ELEM_TYPE_BOOLEAN || info->type > SNDRV_CTL_ELEM_TYPE_INTEGER64) { if (card) dev_err(card->dev, "control %i:%i:%i:%s:%i: invalid type %d\n", info->id.iface, info->id.device, info->id.subdevice, info->id.name, info->id.index, info->type); return -EINVAL; } if (info->type == SNDRV_CTL_ELEM_TYPE_ENUMERATED && info->value.enumerated.items == 0) { if (card) dev_err(card->dev, "control %i:%i:%i:%s:%i: zero enum items\n", info->id.iface, info->id.device, info->id.subdevice, info->id.name, info->id.index); return -EINVAL; } if (info->count > max_value_counts[info->type]) { if (card) dev_err(card->dev, "control %i:%i:%i:%s:%i: invalid count %d\n", info->id.iface, info->id.device, info->id.subdevice, info->id.name, info->id.index, info->count); return -EINVAL; } return 0; } /* The capacity of struct snd_ctl_elem_value.value.*/ static const unsigned int value_sizes[] = { [SNDRV_CTL_ELEM_TYPE_BOOLEAN] = sizeof(long), [SNDRV_CTL_ELEM_TYPE_INTEGER] = sizeof(long), [SNDRV_CTL_ELEM_TYPE_ENUMERATED] = sizeof(unsigned int), [SNDRV_CTL_ELEM_TYPE_BYTES] = sizeof(unsigned char), [SNDRV_CTL_ELEM_TYPE_IEC958] = sizeof(struct snd_aes_iec958), [SNDRV_CTL_ELEM_TYPE_INTEGER64] = sizeof(long long), }; /* fill the remaining snd_ctl_elem_value data with the given pattern */ static void fill_remaining_elem_value(struct snd_ctl_elem_value *control, struct snd_ctl_elem_info *info, u32 pattern) { size_t offset = value_sizes[info->type] * info->count; offset = DIV_ROUND_UP(offset, sizeof(u32)); memset32((u32 *)control->value.bytes.data + offset, pattern, sizeof(control->value) / sizeof(u32) - offset); } /* check whether the given integer ctl value is valid */ static int sanity_check_int_value(struct snd_card *card, const struct snd_ctl_elem_value *control, const struct snd_ctl_elem_info *info, int i, bool print_error) { long long lval, lmin, lmax, lstep; u64 rem; switch (info->type) { default: case SNDRV_CTL_ELEM_TYPE_BOOLEAN: lval = control->value.integer.value[i]; lmin = 0; lmax = 1; lstep = 0; break; case SNDRV_CTL_ELEM_TYPE_INTEGER: lval = control->value.integer.value[i]; lmin = info->value.integer.min; lmax = info->value.integer.max; lstep = info->value.integer.step; break; case SNDRV_CTL_ELEM_TYPE_INTEGER64: lval = control->value.integer64.value[i]; lmin = info->value.integer64.min; lmax = info->value.integer64.max; lstep = info->value.integer64.step; break; case SNDRV_CTL_ELEM_TYPE_ENUMERATED: lval = control->value.enumerated.item[i]; lmin = 0; lmax = info->value.enumerated.items - 1; lstep = 0; break; } if (lval < lmin || lval > lmax) { if (print_error) dev_err(card->dev, "control %i:%i:%i:%s:%i: value out of range %lld (%lld/%lld) at count %i\n", control->id.iface, control->id.device, control->id.subdevice, control->id.name, control->id.index, lval, lmin, lmax, i); return -EINVAL; } if (lstep) { div64_u64_rem(lval, lstep, &rem); if (rem) { if (print_error) dev_err(card->dev, "control %i:%i:%i:%s:%i: unaligned value %lld (step %lld) at count %i\n", control->id.iface, control->id.device, control->id.subdevice, control->id.name, control->id.index, lval, lstep, i); return -EINVAL; } } return 0; } /* check whether the all input values are valid for the given elem value */ static int sanity_check_input_values(struct snd_card *card, const struct snd_ctl_elem_value *control, const struct snd_ctl_elem_info *info, bool print_error) { int i, ret; switch (info->type) { case SNDRV_CTL_ELEM_TYPE_BOOLEAN: case SNDRV_CTL_ELEM_TYPE_INTEGER: case SNDRV_CTL_ELEM_TYPE_INTEGER64: case SNDRV_CTL_ELEM_TYPE_ENUMERATED: for (i = 0; i < info->count; i++) { ret = sanity_check_int_value(card, control, info, i, print_error); if (ret < 0) return ret; } break; default: break; } return 0; } /* perform sanity checks to the given snd_ctl_elem_value object */ static int sanity_check_elem_value(struct snd_card *card, const struct snd_ctl_elem_value *control, const struct snd_ctl_elem_info *info, u32 pattern) { size_t offset; int ret; u32 *p; ret = sanity_check_input_values(card, control, info, true); if (ret < 0) return ret; /* check whether the remaining area kept untouched */ offset = value_sizes[info->type] * info->count; offset = DIV_ROUND_UP(offset, sizeof(u32)); p = (u32 *)control->value.bytes.data + offset; for (; offset < sizeof(control->value) / sizeof(u32); offset++, p++) { if (*p != pattern) { ret = -EINVAL; break; } *p = 0; /* clear the checked area */ } return ret; } static int __snd_ctl_elem_info(struct snd_card *card, struct snd_kcontrol *kctl, struct snd_ctl_elem_info *info, struct snd_ctl_file *ctl) { struct snd_kcontrol_volatile *vd; unsigned int index_offset; int result; #ifdef CONFIG_SND_DEBUG info->access = 0; #endif result = kctl->info(kctl, info); if (result >= 0) { snd_BUG_ON(info->access); index_offset = snd_ctl_get_ioff(kctl, &info->id); vd = &kctl->vd[index_offset]; snd_ctl_build_ioff(&info->id, kctl, index_offset); info->access = vd->access; if (vd->owner) { info->access |= SNDRV_CTL_ELEM_ACCESS_LOCK; if (vd->owner == ctl) info->access |= SNDRV_CTL_ELEM_ACCESS_OWNER; info->owner = pid_vnr(vd->owner->pid); } else { info->owner = -1; } if (!snd_ctl_skip_validation(info) && snd_ctl_check_elem_info(card, info) < 0) result = -EINVAL; } return result; } static int snd_ctl_elem_info(struct snd_ctl_file *ctl, struct snd_ctl_elem_info *info) { struct snd_card *card = ctl->card; struct snd_kcontrol *kctl; guard(rwsem_read)(&card->controls_rwsem); kctl = snd_ctl_find_id(card, &info->id); if (!kctl) return -ENOENT; return __snd_ctl_elem_info(card, kctl, info, ctl); } static int snd_ctl_elem_info_user(struct snd_ctl_file *ctl, struct snd_ctl_elem_info __user *_info) { struct snd_card *card = ctl->card; struct snd_ctl_elem_info info; int result; if (copy_from_user(&info, _info, sizeof(info))) return -EFAULT; result = snd_power_ref_and_wait(card); if (result) return result; result = snd_ctl_elem_info(ctl, &info); snd_power_unref(card); if (result < 0) return result; /* drop internal access flags */ info.access &= ~(SNDRV_CTL_ELEM_ACCESS_SKIP_CHECK| SNDRV_CTL_ELEM_ACCESS_LED_MASK); if (copy_to_user(_info, &info, sizeof(info))) return -EFAULT; return result; } static int snd_ctl_elem_read(struct snd_card *card, struct snd_ctl_elem_value *control) { struct snd_kcontrol *kctl; struct snd_kcontrol_volatile *vd; unsigned int index_offset; struct snd_ctl_elem_info info; const u32 pattern = 0xdeadbeef; int ret; guard(rwsem_read)(&card->controls_rwsem); kctl = snd_ctl_find_id(card, &control->id); if (!kctl) return -ENOENT; index_offset = snd_ctl_get_ioff(kctl, &control->id); vd = &kctl->vd[index_offset]; if (!(vd->access & SNDRV_CTL_ELEM_ACCESS_READ) || !kctl->get) return -EPERM; snd_ctl_build_ioff(&control->id, kctl, index_offset); #ifdef CONFIG_SND_CTL_DEBUG /* info is needed only for validation */ memset(&info, 0, sizeof(info)); info.id = control->id; ret = __snd_ctl_elem_info(card, kctl, &info, NULL); if (ret < 0) return ret; #endif if (!snd_ctl_skip_validation(&info)) fill_remaining_elem_value(control, &info, pattern); ret = kctl->get(kctl, control); if (ret < 0) return ret; if (!snd_ctl_skip_validation(&info) && sanity_check_elem_value(card, control, &info, pattern) < 0) { dev_err(card->dev, "control %i:%i:%i:%s:%i: access overflow\n", control->id.iface, control->id.device, control->id.subdevice, control->id.name, control->id.index); return -EINVAL; } return 0; } static int snd_ctl_elem_read_user(struct snd_card *card, struct snd_ctl_elem_value __user *_control) { int result; struct snd_ctl_elem_value *control __free(kfree) = memdup_user(_control, sizeof(*control)); if (IS_ERR(control)) return PTR_ERR(control); result = snd_power_ref_and_wait(card); if (result) return result; result = snd_ctl_elem_read(card, control); snd_power_unref(card); if (result < 0) return result; if (copy_to_user(_control, control, sizeof(*control))) return -EFAULT; return result; } #if IS_ENABLED(CONFIG_SND_CTL_DEBUG) static const char *const snd_ctl_elem_iface_names[] = { [SNDRV_CTL_ELEM_IFACE_CARD] = "CARD", [SNDRV_CTL_ELEM_IFACE_HWDEP] = "HWDEP", [SNDRV_CTL_ELEM_IFACE_MIXER] = "MIXER", [SNDRV_CTL_ELEM_IFACE_PCM] = "PCM", [SNDRV_CTL_ELEM_IFACE_RAWMIDI] = "RAWMIDI", [SNDRV_CTL_ELEM_IFACE_TIMER] = "TIMER", [SNDRV_CTL_ELEM_IFACE_SEQUENCER] = "SEQUENCER", }; static int snd_ctl_put_verify(struct snd_card *card, struct snd_kcontrol *kctl, struct snd_ctl_elem_value *control) { struct snd_ctl_elem_value *original = card->value_buf; struct snd_ctl_elem_info info; const char *iname; int ret, retcmp; memset(original, 0, sizeof(*original)); memset(&info, 0, sizeof(info)); ret = kctl->info(kctl, &info); if (ret) return ret; ret = kctl->get(kctl, original); if (ret) return ret; ret = kctl->put(kctl, control); if (ret < 0) return ret; /* Sanitize the new value (control->value) before comparing. */ fill_remaining_elem_value(control, &info, 0); /* With known state for both new and original, do the comparison. */ retcmp = memcmp(&original->value, &control->value, sizeof(original->value)); if (retcmp) retcmp = 1; iname = snd_ctl_elem_iface_names[kctl->id.iface]; trace_snd_ctl_put(&kctl->id, iname, card->number, ret, retcmp); return ret; } static int snd_ctl_put(struct snd_card *card, struct snd_kcontrol *kctl, struct snd_ctl_elem_value *control, unsigned int access) { if ((access & SNDRV_CTL_ELEM_ACCESS_SKIP_CHECK) || (access & SNDRV_CTL_ELEM_ACCESS_VOLATILE)) return kctl->put(kctl, control); return snd_ctl_put_verify(card, kctl, control); } #else static inline int snd_ctl_put(struct snd_card *card, struct snd_kcontrol *kctl, struct snd_ctl_elem_value *control, unsigned int access) { return kctl->put(kctl, control); } #endif static int snd_ctl_elem_write(struct snd_card *card, struct snd_ctl_file *file, struct snd_ctl_elem_value *control) { struct snd_kcontrol *kctl; struct snd_kcontrol_volatile *vd; unsigned int index_offset; int result = 0; down_write(&card->controls_rwsem); kctl = snd_ctl_find_id(card, &control->id); if (kctl == NULL) { up_write(&card->controls_rwsem); return -ENOENT; } index_offset = snd_ctl_get_ioff(kctl, &control->id); vd = &kctl->vd[index_offset]; if (!(vd->access & SNDRV_CTL_ELEM_ACCESS_WRITE) || kctl->put == NULL || (file && vd->owner && vd->owner != file)) { up_write(&card->controls_rwsem); return -EPERM; } snd_ctl_build_ioff(&control->id, kctl, index_offset); /* validate input values */ if (IS_ENABLED(CONFIG_SND_CTL_INPUT_VALIDATION)) { struct snd_ctl_elem_info info; memset(&info, 0, sizeof(info)); info.id = control->id; result = __snd_ctl_elem_info(card, kctl, &info, NULL); if (!result) result = sanity_check_input_values(card, control, &info, false); } if (!result) result = snd_ctl_put(card, kctl, control, vd->access); if (result < 0) { up_write(&card->controls_rwsem); return result; } if (result > 0) { downgrade_write(&card->controls_rwsem); snd_ctl_notify_one(card, SNDRV_CTL_EVENT_MASK_VALUE, kctl, index_offset); up_read(&card->controls_rwsem); } else { up_write(&card->controls_rwsem); } return 0; } static int snd_ctl_elem_write_user(struct snd_ctl_file *file, struct snd_ctl_elem_value __user *_control) { struct snd_card *card; int result; struct snd_ctl_elem_value *control __free(kfree) = memdup_user(_control, sizeof(*control)); if (IS_ERR(control)) return PTR_ERR(control); card = file->card; result = snd_power_ref_and_wait(card); if (result < 0) return result; result = snd_ctl_elem_write(card, file, control); snd_power_unref(card); if (result < 0) return result; if (copy_to_user(_control, control, sizeof(*control))) return -EFAULT; return result; } static int snd_ctl_elem_lock(struct snd_ctl_file *file, struct snd_ctl_elem_id __user *_id) { struct snd_card *card = file->card; struct snd_ctl_elem_id id; struct snd_kcontrol *kctl; struct snd_kcontrol_volatile *vd; if (copy_from_user(&id, _id, sizeof(id))) return -EFAULT; guard(rwsem_write)(&card->controls_rwsem); kctl = snd_ctl_find_id(card, &id); if (!kctl) return -ENOENT; vd = &kctl->vd[snd_ctl_get_ioff(kctl, &id)]; if (vd->owner) return -EBUSY; vd->owner = file; return 0; } static int snd_ctl_elem_unlock(struct snd_ctl_file *file, struct snd_ctl_elem_id __user *_id) { struct snd_card *card = file->card; struct snd_ctl_elem_id id; struct snd_kcontrol *kctl; struct snd_kcontrol_volatile *vd; if (copy_from_user(&id, _id, sizeof(id))) return -EFAULT; guard(rwsem_write)(&card->controls_rwsem); kctl = snd_ctl_find_id(card, &id); if (!kctl) return -ENOENT; vd = &kctl->vd[snd_ctl_get_ioff(kctl, &id)]; if (!vd->owner) return -EINVAL; if (vd->owner != file) return -EPERM; vd->owner = NULL; return 0; } struct user_element { struct snd_ctl_elem_info info; struct snd_card *card; char *elem_data; /* element data */ unsigned long elem_data_size; /* size of element data in bytes */ void *tlv_data; /* TLV data */ unsigned long tlv_data_size; /* TLV data size */ void *priv_data; /* private data (like strings for enumerated type) */ }; // check whether the addition (in bytes) of user ctl element may overflow the limit. static bool check_user_elem_overflow(struct snd_card *card, ssize_t add) { return (ssize_t)card->user_ctl_alloc_size + add > max_user_ctl_alloc_size; } static int snd_ctl_elem_user_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { struct user_element *ue = snd_kcontrol_chip(kcontrol); unsigned int offset; offset = snd_ctl_get_ioff(kcontrol, &uinfo->id); *uinfo = ue->info; snd_ctl_build_ioff(&uinfo->id, kcontrol, offset); return 0; } static int snd_ctl_elem_user_enum_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { struct user_element *ue = snd_kcontrol_chip(kcontrol); const char *names; unsigned int item; unsigned int offset; item = uinfo->value.enumerated.item; offset = snd_ctl_get_ioff(kcontrol, &uinfo->id); *uinfo = ue->info; snd_ctl_build_ioff(&uinfo->id, kcontrol, offset); item = min(item, uinfo->value.enumerated.items - 1); uinfo->value.enumerated.item = item; names = ue->priv_data; for (; item > 0; --item) names += strlen(names) + 1; strscpy(uinfo->value.enumerated.name, names); return 0; } static int snd_ctl_elem_user_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct user_element *ue = snd_kcontrol_chip(kcontrol); unsigned int size = ue->elem_data_size; char *src = ue->elem_data + snd_ctl_get_ioff(kcontrol, &ucontrol->id) * size; memcpy(&ucontrol->value, src, size); return 0; } static int snd_ctl_elem_user_put(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { int err, change; struct user_element *ue = snd_kcontrol_chip(kcontrol); unsigned int size = ue->elem_data_size; char *dst = ue->elem_data + snd_ctl_get_ioff(kcontrol, &ucontrol->id) * size; err = sanity_check_input_values(ue->card, ucontrol, &ue->info, false); if (err < 0) return err; change = memcmp(&ucontrol->value, dst, size) != 0; if (change) memcpy(dst, &ucontrol->value, size); return change; } /* called in controls_rwsem write lock */ static int replace_user_tlv(struct snd_kcontrol *kctl, unsigned int __user *buf, unsigned int size) { struct user_element *ue = snd_kcontrol_chip(kctl); unsigned int *container; unsigned int mask = 0; int i; int change; lockdep_assert_held_write(&ue->card->controls_rwsem); if (size > 1024 * 128) /* sane value */ return -EINVAL; // does the TLV size change cause overflow? if (check_user_elem_overflow(ue->card, (ssize_t)(size - ue->tlv_data_size))) return -ENOMEM; container = vmemdup_user(buf, size); if (IS_ERR(container)) return PTR_ERR(container); change = ue->tlv_data_size != size; if (!change) change = memcmp(ue->tlv_data, container, size) != 0; if (!change) { kvfree(container); return 0; } if (ue->tlv_data == NULL) { /* Now TLV data is available. */ for (i = 0; i < kctl->count; ++i) kctl->vd[i].access |= SNDRV_CTL_ELEM_ACCESS_TLV_READ; mask = SNDRV_CTL_EVENT_MASK_INFO; } else { ue->card->user_ctl_alloc_size -= ue->tlv_data_size; ue->tlv_data_size = 0; kvfree(ue->tlv_data); } ue->tlv_data = container; ue->tlv_data_size = size; // decremented at private_free. ue->card->user_ctl_alloc_size += size; mask |= SNDRV_CTL_EVENT_MASK_TLV; for (i = 0; i < kctl->count; ++i) snd_ctl_notify_one(ue->card, mask, kctl, i); return change; } static int read_user_tlv(struct snd_kcontrol *kctl, unsigned int __user *buf, unsigned int size) { struct user_element *ue = snd_kcontrol_chip(kctl); if (ue->tlv_data_size == 0 || ue->tlv_data == NULL) return -ENXIO; if (size < ue->tlv_data_size) return -ENOSPC; if (copy_to_user(buf, ue->tlv_data, ue->tlv_data_size)) return -EFAULT; return 0; } static int snd_ctl_elem_user_tlv(struct snd_kcontrol *kctl, int op_flag, unsigned int size, unsigned int __user *buf) { if (op_flag == SNDRV_CTL_TLV_OP_WRITE) return replace_user_tlv(kctl, buf, size); else return read_user_tlv(kctl, buf, size); } /* called in controls_rwsem write lock */ static int snd_ctl_elem_init_enum_names(struct user_element *ue) { char *names, *p; size_t buf_len, name_len; unsigned int i; const uintptr_t user_ptrval = ue->info.value.enumerated.names_ptr; lockdep_assert_held_write(&ue->card->controls_rwsem); buf_len = ue->info.value.enumerated.names_length; if (buf_len > 64 * 1024) return -EINVAL; if (check_user_elem_overflow(ue->card, buf_len)) return -ENOMEM; names = vmemdup_user((const void __user *)user_ptrval, buf_len); if (IS_ERR(names)) return PTR_ERR(names); /* check that there are enough valid names */ p = names; for (i = 0; i < ue->info.value.enumerated.items; ++i) { if (buf_len == 0) { kvfree(names); return -EINVAL; } name_len = strnlen(p, buf_len); if (name_len == 0 || name_len >= 64 || name_len == buf_len) { kvfree(names); return -EINVAL; } p += name_len + 1; buf_len -= name_len + 1; } ue->priv_data = names; ue->info.value.enumerated.names_ptr = 0; // increment the allocation size; decremented again at private_free. ue->card->user_ctl_alloc_size += ue->info.value.enumerated.names_length; return 0; } static size_t compute_user_elem_size(size_t size, unsigned int count) { return sizeof(struct user_element) + size * count; } static void snd_ctl_elem_user_free(struct snd_kcontrol *kcontrol) { struct user_element *ue = snd_kcontrol_chip(kcontrol); // decrement the allocation size. ue->card->user_ctl_alloc_size -= compute_user_elem_size(ue->elem_data_size, kcontrol->count); ue->card->user_ctl_alloc_size -= ue->tlv_data_size; if (ue->priv_data) ue->card->user_ctl_alloc_size -= ue->info.value.enumerated.names_length; kvfree(ue->tlv_data); kvfree(ue->priv_data); kfree(ue); } static int snd_ctl_elem_add(struct snd_ctl_file *file, struct snd_ctl_elem_info *info, int replace) { struct snd_card *card = file->card; struct snd_kcontrol *kctl; unsigned int count; unsigned int access; long private_size; size_t alloc_size; struct user_element *ue; unsigned int offset; int err; if (!*info->id.name) return -EINVAL; if (strnlen(info->id.name, sizeof(info->id.name)) >= sizeof(info->id.name)) return -EINVAL; /* Delete a control to replace them if needed. */ if (replace) { info->id.numid = 0; err = snd_ctl_remove_user_ctl(file, &info->id); if (err) return err; } /* Check the number of elements for this userspace control. */ count = info->owner; if (count == 0) count = 1; if (count > MAX_CONTROL_COUNT) return -EINVAL; /* Arrange access permissions if needed. */ access = info->access; if (access == 0) access = SNDRV_CTL_ELEM_ACCESS_READWRITE; access &= (SNDRV_CTL_ELEM_ACCESS_READWRITE | SNDRV_CTL_ELEM_ACCESS_INACTIVE | SNDRV_CTL_ELEM_ACCESS_TLV_WRITE); /* In initial state, nothing is available as TLV container. */ if (access & SNDRV_CTL_ELEM_ACCESS_TLV_WRITE) access |= SNDRV_CTL_ELEM_ACCESS_TLV_CALLBACK; access |= SNDRV_CTL_ELEM_ACCESS_USER; /* * Check information and calculate the size of data specific to * this userspace control. */ /* pass NULL to card for suppressing error messages */ err = snd_ctl_check_elem_info(NULL, info); if (err < 0) return err; /* user-space control doesn't allow zero-size data */ if (info->count < 1) return -EINVAL; private_size = value_sizes[info->type] * info->count; alloc_size = compute_user_elem_size(private_size, count); guard(rwsem_write)(&card->controls_rwsem); if (check_user_elem_overflow(card, alloc_size)) return -ENOMEM; /* * Keep memory object for this userspace control. After passing this * code block, the instance should be freed by snd_ctl_free_one(). * * Note that these elements in this control are locked. */ err = snd_ctl_new(&kctl, count, access, file); if (err < 0) return err; memcpy(&kctl->id, &info->id, sizeof(kctl->id)); ue = kzalloc(alloc_size, GFP_KERNEL); if (!ue) { kfree(kctl); return -ENOMEM; } kctl->private_data = ue; kctl->private_free = snd_ctl_elem_user_free; // increment the allocated size; decremented again at private_free. card->user_ctl_alloc_size += alloc_size; /* Set private data for this userspace control. */ ue->card = card; ue->info = *info; ue->info.access = 0; ue->elem_data = (char *)ue + sizeof(*ue); ue->elem_data_size = private_size; if (ue->info.type == SNDRV_CTL_ELEM_TYPE_ENUMERATED) { err = snd_ctl_elem_init_enum_names(ue); if (err < 0) { snd_ctl_free_one(kctl); return err; } } /* Set callback functions. */ if (info->type == SNDRV_CTL_ELEM_TYPE_ENUMERATED) kctl->info = snd_ctl_elem_user_enum_info; else kctl->info = snd_ctl_elem_user_info; if (access & SNDRV_CTL_ELEM_ACCESS_READ) kctl->get = snd_ctl_elem_user_get; if (access & SNDRV_CTL_ELEM_ACCESS_WRITE) kctl->put = snd_ctl_elem_user_put; if (access & SNDRV_CTL_ELEM_ACCESS_TLV_WRITE) kctl->tlv.c = snd_ctl_elem_user_tlv; /* This function manage to free the instance on failure. */ err = __snd_ctl_add_replace(card, kctl, CTL_ADD_EXCLUSIVE); if (err < 0) { snd_ctl_free_one(kctl); return err; } offset = snd_ctl_get_ioff(kctl, &info->id); snd_ctl_build_ioff(&info->id, kctl, offset); /* * Here we cannot fill any field for the number of elements added by * this operation because there're no specific fields. The usage of * 'owner' field for this purpose may cause any bugs to userspace * applications because the field originally means PID of a process * which locks the element. */ return 0; } static int snd_ctl_elem_add_user(struct snd_ctl_file *file, struct snd_ctl_elem_info __user *_info, int replace) { struct snd_ctl_elem_info info; int err; if (copy_from_user(&info, _info, sizeof(info))) return -EFAULT; err = snd_ctl_elem_add(file, &info, replace); if (err < 0) return err; if (copy_to_user(_info, &info, sizeof(info))) { snd_ctl_remove_user_ctl(file, &info.id); return -EFAULT; } return 0; } static int snd_ctl_elem_remove(struct snd_ctl_file *file, struct snd_ctl_elem_id __user *_id) { struct snd_ctl_elem_id id; if (copy_from_user(&id, _id, sizeof(id))) return -EFAULT; return snd_ctl_remove_user_ctl(file, &id); } static int snd_ctl_subscribe_events(struct snd_ctl_file *file, int __user *ptr) { int subscribe; if (get_user(subscribe, ptr)) return -EFAULT; if (subscribe < 0) { subscribe = file->subscribed; if (put_user(subscribe, ptr)) return -EFAULT; return 0; } if (subscribe) { file->subscribed = 1; return 0; } else if (file->subscribed) { snd_ctl_empty_read_queue(file); file->subscribed = 0; } return 0; } static int call_tlv_handler(struct snd_ctl_file *file, int op_flag, struct snd_kcontrol *kctl, struct snd_ctl_elem_id *id, unsigned int __user *buf, unsigned int size) { static const struct { int op; int perm; } pairs[] = { {SNDRV_CTL_TLV_OP_READ, SNDRV_CTL_ELEM_ACCESS_TLV_READ}, {SNDRV_CTL_TLV_OP_WRITE, SNDRV_CTL_ELEM_ACCESS_TLV_WRITE}, {SNDRV_CTL_TLV_OP_CMD, SNDRV_CTL_ELEM_ACCESS_TLV_COMMAND}, }; struct snd_kcontrol_volatile *vd = &kctl->vd[snd_ctl_get_ioff(kctl, id)]; int i; /* Check support of the request for this element. */ for (i = 0; i < ARRAY_SIZE(pairs); ++i) { if (op_flag == pairs[i].op && (vd->access & pairs[i].perm)) break; } if (i == ARRAY_SIZE(pairs)) return -ENXIO; if (kctl->tlv.c == NULL) return -ENXIO; /* Write and command operations are not allowed for locked element. */ if (op_flag != SNDRV_CTL_TLV_OP_READ && vd->owner != NULL && vd->owner != file) return -EPERM; return kctl->tlv.c(kctl, op_flag, size, buf); } static int read_tlv_buf(struct snd_kcontrol *kctl, struct snd_ctl_elem_id *id, unsigned int __user *buf, unsigned int size) { struct snd_kcontrol_volatile *vd = &kctl->vd[snd_ctl_get_ioff(kctl, id)]; unsigned int len; if (!(vd->access & SNDRV_CTL_ELEM_ACCESS_TLV_READ)) return -ENXIO; if (kctl->tlv.p == NULL) return -ENXIO; len = sizeof(unsigned int) * 2 + kctl->tlv.p[1]; if (size < len) return -ENOMEM; if (copy_to_user(buf, kctl->tlv.p, len)) return -EFAULT; return 0; } static int snd_ctl_tlv_ioctl(struct snd_ctl_file *file, struct snd_ctl_tlv __user *buf, int op_flag) { struct snd_ctl_tlv header; unsigned int __user *container; unsigned int container_size; struct snd_kcontrol *kctl; struct snd_ctl_elem_id id; struct snd_kcontrol_volatile *vd; lockdep_assert_held(&file->card->controls_rwsem); if (copy_from_user(&header, buf, sizeof(header))) return -EFAULT; /* In design of control core, numerical ID starts at 1. */ if (header.numid == 0) return -EINVAL; /* At least, container should include type and length fields. */ if (header.length < sizeof(unsigned int) * 2) return -EINVAL; container_size = header.length; container = buf->tlv; kctl = snd_ctl_find_numid(file->card, header.numid); if (kctl == NULL) return -ENOENT; /* Calculate index of the element in this set. */ id = kctl->id; snd_ctl_build_ioff(&id, kctl, header.numid - id.numid); vd = &kctl->vd[snd_ctl_get_ioff(kctl, &id)]; if (vd->access & SNDRV_CTL_ELEM_ACCESS_TLV_CALLBACK) { return call_tlv_handler(file, op_flag, kctl, &id, container, container_size); } else { if (op_flag == SNDRV_CTL_TLV_OP_READ) { return read_tlv_buf(kctl, &id, container, container_size); } } /* Not supported. */ return -ENXIO; } static long snd_ctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct snd_ctl_file *ctl; struct snd_card *card; struct snd_kctl_ioctl *p; void __user *argp = (void __user *)arg; int __user *ip = argp; int err; ctl = file->private_data; card = ctl->card; if (snd_BUG_ON(!card)) return -ENXIO; switch (cmd) { case SNDRV_CTL_IOCTL_PVERSION: return put_user(SNDRV_CTL_VERSION, ip) ? -EFAULT : 0; case SNDRV_CTL_IOCTL_CARD_INFO: return snd_ctl_card_info(card, ctl, cmd, argp); case SNDRV_CTL_IOCTL_ELEM_LIST: return snd_ctl_elem_list_user(card, argp); case SNDRV_CTL_IOCTL_ELEM_INFO: return snd_ctl_elem_info_user(ctl, argp); case SNDRV_CTL_IOCTL_ELEM_READ: return snd_ctl_elem_read_user(card, argp); case SNDRV_CTL_IOCTL_ELEM_WRITE: return snd_ctl_elem_write_user(ctl, argp); case SNDRV_CTL_IOCTL_ELEM_LOCK: return snd_ctl_elem_lock(ctl, argp); case SNDRV_CTL_IOCTL_ELEM_UNLOCK: return snd_ctl_elem_unlock(ctl, argp); case SNDRV_CTL_IOCTL_ELEM_ADD: return snd_ctl_elem_add_user(ctl, argp, 0); case SNDRV_CTL_IOCTL_ELEM_REPLACE: return snd_ctl_elem_add_user(ctl, argp, 1); case SNDRV_CTL_IOCTL_ELEM_REMOVE: return snd_ctl_elem_remove(ctl, argp); case SNDRV_CTL_IOCTL_SUBSCRIBE_EVENTS: return snd_ctl_subscribe_events(ctl, ip); case SNDRV_CTL_IOCTL_TLV_READ: err = snd_power_ref_and_wait(card); if (err < 0) return err; scoped_guard(rwsem_read, &card->controls_rwsem) err = snd_ctl_tlv_ioctl(ctl, argp, SNDRV_CTL_TLV_OP_READ); snd_power_unref(card); return err; case SNDRV_CTL_IOCTL_TLV_WRITE: err = snd_power_ref_and_wait(card); if (err < 0) return err; scoped_guard(rwsem_write, &card->controls_rwsem) err = snd_ctl_tlv_ioctl(ctl, argp, SNDRV_CTL_TLV_OP_WRITE); snd_power_unref(card); return err; case SNDRV_CTL_IOCTL_TLV_COMMAND: err = snd_power_ref_and_wait(card); if (err < 0) return err; scoped_guard(rwsem_write, &card->controls_rwsem) err = snd_ctl_tlv_ioctl(ctl, argp, SNDRV_CTL_TLV_OP_CMD); snd_power_unref(card); return err; case SNDRV_CTL_IOCTL_POWER: return -ENOPROTOOPT; case SNDRV_CTL_IOCTL_POWER_STATE: return put_user(SNDRV_CTL_POWER_D0, ip) ? -EFAULT : 0; } guard(rwsem_read)(&snd_ioctl_rwsem); list_for_each_entry(p, &snd_control_ioctls, list) { err = p->fioctl(card, ctl, cmd, arg); if (err != -ENOIOCTLCMD) return err; } dev_dbg(card->dev, "unknown ioctl = 0x%x\n", cmd); return -ENOTTY; } static ssize_t snd_ctl_read(struct file *file, char __user *buffer, size_t count, loff_t * offset) { struct snd_ctl_file *ctl; int err = 0; ssize_t result = 0; ctl = file->private_data; if (snd_BUG_ON(!ctl || !ctl->card)) return -ENXIO; if (!ctl->subscribed) return -EBADFD; if (count < sizeof(struct snd_ctl_event)) return -EINVAL; spin_lock_irq(&ctl->read_lock); while (count >= sizeof(struct snd_ctl_event)) { struct snd_ctl_event ev; struct snd_kctl_event *kev; while (list_empty(&ctl->events)) { wait_queue_entry_t wait; if ((file->f_flags & O_NONBLOCK) != 0 || result > 0) { err = -EAGAIN; goto __end_lock; } init_waitqueue_entry(&wait, current); add_wait_queue(&ctl->change_sleep, &wait); set_current_state(TASK_INTERRUPTIBLE); spin_unlock_irq(&ctl->read_lock); schedule(); remove_wait_queue(&ctl->change_sleep, &wait); if (ctl->card->shutdown) return -ENODEV; if (signal_pending(current)) return -ERESTARTSYS; spin_lock_irq(&ctl->read_lock); } kev = snd_kctl_event(ctl->events.next); ev.type = SNDRV_CTL_EVENT_ELEM; ev.data.elem.mask = kev->mask; ev.data.elem.id = kev->id; list_del(&kev->list); spin_unlock_irq(&ctl->read_lock); kfree(kev); if (copy_to_user(buffer, &ev, sizeof(struct snd_ctl_event))) { err = -EFAULT; goto __end; } spin_lock_irq(&ctl->read_lock); buffer += sizeof(struct snd_ctl_event); count -= sizeof(struct snd_ctl_event); result += sizeof(struct snd_ctl_event); } __end_lock: spin_unlock_irq(&ctl->read_lock); __end: return result > 0 ? result : err; } static __poll_t snd_ctl_poll(struct file *file, poll_table * wait) { __poll_t mask; struct snd_ctl_file *ctl; ctl = file->private_data; if (!ctl->subscribed) return 0; poll_wait(file, &ctl->change_sleep, wait); mask = 0; if (!list_empty(&ctl->events)) mask |= EPOLLIN | EPOLLRDNORM; return mask; } /* * register the device-specific control-ioctls. * called from each device manager like pcm.c, hwdep.c, etc. */ static int _snd_ctl_register_ioctl(snd_kctl_ioctl_func_t fcn, struct list_head *lists) { struct snd_kctl_ioctl *pn; pn = kzalloc_obj(struct snd_kctl_ioctl); if (pn == NULL) return -ENOMEM; pn->fioctl = fcn; guard(rwsem_write)(&snd_ioctl_rwsem); list_add_tail(&pn->list, lists); return 0; } /** * snd_ctl_register_ioctl - register the device-specific control-ioctls * @fcn: ioctl callback function * * called from each device manager like pcm.c, hwdep.c, etc. * * Return: zero if successful, or a negative error code */ int snd_ctl_register_ioctl(snd_kctl_ioctl_func_t fcn) { return _snd_ctl_register_ioctl(fcn, &snd_control_ioctls); } EXPORT_SYMBOL(snd_ctl_register_ioctl); #ifdef CONFIG_COMPAT /** * snd_ctl_register_ioctl_compat - register the device-specific 32bit compat * control-ioctls * @fcn: ioctl callback function * * Return: zero if successful, or a negative error code */ int snd_ctl_register_ioctl_compat(snd_kctl_ioctl_func_t fcn) { return _snd_ctl_register_ioctl(fcn, &snd_control_compat_ioctls); } EXPORT_SYMBOL(snd_ctl_register_ioctl_compat); #endif /* * de-register the device-specific control-ioctls. */ static int _snd_ctl_unregister_ioctl(snd_kctl_ioctl_func_t fcn, struct list_head *lists) { struct snd_kctl_ioctl *p; if (snd_BUG_ON(!fcn)) return -EINVAL; guard(rwsem_write)(&snd_ioctl_rwsem); list_for_each_entry(p, lists, list) { if (p->fioctl == fcn) { list_del(&p->list); kfree(p); return 0; } } snd_BUG(); return -EINVAL; } /** * snd_ctl_unregister_ioctl - de-register the device-specific control-ioctls * @fcn: ioctl callback function to unregister * * Return: zero if successful, or a negative error code */ int snd_ctl_unregister_ioctl(snd_kctl_ioctl_func_t fcn) { return _snd_ctl_unregister_ioctl(fcn, &snd_control_ioctls); } EXPORT_SYMBOL(snd_ctl_unregister_ioctl); #ifdef CONFIG_COMPAT /** * snd_ctl_unregister_ioctl_compat - de-register the device-specific compat * 32bit control-ioctls * @fcn: ioctl callback function to unregister * * Return: zero if successful, or a negative error code */ int snd_ctl_unregister_ioctl_compat(snd_kctl_ioctl_func_t fcn) { return _snd_ctl_unregister_ioctl(fcn, &snd_control_compat_ioctls); } EXPORT_SYMBOL(snd_ctl_unregister_ioctl_compat); #endif static int snd_ctl_fasync(int fd, struct file * file, int on) { struct snd_ctl_file *ctl; ctl = file->private_data; return snd_fasync_helper(fd, file, on, &ctl->fasync); } /* return the preferred subdevice number if already assigned; * otherwise return -1 */ int snd_ctl_get_preferred_subdevice(struct snd_card *card, int type) { struct snd_ctl_file *kctl; int subdevice = -1; guard(read_lock_irqsave)(&card->controls_rwlock); list_for_each_entry(kctl, &card->ctl_files, list) { if (kctl->pid == task_pid(current)) { subdevice = kctl->preferred_subdevice[type]; if (subdevice != -1) break; } } return subdevice; } EXPORT_SYMBOL_GPL(snd_ctl_get_preferred_subdevice); /* * ioctl32 compat */ #ifdef CONFIG_COMPAT #include "control_compat.c" #else #define snd_ctl_ioctl_compat NULL #endif /* * control layers (audio LED etc.) */ /** * snd_ctl_request_layer - request to use the layer * @module_name: Name of the kernel module (NULL == build-in) * * Return: zero if successful, or an error code when the module cannot be loaded */ int snd_ctl_request_layer(const char *module_name) { struct snd_ctl_layer_ops *lops; if (module_name == NULL) return 0; scoped_guard(rwsem_read, &snd_ctl_layer_rwsem) { for (lops = snd_ctl_layer; lops; lops = lops->next) if (strcmp(lops->module_name, module_name) == 0) return 0; } return request_module(module_name); } EXPORT_SYMBOL_GPL(snd_ctl_request_layer); /** * snd_ctl_register_layer - register new control layer * @lops: operation structure * * The new layer can track all control elements and do additional * operations on top (like audio LED handling). */ void snd_ctl_register_layer(struct snd_ctl_layer_ops *lops) { struct snd_card *card; int card_number; scoped_guard(rwsem_write, &snd_ctl_layer_rwsem) { lops->next = snd_ctl_layer; snd_ctl_layer = lops; } for (card_number = 0; card_number < SNDRV_CARDS; card_number++) { card = snd_card_ref(card_number); if (card) { scoped_guard(rwsem_read, &card->controls_rwsem) lops->lregister(card); snd_card_unref(card); } } } EXPORT_SYMBOL_GPL(snd_ctl_register_layer); /** * snd_ctl_disconnect_layer - disconnect control layer * @lops: operation structure * * It is expected that the information about tracked cards * is freed before this call (the disconnect callback is * not called here). */ void snd_ctl_disconnect_layer(struct snd_ctl_layer_ops *lops) { struct snd_ctl_layer_ops *lops2, *prev_lops2; guard(rwsem_write)(&snd_ctl_layer_rwsem); for (lops2 = snd_ctl_layer, prev_lops2 = NULL; lops2; lops2 = lops2->next) { if (lops2 == lops) { if (!prev_lops2) snd_ctl_layer = lops->next; else prev_lops2->next = lops->next; break; } prev_lops2 = lops2; } } EXPORT_SYMBOL_GPL(snd_ctl_disconnect_layer); /* * INIT PART */ static const struct file_operations snd_ctl_f_ops = { .owner = THIS_MODULE, .read = snd_ctl_read, .open = snd_ctl_open, .release = snd_ctl_release, .poll = snd_ctl_poll, .unlocked_ioctl = snd_ctl_ioctl, .compat_ioctl = snd_ctl_ioctl_compat, .fasync = snd_ctl_fasync, }; /* call lops under rwsems; called from snd_ctl_dev_*() below() */ #define call_snd_ctl_lops(_card, _op) \ do { \ struct snd_ctl_layer_ops *lops; \ guard(rwsem_read)(&(_card)->controls_rwsem); \ guard(rwsem_read)(&snd_ctl_layer_rwsem); \ for (lops = snd_ctl_layer; lops; lops = lops->next) \ lops->_op(_card); \ } while (0) /* * registration of the control device */ static int snd_ctl_dev_register(struct snd_device *device) { struct snd_card *card = device->device_data; int err; err = snd_register_device(SNDRV_DEVICE_TYPE_CONTROL, card, -1, &snd_ctl_f_ops, card, card->ctl_dev); if (err < 0) return err; call_snd_ctl_lops(card, lregister); return 0; } /* * disconnection of the control device */ static int snd_ctl_dev_disconnect(struct snd_device *device) { struct snd_card *card = device->device_data; struct snd_ctl_file *ctl; scoped_guard(read_lock_irqsave, &card->controls_rwlock) { list_for_each_entry(ctl, &card->ctl_files, list) { wake_up(&ctl->change_sleep); snd_kill_fasync(ctl->fasync, SIGIO, POLL_ERR); } } call_snd_ctl_lops(card, ldisconnect); return snd_unregister_device(card->ctl_dev); } /* * free all controls */ static int snd_ctl_dev_free(struct snd_device *device) { struct snd_card *card = device->device_data; struct snd_kcontrol *control; scoped_guard(rwsem_write, &card->controls_rwsem) { while (!list_empty(&card->controls)) { control = snd_kcontrol(card->controls.next); __snd_ctl_remove(card, control, false); } #ifdef CONFIG_SND_CTL_FAST_LOOKUP xa_destroy(&card->ctl_numids); xa_destroy(&card->ctl_hash); #endif } put_device(card->ctl_dev); return 0; } /* * create control core: * called from init.c */ int snd_ctl_create(struct snd_card *card) { static const struct snd_device_ops ops = { .dev_free = snd_ctl_dev_free, .dev_register = snd_ctl_dev_register, .dev_disconnect = snd_ctl_dev_disconnect, }; int err; if (snd_BUG_ON(!card)) return -ENXIO; if (snd_BUG_ON(card->number < 0 || card->number >= SNDRV_CARDS)) return -ENXIO; err = snd_device_alloc(&card->ctl_dev, card); if (err < 0) return err; dev_set_name(card->ctl_dev, "controlC%d", card->number); err = snd_device_new(card, SNDRV_DEV_CONTROL, card, &ops); if (err < 0) put_device(card->ctl_dev); return err; } /* * Frequently used control callbacks/helpers */ /** * snd_ctl_boolean_mono_info - Helper function for a standard boolean info * callback with a mono channel * @kcontrol: the kcontrol instance * @uinfo: info to store * * This is a function that can be used as info callback for a standard * boolean control with a single mono channel. * * Return: Zero (always successful) */ int snd_ctl_boolean_mono_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { uinfo->type = SNDRV_CTL_ELEM_TYPE_BOOLEAN; uinfo->count = 1; uinfo->value.integer.min = 0; uinfo->value.integer.max = 1; return 0; } EXPORT_SYMBOL(snd_ctl_boolean_mono_info); /** * snd_ctl_boolean_stereo_info - Helper function for a standard boolean info * callback with stereo two channels * @kcontrol: the kcontrol instance * @uinfo: info to store * * This is a function that can be used as info callback for a standard * boolean control with stereo two channels. * * Return: Zero (always successful) */ int snd_ctl_boolean_stereo_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { uinfo->type = SNDRV_CTL_ELEM_TYPE_BOOLEAN; uinfo->count = 2; uinfo->value.integer.min = 0; uinfo->value.integer.max = 1; return 0; } EXPORT_SYMBOL(snd_ctl_boolean_stereo_info); /** * snd_ctl_enum_info - fills the info structure for an enumerated control * @info: the structure to be filled * @channels: the number of the control's channels; often one * @items: the number of control values; also the size of @names * @names: an array containing the names of all control values * * Sets all required fields in @info to their appropriate values. * If the control's accessibility is not the default (readable and writable), * the caller has to fill @info->access. * * Return: Zero (always successful) */ int snd_ctl_enum_info(struct snd_ctl_elem_info *info, unsigned int channels, unsigned int items, const char *const names[]) { info->type = SNDRV_CTL_ELEM_TYPE_ENUMERATED; info->count = channels; info->value.enumerated.items = items; if (!items) return 0; if (info->value.enumerated.item >= items) info->value.enumerated.item = items - 1; WARN(strlen(names[info->value.enumerated.item]) >= sizeof(info->value.enumerated.name), "ALSA: too long item name '%s'\n", names[info->value.enumerated.item]); strscpy(info->value.enumerated.name, names[info->value.enumerated.item], sizeof(info->value.enumerated.name)); return 0; } EXPORT_SYMBOL(snd_ctl_enum_info); |
| 2 2 2 1 1 1 1 1 2 2 2 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 | // SPDX-License-Identifier: GPL-2.0-only /* * This file is part of UBIFS. * * Copyright (C) 2006-2008 Nokia Corporation. * * Authors: Artem Bityutskiy (Битюцкий Артём) * Adrian Hunter */ /* * This file implements UBIFS initialization and VFS superblock operations. Some * initialization stuff which is rather large and complex is placed at * corresponding subsystems, but most of it is here. */ #include <linux/init.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/ctype.h> #include <linux/kthread.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/seq_file.h> #include <linux/math64.h> #include <linux/writeback.h> #include "ubifs.h" static int ubifs_default_version_set(const char *val, const struct kernel_param *kp) { int n = 0, ret; ret = kstrtoint(val, 10, &n); if (ret != 0 || n < 4 || n > UBIFS_FORMAT_VERSION) return -EINVAL; return param_set_int(val, kp); } static const struct kernel_param_ops ubifs_default_version_ops = { .set = ubifs_default_version_set, .get = param_get_int, }; int ubifs_default_version = UBIFS_FORMAT_VERSION; module_param_cb(default_version, &ubifs_default_version_ops, &ubifs_default_version, 0600); /* * Maximum amount of memory we may 'kmalloc()' without worrying that we are * allocating too much. */ #define UBIFS_KMALLOC_OK (128*1024) /* Slab cache for UBIFS inodes */ static struct kmem_cache *ubifs_inode_slab; /* UBIFS TNC shrinker description */ static struct shrinker *ubifs_shrinker_info; /** * validate_inode - validate inode. * @c: UBIFS file-system description object * @inode: the inode to validate * * This is a helper function for 'ubifs_iget()' which validates various fields * of a newly built inode to make sure they contain sane values and prevent * possible vulnerabilities. Returns zero if the inode is all right and * a non-zero error code if not. */ static int validate_inode(struct ubifs_info *c, const struct inode *inode) { int err; const struct ubifs_inode *ui = ubifs_inode(inode); if (inode->i_size > c->max_inode_sz) { ubifs_err(c, "inode is too large (%lld)", (long long)inode->i_size); return 1; } if (ui->compr_type >= UBIFS_COMPR_TYPES_CNT) { ubifs_err(c, "unknown compression type %d", ui->compr_type); return 2; } if (ui->xattr_names + ui->xattr_cnt > XATTR_LIST_MAX) return 3; if (ui->data_len < 0 || ui->data_len > UBIFS_MAX_INO_DATA) return 4; if (ui->xattr && !S_ISREG(inode->i_mode)) return 5; if (!ubifs_compr_present(c, ui->compr_type)) { ubifs_warn(c, "inode %llu uses '%s' compression, but it was not compiled in", inode->i_ino, ubifs_compr_name(c, ui->compr_type)); } err = dbg_check_dir(c, inode); return err; } struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) { int err; union ubifs_key key; struct ubifs_ino_node *ino; struct ubifs_info *c = sb->s_fs_info; struct inode *inode; struct ubifs_inode *ui; dbg_gen("inode %lu", inum); inode = iget_locked(sb, inum); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode_state_read_once(inode) & I_NEW)) return inode; ui = ubifs_inode(inode); ino = kmalloc(UBIFS_MAX_INO_NODE_SZ, GFP_NOFS); if (!ino) { err = -ENOMEM; goto out; } ino_key_init(c, &key, inode->i_ino); err = ubifs_tnc_lookup(c, &key, ino); if (err) goto out_ino; inode->i_flags |= S_NOCMTIME; if (!IS_ENABLED(CONFIG_UBIFS_ATIME_SUPPORT)) inode->i_flags |= S_NOATIME; set_nlink(inode, le32_to_cpu(ino->nlink)); i_uid_write(inode, le32_to_cpu(ino->uid)); i_gid_write(inode, le32_to_cpu(ino->gid)); inode_set_atime(inode, (int64_t)le64_to_cpu(ino->atime_sec), le32_to_cpu(ino->atime_nsec)); inode_set_mtime(inode, (int64_t)le64_to_cpu(ino->mtime_sec), le32_to_cpu(ino->mtime_nsec)); inode_set_ctime(inode, (int64_t)le64_to_cpu(ino->ctime_sec), le32_to_cpu(ino->ctime_nsec)); inode->i_mode = le32_to_cpu(ino->mode); inode->i_size = le64_to_cpu(ino->size); ui->data_len = le32_to_cpu(ino->data_len); ui->flags = le32_to_cpu(ino->flags); ui->compr_type = le16_to_cpu(ino->compr_type); ui->creat_sqnum = le64_to_cpu(ino->creat_sqnum); ui->xattr_cnt = le32_to_cpu(ino->xattr_cnt); ui->xattr_size = le32_to_cpu(ino->xattr_size); ui->xattr_names = le32_to_cpu(ino->xattr_names); ui->synced_i_size = ui->ui_size = inode->i_size; ui->xattr = (ui->flags & UBIFS_XATTR_FL) ? 1 : 0; err = validate_inode(c, inode); if (err) goto out_invalid; switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_mapping->a_ops = &ubifs_file_address_operations; inode->i_op = &ubifs_file_inode_operations; inode->i_fop = &ubifs_file_operations; if (ui->xattr) { ui->data = kmalloc(ui->data_len + 1, GFP_NOFS); if (!ui->data) { err = -ENOMEM; goto out_ino; } memcpy(ui->data, ino->data, ui->data_len); ((char *)ui->data)[ui->data_len] = '\0'; } else if (ui->data_len != 0) { err = 10; goto out_invalid; } break; case S_IFDIR: inode->i_op = &ubifs_dir_inode_operations; inode->i_fop = &ubifs_dir_operations; if (ui->data_len != 0) { err = 11; goto out_invalid; } break; case S_IFLNK: inode->i_op = &ubifs_symlink_inode_operations; if (ui->data_len <= 0 || ui->data_len > UBIFS_MAX_INO_DATA) { err = 12; goto out_invalid; } ui->data = kmalloc(ui->data_len + 1, GFP_NOFS); if (!ui->data) { err = -ENOMEM; goto out_ino; } memcpy(ui->data, ino->data, ui->data_len); ((char *)ui->data)[ui->data_len] = '\0'; break; case S_IFBLK: case S_IFCHR: { dev_t rdev; union ubifs_dev_desc *dev; ui->data = kmalloc_obj(union ubifs_dev_desc, GFP_NOFS); if (!ui->data) { err = -ENOMEM; goto out_ino; } dev = (union ubifs_dev_desc *)ino->data; if (ui->data_len == sizeof(dev->new)) rdev = new_decode_dev(le32_to_cpu(dev->new)); else if (ui->data_len == sizeof(dev->huge)) rdev = huge_decode_dev(le64_to_cpu(dev->huge)); else { err = 13; goto out_invalid; } memcpy(ui->data, ino->data, ui->data_len); inode->i_op = &ubifs_file_inode_operations; init_special_inode(inode, inode->i_mode, rdev); break; } case S_IFSOCK: case S_IFIFO: inode->i_op = &ubifs_file_inode_operations; init_special_inode(inode, inode->i_mode, 0); if (ui->data_len != 0) { err = 14; goto out_invalid; } break; default: err = 15; goto out_invalid; } kfree(ino); ubifs_set_inode_flags(inode); unlock_new_inode(inode); return inode; out_invalid: ubifs_err(c, "inode %llu validation failed, error %d", inode->i_ino, err); ubifs_dump_node(c, ino, UBIFS_MAX_INO_NODE_SZ); ubifs_dump_inode(c, inode); err = -EINVAL; out_ino: kfree(ino); out: ubifs_err(c, "failed to read inode %llu, error %d", inode->i_ino, err); iget_failed(inode); return ERR_PTR(err); } static struct inode *ubifs_alloc_inode(struct super_block *sb) { struct ubifs_inode *ui; ui = alloc_inode_sb(sb, ubifs_inode_slab, GFP_NOFS); if (!ui) return NULL; memset((void *)ui + sizeof(struct inode), 0, sizeof(struct ubifs_inode) - sizeof(struct inode)); mutex_init(&ui->ui_mutex); init_rwsem(&ui->xattr_sem); spin_lock_init(&ui->ui_lock); return &ui->vfs_inode; }; static void ubifs_free_inode(struct inode *inode) { struct ubifs_inode *ui = ubifs_inode(inode); kfree(ui->data); fscrypt_free_inode(inode); kmem_cache_free(ubifs_inode_slab, ui); } /* * Note, Linux write-back code calls this without 'i_mutex'. */ static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc) { int err = 0; struct ubifs_info *c = inode->i_sb->s_fs_info; struct ubifs_inode *ui = ubifs_inode(inode); ubifs_assert(c, !ui->xattr); if (is_bad_inode(inode)) return 0; mutex_lock(&ui->ui_mutex); /* * Due to races between write-back forced by budgeting * (see 'sync_some_inodes()') and background write-back, the inode may * have already been synchronized, do not do this again. This might * also happen if it was synchronized in an VFS operation, e.g. * 'ubifs_link()'. */ if (!ui->dirty) { mutex_unlock(&ui->ui_mutex); return 0; } /* * As an optimization, do not write orphan inodes to the media just * because this is not needed. */ dbg_gen("inode %llu, mode %#x, nlink %u", inode->i_ino, (int)inode->i_mode, inode->i_nlink); if (inode->i_nlink) { err = ubifs_jnl_write_inode(c, inode); if (err) ubifs_err(c, "can't write inode %llu, error %d", inode->i_ino, err); else err = dbg_check_inode_size(c, inode, ui->ui_size); } ui->dirty = 0; mutex_unlock(&ui->ui_mutex); ubifs_release_dirty_inode_budget(c, ui); return err; } static int ubifs_drop_inode(struct inode *inode) { int drop = inode_generic_drop(inode); if (!drop) drop = fscrypt_drop_inode(inode); return drop; } static void ubifs_evict_inode(struct inode *inode) { int err; struct ubifs_info *c = inode->i_sb->s_fs_info; struct ubifs_inode *ui = ubifs_inode(inode); if (ui->xattr) /* * Extended attribute inode deletions are fully handled in * 'ubifs_removexattr()'. These inodes are special and have * limited usage, so there is nothing to do here. */ goto out; dbg_gen("inode %llu, mode %#x", inode->i_ino, (int)inode->i_mode); ubifs_assert(c, !icount_read(inode)); truncate_inode_pages_final(&inode->i_data); if (inode->i_nlink) goto done; if (is_bad_inode(inode)) goto out; ui->ui_size = inode->i_size = 0; err = ubifs_jnl_delete_inode(c, inode); if (err) /* * Worst case we have a lost orphan inode wasting space, so a * simple error message is OK here. */ ubifs_err(c, "can't delete inode %llu, error %d", inode->i_ino, err); out: if (ui->dirty) ubifs_release_dirty_inode_budget(c, ui); else { /* We've deleted something - clean the "no space" flags */ c->bi.nospace = c->bi.nospace_rp = 0; smp_wmb(); } done: clear_inode(inode); fscrypt_put_encryption_info(inode); } static void ubifs_dirty_inode(struct inode *inode, int flags) { struct ubifs_info *c = inode->i_sb->s_fs_info; struct ubifs_inode *ui = ubifs_inode(inode); ubifs_assert(c, mutex_is_locked(&ui->ui_mutex)); if (!ui->dirty) { ui->dirty = 1; dbg_gen("inode %llu", inode->i_ino); } } static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct ubifs_info *c = dentry->d_sb->s_fs_info; unsigned long long free; __le32 *uuid = (__le32 *)c->uuid; free = ubifs_get_free_space(c); dbg_gen("free space %lld bytes (%lld blocks)", free, free >> UBIFS_BLOCK_SHIFT); buf->f_type = UBIFS_SUPER_MAGIC; buf->f_bsize = UBIFS_BLOCK_SIZE; buf->f_blocks = c->block_cnt; buf->f_bfree = free >> UBIFS_BLOCK_SHIFT; if (free > c->report_rp_size) buf->f_bavail = (free - c->report_rp_size) >> UBIFS_BLOCK_SHIFT; else buf->f_bavail = 0; buf->f_files = 0; buf->f_ffree = 0; buf->f_namelen = UBIFS_MAX_NLEN; buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]); buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]); ubifs_assert(c, buf->f_bfree <= c->block_cnt); return 0; } static int ubifs_show_options(struct seq_file *s, struct dentry *root) { struct ubifs_info *c = root->d_sb->s_fs_info; if (c->mount_opts.unmount_mode == 2) seq_puts(s, ",fast_unmount"); else if (c->mount_opts.unmount_mode == 1) seq_puts(s, ",norm_unmount"); if (c->mount_opts.bulk_read == 2) seq_puts(s, ",bulk_read"); else if (c->mount_opts.bulk_read == 1) seq_puts(s, ",no_bulk_read"); if (c->mount_opts.chk_data_crc == 2) seq_puts(s, ",chk_data_crc"); else if (c->mount_opts.chk_data_crc == 1) seq_puts(s, ",no_chk_data_crc"); if (c->mount_opts.override_compr) { seq_printf(s, ",compr=%s", ubifs_compr_name(c, c->mount_opts.compr_type)); } seq_printf(s, ",assert=%s", ubifs_assert_action_name(c)); seq_printf(s, ",ubi=%d,vol=%d", c->vi.ubi_num, c->vi.vol_id); return 0; } static int ubifs_sync_fs(struct super_block *sb, int wait) { int i, err; struct ubifs_info *c = sb->s_fs_info; /* * Zero @wait is just an advisory thing to help the file system shove * lots of data into the queues, and there will be the second * '->sync_fs()' call, with non-zero @wait. */ if (!wait) return 0; /* * Synchronize write buffers, because 'ubifs_run_commit()' does not * do this if it waits for an already running commit. */ for (i = 0; i < c->jhead_cnt; i++) { err = ubifs_wbuf_sync(&c->jheads[i].wbuf); if (err) return err; } /* * Strictly speaking, it is not necessary to commit the journal here, * synchronizing write-buffers would be enough. But committing makes * UBIFS free space predictions much more accurate, so we want to let * the user be able to get more accurate results of 'statfs()' after * they synchronize the file system. */ err = ubifs_run_commit(c); if (err) return err; return ubi_sync(c->vi.ubi_num); } /** * init_constants_early - initialize UBIFS constants. * @c: UBIFS file-system description object * * This function initialize UBIFS constants which do not need the superblock to * be read. It also checks that the UBI volume satisfies basic UBIFS * requirements. Returns zero in case of success and a negative error code in * case of failure. */ static int init_constants_early(struct ubifs_info *c) { if (c->vi.corrupted) { ubifs_warn(c, "UBI volume is corrupted - read-only mode"); c->ro_media = 1; } if (c->di.ro_mode) { ubifs_msg(c, "read-only UBI device"); c->ro_media = 1; } if (c->vi.vol_type == UBI_STATIC_VOLUME) { ubifs_msg(c, "static UBI volume - read-only mode"); c->ro_media = 1; } c->leb_cnt = c->vi.size; c->leb_size = c->vi.usable_leb_size; c->leb_start = c->di.leb_start; c->half_leb_size = c->leb_size / 2; c->min_io_size = c->di.min_io_size; c->min_io_shift = fls(c->min_io_size) - 1; c->max_write_size = c->di.max_write_size; c->max_write_shift = fls(c->max_write_size) - 1; if (c->leb_size < UBIFS_MIN_LEB_SZ) { ubifs_errc(c, "too small LEBs (%d bytes), min. is %d bytes", c->leb_size, UBIFS_MIN_LEB_SZ); return -EINVAL; } if (c->leb_cnt < UBIFS_MIN_LEB_CNT) { ubifs_errc(c, "too few LEBs (%d), min. is %d", c->leb_cnt, UBIFS_MIN_LEB_CNT); return -EINVAL; } if (!is_power_of_2(c->min_io_size)) { ubifs_errc(c, "bad min. I/O size %d", c->min_io_size); return -EINVAL; } /* * Maximum write size has to be greater or equivalent to min. I/O * size, and be multiple of min. I/O size. */ if (c->max_write_size < c->min_io_size || c->max_write_size % c->min_io_size || !is_power_of_2(c->max_write_size)) { ubifs_errc(c, "bad write buffer size %d for %d min. I/O unit", c->max_write_size, c->min_io_size); return -EINVAL; } /* * UBIFS aligns all node to 8-byte boundary, so to make function in * io.c simpler, assume minimum I/O unit size to be 8 bytes if it is * less than 8. */ if (c->min_io_size < 8) { c->min_io_size = 8; c->min_io_shift = 3; if (c->max_write_size < c->min_io_size) { c->max_write_size = c->min_io_size; c->max_write_shift = c->min_io_shift; } } c->ref_node_alsz = ALIGN(UBIFS_REF_NODE_SZ, c->min_io_size); c->mst_node_alsz = ALIGN(UBIFS_MST_NODE_SZ, c->min_io_size); /* * Initialize node length ranges which are mostly needed for node * length validation. */ c->ranges[UBIFS_PAD_NODE].len = UBIFS_PAD_NODE_SZ; c->ranges[UBIFS_SB_NODE].len = UBIFS_SB_NODE_SZ; c->ranges[UBIFS_MST_NODE].len = UBIFS_MST_NODE_SZ; c->ranges[UBIFS_REF_NODE].len = UBIFS_REF_NODE_SZ; c->ranges[UBIFS_TRUN_NODE].len = UBIFS_TRUN_NODE_SZ; c->ranges[UBIFS_CS_NODE].len = UBIFS_CS_NODE_SZ; c->ranges[UBIFS_AUTH_NODE].min_len = UBIFS_AUTH_NODE_SZ; c->ranges[UBIFS_AUTH_NODE].max_len = UBIFS_AUTH_NODE_SZ + UBIFS_MAX_HMAC_LEN; c->ranges[UBIFS_SIG_NODE].min_len = UBIFS_SIG_NODE_SZ; c->ranges[UBIFS_SIG_NODE].max_len = c->leb_size - UBIFS_SB_NODE_SZ; c->ranges[UBIFS_INO_NODE].min_len = UBIFS_INO_NODE_SZ; c->ranges[UBIFS_INO_NODE].max_len = UBIFS_MAX_INO_NODE_SZ; c->ranges[UBIFS_ORPH_NODE].min_len = UBIFS_ORPH_NODE_SZ + sizeof(__le64); c->ranges[UBIFS_ORPH_NODE].max_len = c->leb_size; c->ranges[UBIFS_DENT_NODE].min_len = UBIFS_DENT_NODE_SZ; c->ranges[UBIFS_DENT_NODE].max_len = UBIFS_MAX_DENT_NODE_SZ; c->ranges[UBIFS_XENT_NODE].min_len = UBIFS_XENT_NODE_SZ; c->ranges[UBIFS_XENT_NODE].max_len = UBIFS_MAX_XENT_NODE_SZ; c->ranges[UBIFS_DATA_NODE].min_len = UBIFS_DATA_NODE_SZ; c->ranges[UBIFS_DATA_NODE].max_len = UBIFS_MAX_DATA_NODE_SZ; /* * Minimum indexing node size is amended later when superblock is * read and the key length is known. */ c->ranges[UBIFS_IDX_NODE].min_len = UBIFS_IDX_NODE_SZ + UBIFS_BRANCH_SZ; /* * Maximum indexing node size is amended later when superblock is * read and the fanout is known. */ c->ranges[UBIFS_IDX_NODE].max_len = INT_MAX; /* * Initialize dead and dark LEB space watermarks. See gc.c for comments * about these values. */ c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size); c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size); /* * Calculate how many bytes would be wasted at the end of LEB if it was * fully filled with data nodes of maximum size. This is used in * calculations when reporting free space. */ c->leb_overhead = c->leb_size % UBIFS_MAX_DATA_NODE_SZ; /* Buffer size for bulk-reads */ c->max_bu_buf_len = UBIFS_MAX_BULK_READ * UBIFS_MAX_DATA_NODE_SZ; if (c->max_bu_buf_len > c->leb_size) c->max_bu_buf_len = c->leb_size; /* Log is ready, preserve one LEB for commits. */ c->min_log_bytes = c->leb_size; return 0; } /** * bud_wbuf_callback - bud LEB write-buffer synchronization call-back. * @c: UBIFS file-system description object * @lnum: LEB the write-buffer was synchronized to * @free: how many free bytes left in this LEB * @pad: how many bytes were padded * * This is a callback function which is called by the I/O unit when the * write-buffer is synchronized. We need this to correctly maintain space * accounting in bud logical eraseblocks. This function returns zero in case of * success and a negative error code in case of failure. * * This function actually belongs to the journal, but we keep it here because * we want to keep it static. */ static int bud_wbuf_callback(struct ubifs_info *c, int lnum, int free, int pad) { return ubifs_update_one_lp(c, lnum, free, pad, 0, 0); } /* * init_constants_sb - initialize UBIFS constants. * @c: UBIFS file-system description object * * This is a helper function which initializes various UBIFS constants after * the superblock has been read. It also checks various UBIFS parameters and * makes sure they are all right. Returns zero in case of success and a * negative error code in case of failure. */ static int init_constants_sb(struct ubifs_info *c) { int tmp, err; long long tmp64; c->main_bytes = (long long)c->main_lebs * c->leb_size; c->max_znode_sz = sizeof(struct ubifs_znode) + c->fanout * sizeof(struct ubifs_zbranch); tmp = ubifs_idx_node_sz(c, 1); c->ranges[UBIFS_IDX_NODE].min_len = tmp; c->min_idx_node_sz = ALIGN(tmp, 8); tmp = ubifs_idx_node_sz(c, c->fanout); c->ranges[UBIFS_IDX_NODE].max_len = tmp; c->max_idx_node_sz = ALIGN(tmp, 8); /* Make sure LEB size is large enough to fit full commit */ tmp = UBIFS_CS_NODE_SZ + UBIFS_REF_NODE_SZ * c->jhead_cnt; tmp = ALIGN(tmp, c->min_io_size); if (tmp > c->leb_size) { ubifs_err(c, "too small LEB size %d, at least %d needed", c->leb_size, tmp); return -EINVAL; } /* * Make sure that the log is large enough to fit reference nodes for * all buds plus one reserved LEB. */ tmp64 = c->max_bud_bytes + c->leb_size - 1; c->max_bud_cnt = div_u64(tmp64, c->leb_size); tmp = (c->ref_node_alsz * c->max_bud_cnt + c->leb_size - 1); tmp /= c->leb_size; tmp += 1; if (c->log_lebs < tmp) { ubifs_err(c, "too small log %d LEBs, required min. %d LEBs", c->log_lebs, tmp); return -EINVAL; } /* * When budgeting we assume worst-case scenarios when the pages are not * be compressed and direntries are of the maximum size. * * Note, data, which may be stored in inodes is budgeted separately, so * it is not included into 'c->bi.inode_budget'. */ c->bi.page_budget = UBIFS_MAX_DATA_NODE_SZ * UBIFS_BLOCKS_PER_PAGE; c->bi.inode_budget = UBIFS_INO_NODE_SZ; c->bi.dent_budget = UBIFS_MAX_DENT_NODE_SZ; /* * When the amount of flash space used by buds becomes * 'c->max_bud_bytes', UBIFS just blocks all writers and starts commit. * The writers are unblocked when the commit is finished. To avoid * writers to be blocked UBIFS initiates background commit in advance, * when number of bud bytes becomes above the limit defined below. */ c->bg_bud_bytes = (c->max_bud_bytes * 13) >> 4; /* * Ensure minimum journal size. All the bytes in the journal heads are * considered to be used, when calculating the current journal usage. * Consequently, if the journal is too small, UBIFS will treat it as * always full. */ tmp64 = (long long)(c->jhead_cnt + 1) * c->leb_size + 1; if (c->bg_bud_bytes < tmp64) c->bg_bud_bytes = tmp64; if (c->max_bud_bytes < tmp64 + c->leb_size) c->max_bud_bytes = tmp64 + c->leb_size; err = ubifs_calc_lpt_geom(c); if (err) return err; /* Initialize effective LEB size used in budgeting calculations */ c->idx_leb_size = c->leb_size - c->max_idx_node_sz; return 0; } /* * init_constants_master - initialize UBIFS constants. * @c: UBIFS file-system description object * * This is a helper function which initializes various UBIFS constants after * the master node has been read. It also checks various UBIFS parameters and * makes sure they are all right. */ static void init_constants_master(struct ubifs_info *c) { long long tmp64; c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); c->report_rp_size = ubifs_reported_space(c, c->rp_size); /* * Calculate total amount of FS blocks. This number is not used * internally because it does not make much sense for UBIFS, but it is * necessary to report something for the 'statfs()' call. * * Subtract the LEB reserved for GC, the LEB which is reserved for * deletions, minimum LEBs for the index, the LEBs which are reserved * for each journal head. */ tmp64 = c->main_lebs - 1 - 1 - MIN_INDEX_LEBS - c->jhead_cnt; tmp64 *= (long long)c->leb_size - c->leb_overhead; tmp64 = ubifs_reported_space(c, tmp64); c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT; } /** * take_gc_lnum - reserve GC LEB. * @c: UBIFS file-system description object * * This function ensures that the LEB reserved for garbage collection is marked * as "taken" in lprops. We also have to set free space to LEB size and dirty * space to zero, because lprops may contain out-of-date information if the * file-system was un-mounted before it has been committed. This function * returns zero in case of success and a negative error code in case of * failure. */ static int take_gc_lnum(struct ubifs_info *c) { int err; if (c->gc_lnum == -1) { ubifs_err(c, "no LEB for GC"); return -EINVAL; } /* And we have to tell lprops that this LEB is taken */ err = ubifs_change_one_lp(c, c->gc_lnum, c->leb_size, 0, LPROPS_TAKEN, 0, 0); return err; } /** * alloc_wbufs - allocate write-buffers. * @c: UBIFS file-system description object * * This helper function allocates and initializes UBIFS write-buffers. Returns * zero in case of success and %-ENOMEM in case of failure. */ static int alloc_wbufs(struct ubifs_info *c) { int i, err; c->jheads = kzalloc_objs(struct ubifs_jhead, c->jhead_cnt); if (!c->jheads) return -ENOMEM; /* Initialize journal heads */ for (i = 0; i < c->jhead_cnt; i++) { INIT_LIST_HEAD(&c->jheads[i].buds_list); err = ubifs_wbuf_init(c, &c->jheads[i].wbuf); if (err) goto out_wbuf; c->jheads[i].wbuf.sync_callback = &bud_wbuf_callback; c->jheads[i].wbuf.jhead = i; c->jheads[i].grouped = 1; c->jheads[i].log_hash = ubifs_hash_get_desc(c); if (IS_ERR(c->jheads[i].log_hash)) { err = PTR_ERR(c->jheads[i].log_hash); goto out_log_hash; } } /* * Garbage Collector head does not need to be synchronized by timer. * Also GC head nodes are not grouped. */ c->jheads[GCHD].wbuf.no_timer = 1; c->jheads[GCHD].grouped = 0; return 0; out_log_hash: kfree(c->jheads[i].wbuf.buf); kfree(c->jheads[i].wbuf.inodes); out_wbuf: while (i--) { kfree(c->jheads[i].wbuf.buf); kfree(c->jheads[i].wbuf.inodes); kfree(c->jheads[i].log_hash); } kfree(c->jheads); c->jheads = NULL; return err; } /** * free_wbufs - free write-buffers. * @c: UBIFS file-system description object */ static void free_wbufs(struct ubifs_info *c) { int i; if (c->jheads) { for (i = 0; i < c->jhead_cnt; i++) { kfree(c->jheads[i].wbuf.buf); kfree(c->jheads[i].wbuf.inodes); kfree(c->jheads[i].log_hash); } kfree(c->jheads); c->jheads = NULL; } } /** * free_orphans - free orphans. * @c: UBIFS file-system description object */ static void free_orphans(struct ubifs_info *c) { struct ubifs_orphan *orph; while (c->orph_dnext) { orph = c->orph_dnext; c->orph_dnext = orph->dnext; list_del(&orph->list); kfree(orph); } while (!list_empty(&c->orph_list)) { orph = list_entry(c->orph_list.next, struct ubifs_orphan, list); list_del(&orph->list); kfree(orph); ubifs_err(c, "orphan list not empty at unmount"); } vfree(c->orph_buf); c->orph_buf = NULL; } /** * free_buds - free per-bud objects. * @c: UBIFS file-system description object */ static void free_buds(struct ubifs_info *c) { struct ubifs_bud *bud, *n; rbtree_postorder_for_each_entry_safe(bud, n, &c->buds, rb) { kfree(bud->log_hash); kfree(bud); } } /** * check_volume_empty - check if the UBI volume is empty. * @c: UBIFS file-system description object * * This function checks if the UBIFS volume is empty by looking if its LEBs are * mapped or not. The result of checking is stored in the @c->empty variable. * Returns zero in case of success and a negative error code in case of * failure. */ static int check_volume_empty(struct ubifs_info *c) { int lnum, err; c->empty = 1; for (lnum = 0; lnum < c->leb_cnt; lnum++) { err = ubifs_is_mapped(c, lnum); if (unlikely(err < 0)) return err; if (err == 1) { c->empty = 0; break; } cond_resched(); } return 0; } /* * UBIFS mount options. * * Opt_fast_unmount: do not run a journal commit before un-mounting * Opt_norm_unmount: run a journal commit before un-mounting * Opt_bulk_read: enable bulk-reads * Opt_no_bulk_read: disable bulk-reads * Opt_chk_data_crc: check CRCs when reading data nodes * Opt_no_chk_data_crc: do not check CRCs when reading data nodes * Opt_override_compr: override default compressor * Opt_assert: set ubifs_assert() action * Opt_auth_key: The key name used for authentication * Opt_auth_hash_name: The hash type used for authentication * Opt_err: just end of array marker */ enum { Opt_fast_unmount, Opt_norm_unmount, Opt_bulk_read, Opt_no_bulk_read, Opt_chk_data_crc, Opt_no_chk_data_crc, Opt_override_compr, Opt_assert, Opt_auth_key, Opt_auth_hash_name, Opt_ignore, }; static const struct constant_table ubifs_param_compr[] = { { "none", UBIFS_COMPR_NONE }, { "lzo", UBIFS_COMPR_LZO }, { "zlib", UBIFS_COMPR_ZLIB }, { "zstd", UBIFS_COMPR_ZSTD }, {} }; static const struct constant_table ubifs_param_assert[] = { { "report", ASSACT_REPORT }, { "read-only", ASSACT_RO }, { "panic", ASSACT_PANIC }, {} }; static const struct fs_parameter_spec ubifs_fs_param_spec[] = { fsparam_flag ("fast_unmount", Opt_fast_unmount), fsparam_flag ("norm_unmount", Opt_norm_unmount), fsparam_flag ("bulk_read", Opt_bulk_read), fsparam_flag ("no_bulk_read", Opt_no_bulk_read), fsparam_flag ("chk_data_crc", Opt_chk_data_crc), fsparam_flag ("no_chk_data_crc", Opt_no_chk_data_crc), fsparam_enum ("compr", Opt_override_compr, ubifs_param_compr), fsparam_enum ("assert", Opt_assert, ubifs_param_assert), fsparam_string ("auth_key", Opt_auth_key), fsparam_string ("auth_hash_name", Opt_auth_hash_name), fsparam_string ("ubi", Opt_ignore), fsparam_string ("vol", Opt_ignore), {} }; struct ubifs_fs_context { struct ubifs_mount_opts mount_opts; char *auth_key_name; char *auth_hash_name; unsigned int no_chk_data_crc:1; unsigned int bulk_read:1; unsigned int default_compr:2; unsigned int assert_action:2; }; /** * ubifs_parse_param - parse a parameter. * @fc: the filesystem context * @param: the parameter to parse * * This function parses UBIFS mount options and returns zero in case success * and a negative error code in case of failure. */ static int ubifs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct ubifs_fs_context *ctx = fc->fs_private; struct fs_parse_result result; bool is_remount = (fc->purpose & FS_CONTEXT_FOR_RECONFIGURE); int opt; opt = fs_parse(fc, ubifs_fs_param_spec, param, &result); if (opt < 0) return opt; switch (opt) { /* * %Opt_fast_unmount and %Opt_norm_unmount options are ignored. * We accept them in order to be backward-compatible. But this * should be removed at some point. */ case Opt_fast_unmount: ctx->mount_opts.unmount_mode = 2; break; case Opt_norm_unmount: ctx->mount_opts.unmount_mode = 1; break; case Opt_bulk_read: ctx->mount_opts.bulk_read = 2; ctx->bulk_read = 1; break; case Opt_no_bulk_read: ctx->mount_opts.bulk_read = 1; ctx->bulk_read = 0; break; case Opt_chk_data_crc: ctx->mount_opts.chk_data_crc = 2; ctx->no_chk_data_crc = 0; break; case Opt_no_chk_data_crc: ctx->mount_opts.chk_data_crc = 1; ctx->no_chk_data_crc = 1; break; case Opt_override_compr: ctx->mount_opts.compr_type = result.uint_32; ctx->mount_opts.override_compr = 1; ctx->default_compr = ctx->mount_opts.compr_type; break; case Opt_assert: ctx->assert_action = result.uint_32; break; case Opt_auth_key: if (!is_remount) { kfree(ctx->auth_key_name); ctx->auth_key_name = param->string; param->string = NULL; } break; case Opt_auth_hash_name: if (!is_remount) { kfree(ctx->auth_hash_name); ctx->auth_hash_name = param->string; param->string = NULL; } break; case Opt_ignore: break; } return 0; } /* * ubifs_release_options - release mount parameters which have been dumped. * @c: UBIFS file-system description object */ static void ubifs_release_options(struct ubifs_info *c) { kfree(c->auth_key_name); c->auth_key_name = NULL; kfree(c->auth_hash_name); c->auth_hash_name = NULL; } /** * destroy_journal - destroy journal data structures. * @c: UBIFS file-system description object * * This function destroys journal data structures including those that may have * been created by recovery functions. */ static void destroy_journal(struct ubifs_info *c) { while (!list_empty(&c->unclean_leb_list)) { struct ubifs_unclean_leb *ucleb; ucleb = list_entry(c->unclean_leb_list.next, struct ubifs_unclean_leb, list); list_del(&ucleb->list); kfree(ucleb); } while (!list_empty(&c->old_buds)) { struct ubifs_bud *bud; bud = list_entry(c->old_buds.next, struct ubifs_bud, list); list_del(&bud->list); kfree(bud->log_hash); kfree(bud); } ubifs_destroy_idx_gc(c); ubifs_destroy_size_tree(c); ubifs_tnc_close(c); free_buds(c); } /** * bu_init - initialize bulk-read information. * @c: UBIFS file-system description object */ static void bu_init(struct ubifs_info *c) { ubifs_assert(c, c->bulk_read == 1); if (c->bu.buf) return; /* Already initialized */ again: c->bu.buf = kmalloc(c->max_bu_buf_len, GFP_KERNEL | __GFP_NOWARN); if (!c->bu.buf) { if (c->max_bu_buf_len > UBIFS_KMALLOC_OK) { c->max_bu_buf_len = UBIFS_KMALLOC_OK; goto again; } /* Just disable bulk-read */ ubifs_warn(c, "cannot allocate %d bytes of memory for bulk-read, disabling it", c->max_bu_buf_len); c->mount_opts.bulk_read = 1; c->bulk_read = 0; return; } } /** * check_free_space - check if there is enough free space to mount. * @c: UBIFS file-system description object * * This function makes sure UBIFS has enough free space to be mounted in * read/write mode. UBIFS must always have some free space to allow deletions. */ static int check_free_space(struct ubifs_info *c) { ubifs_assert(c, c->dark_wm > 0); if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) { ubifs_err(c, "insufficient free space to mount in R/W mode"); ubifs_dump_budg(c, &c->bi); ubifs_dump_lprops(c); return -ENOSPC; } return 0; } /** * mount_ubifs - mount UBIFS file-system. * @c: UBIFS file-system description object * * This function mounts UBIFS file system. Returns zero in case of success and * a negative error code in case of failure. */ static int mount_ubifs(struct ubifs_info *c) { int err; long long x, y; size_t sz; c->ro_mount = !!sb_rdonly(c->vfs_sb); /* Suppress error messages while probing if SB_SILENT is set */ c->probing = !!(c->vfs_sb->s_flags & SB_SILENT); err = init_constants_early(c); if (err) return err; err = ubifs_debugging_init(c); if (err) return err; err = ubifs_sysfs_register(c); if (err) goto out_debugging; err = check_volume_empty(c); if (err) goto out_free; if (c->empty && (c->ro_mount || c->ro_media)) { /* * This UBI volume is empty, and read-only, or the file system * is mounted read-only - we cannot format it. */ ubifs_err(c, "can't format empty UBI volume: read-only %s", c->ro_media ? "UBI volume" : "mount"); err = -EROFS; goto out_free; } if (c->ro_media && !c->ro_mount) { ubifs_err(c, "cannot mount read-write - read-only media"); err = -EROFS; goto out_free; } /* * The requirement for the buffer is that it should fit indexing B-tree * height amount of integers. We assume the height if the TNC tree will * never exceed 64. */ err = -ENOMEM; c->bottom_up_buf = kmalloc_objs(int, BOTTOM_UP_HEIGHT); if (!c->bottom_up_buf) goto out_free; c->sbuf = vmalloc(c->leb_size); if (!c->sbuf) goto out_free; if (!c->ro_mount) { c->ileb_buf = vmalloc(c->leb_size); if (!c->ileb_buf) goto out_free; } if (c->bulk_read == 1) bu_init(c); if (!c->ro_mount) { c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ + \ UBIFS_CIPHER_BLOCK_SIZE, GFP_KERNEL); if (!c->write_reserve_buf) goto out_free; } c->mounting = 1; if (c->auth_key_name) { if (IS_ENABLED(CONFIG_UBIFS_FS_AUTHENTICATION)) { err = ubifs_init_authentication(c); if (err) goto out_free; } else { ubifs_err(c, "auth_key_name, but UBIFS is built without" " authentication support"); err = -EINVAL; goto out_free; } } err = ubifs_read_superblock(c); if (err) goto out_auth; c->probing = 0; /* * Make sure the compressor which is set as default in the superblock * or overridden by mount options is actually compiled in. */ if (!ubifs_compr_present(c, c->default_compr)) { ubifs_err(c, "'compressor \"%s\" is not compiled in", ubifs_compr_name(c, c->default_compr)); err = -ENOTSUPP; goto out_auth; } err = init_constants_sb(c); if (err) goto out_auth; sz = ALIGN(c->max_idx_node_sz, c->min_io_size) * 2; c->cbuf = kmalloc(sz, GFP_NOFS); if (!c->cbuf) { err = -ENOMEM; goto out_auth; } err = alloc_wbufs(c); if (err) goto out_cbuf; sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id); if (!c->ro_mount) { /* Create background thread */ c->bgt = kthread_run(ubifs_bg_thread, c, "%s", c->bgt_name); if (IS_ERR(c->bgt)) { err = PTR_ERR(c->bgt); c->bgt = NULL; ubifs_err(c, "cannot spawn \"%s\", error %d", c->bgt_name, err); goto out_wbufs; } } err = ubifs_read_master(c); if (err) goto out_master; init_constants_master(c); if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) { ubifs_msg(c, "recovery needed"); c->need_recovery = 1; } if (c->need_recovery && !c->ro_mount) { err = ubifs_recover_inl_heads(c, c->sbuf); if (err) goto out_master; } err = ubifs_lpt_init(c, 1, !c->ro_mount); if (err) goto out_master; if (!c->ro_mount && c->space_fixup) { err = ubifs_fixup_free_space(c); if (err) goto out_lpt; } if (!c->ro_mount && !c->need_recovery) { /* * Set the "dirty" flag so that if we reboot uncleanly we * will notice this immediately on the next mount. */ c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); err = ubifs_write_master(c); if (err) goto out_lpt; } /* * Handle offline signed images: Now that the master node is * written and its validation no longer depends on the hash * in the superblock, we can update the offline signed * superblock with a HMAC version, */ if (ubifs_authenticated(c) && ubifs_hmac_zero(c, c->sup_node->hmac)) { err = ubifs_hmac_wkm(c, c->sup_node->hmac_wkm); if (err) goto out_lpt; c->superblock_need_write = 1; } if (!c->ro_mount && c->superblock_need_write) { err = ubifs_write_sb_node(c, c->sup_node); if (err) goto out_lpt; c->superblock_need_write = 0; } err = dbg_check_idx_size(c, c->bi.old_idx_sz); if (err) goto out_lpt; err = ubifs_replay_journal(c); if (err) goto out_journal; /* Calculate 'min_idx_lebs' after journal replay */ c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); err = ubifs_mount_orphans(c, c->need_recovery, c->ro_mount); if (err) goto out_orphans; if (!c->ro_mount) { int lnum; err = check_free_space(c); if (err) goto out_orphans; /* Check for enough log space */ lnum = c->lhead_lnum + 1; if (lnum >= UBIFS_LOG_LNUM + c->log_lebs) lnum = UBIFS_LOG_LNUM; if (lnum == c->ltail_lnum) { err = ubifs_consolidate_log(c); if (err) goto out_orphans; } if (c->need_recovery) { if (!ubifs_authenticated(c)) { err = ubifs_recover_size(c, true); if (err) goto out_orphans; } err = ubifs_rcvry_gc_commit(c); if (err) goto out_orphans; if (ubifs_authenticated(c)) { err = ubifs_recover_size(c, false); if (err) goto out_orphans; } } else { err = take_gc_lnum(c); if (err) goto out_orphans; /* * GC LEB may contain garbage if there was an unclean * reboot, and it should be un-mapped. */ err = ubifs_leb_unmap(c, c->gc_lnum); if (err) goto out_orphans; } err = dbg_check_lprops(c); if (err) goto out_orphans; } else if (c->need_recovery) { err = ubifs_recover_size(c, false); if (err) goto out_orphans; } else { /* * Even if we mount read-only, we have to set space in GC LEB * to proper value because this affects UBIFS free space * reporting. We do not want to have a situation when * re-mounting from R/O to R/W changes amount of free space. */ err = take_gc_lnum(c); if (err) goto out_orphans; } spin_lock(&ubifs_infos_lock); list_add_tail(&c->infos_list, &ubifs_infos); spin_unlock(&ubifs_infos_lock); if (c->need_recovery) { if (c->ro_mount) ubifs_msg(c, "recovery deferred"); else { c->need_recovery = 0; ubifs_msg(c, "recovery completed"); /* * GC LEB has to be empty and taken at this point. But * the journal head LEBs may also be accounted as * "empty taken" if they are empty. */ ubifs_assert(c, c->lst.taken_empty_lebs > 0); } } else ubifs_assert(c, c->lst.taken_empty_lebs > 0); err = dbg_check_filesystem(c); if (err) goto out_infos; dbg_debugfs_init_fs(c); c->mounting = 0; ubifs_msg(c, "UBIFS: mounted UBI device %d, volume %d, name \"%s\"%s", c->vi.ubi_num, c->vi.vol_id, c->vi.name, c->ro_mount ? ", R/O mode" : ""); x = (long long)c->main_lebs * c->leb_size; y = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; ubifs_msg(c, "LEB size: %d bytes (%d KiB), min./max. I/O unit sizes: %d bytes/%d bytes", c->leb_size, c->leb_size >> 10, c->min_io_size, c->max_write_size); ubifs_msg(c, "FS size: %lld bytes (%lld MiB, %d LEBs), max %d LEBs, journal size %lld bytes (%lld MiB, %d LEBs)", x, x >> 20, c->main_lebs, c->max_leb_cnt, y, y >> 20, c->log_lebs + c->max_bud_cnt); ubifs_msg(c, "reserved for root: %llu bytes (%llu KiB)", c->report_rp_size, c->report_rp_size >> 10); ubifs_msg(c, "media format: w%d/r%d (latest is w%d/r%d), UUID %pUB%s", c->fmt_version, c->ro_compat_version, UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION, c->uuid, c->big_lpt ? ", big LPT model" : ", small LPT model"); dbg_gen("default compressor: %s", ubifs_compr_name(c, c->default_compr)); dbg_gen("data journal heads: %d", c->jhead_cnt - NONDATA_JHEADS_CNT); dbg_gen("log LEBs: %d (%d - %d)", c->log_lebs, UBIFS_LOG_LNUM, c->log_last); dbg_gen("LPT area LEBs: %d (%d - %d)", c->lpt_lebs, c->lpt_first, c->lpt_last); dbg_gen("orphan area LEBs: %d (%d - %d)", c->orph_lebs, c->orph_first, c->orph_last); dbg_gen("main area LEBs: %d (%d - %d)", c->main_lebs, c->main_first, c->leb_cnt - 1); dbg_gen("index LEBs: %d", c->lst.idx_lebs); dbg_gen("total index bytes: %llu (%llu KiB, %llu MiB)", c->bi.old_idx_sz, c->bi.old_idx_sz >> 10, c->bi.old_idx_sz >> 20); dbg_gen("key hash type: %d", c->key_hash_type); dbg_gen("tree fanout: %d", c->fanout); dbg_gen("reserved GC LEB: %d", c->gc_lnum); dbg_gen("max. znode size %d", c->max_znode_sz); dbg_gen("max. index node size %d", c->max_idx_node_sz); dbg_gen("node sizes: data %zu, inode %zu, dentry %zu", UBIFS_DATA_NODE_SZ, UBIFS_INO_NODE_SZ, UBIFS_DENT_NODE_SZ); dbg_gen("node sizes: trun %zu, sb %zu, master %zu", UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ); dbg_gen("node sizes: ref %zu, cmt. start %zu, orph %zu", UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ); dbg_gen("max. node sizes: data %zu, inode %zu dentry %zu, idx %d", UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ, UBIFS_MAX_DENT_NODE_SZ, ubifs_idx_node_sz(c, c->fanout)); dbg_gen("dead watermark: %d", c->dead_wm); dbg_gen("dark watermark: %d", c->dark_wm); dbg_gen("LEB overhead: %d", c->leb_overhead); x = (long long)c->main_lebs * c->dark_wm; dbg_gen("max. dark space: %lld (%lld KiB, %lld MiB)", x, x >> 10, x >> 20); dbg_gen("maximum bud bytes: %lld (%lld KiB, %lld MiB)", c->max_bud_bytes, c->max_bud_bytes >> 10, c->max_bud_bytes >> 20); dbg_gen("BG commit bud bytes: %lld (%lld KiB, %lld MiB)", c->bg_bud_bytes, c->bg_bud_bytes >> 10, c->bg_bud_bytes >> 20); dbg_gen("current bud bytes %lld (%lld KiB, %lld MiB)", c->bud_bytes, c->bud_bytes >> 10, c->bud_bytes >> 20); dbg_gen("max. seq. number: %llu", c->max_sqnum); dbg_gen("commit number: %llu", c->cmt_no); dbg_gen("max. xattrs per inode: %d", ubifs_xattr_max_cnt(c)); dbg_gen("max orphans: %d", c->max_orphans); return 0; out_infos: spin_lock(&ubifs_infos_lock); list_del(&c->infos_list); spin_unlock(&ubifs_infos_lock); out_orphans: free_orphans(c); out_journal: destroy_journal(c); out_lpt: ubifs_lpt_free(c, 0); out_master: kfree(c->mst_node); kfree(c->rcvrd_mst_node); if (c->bgt) kthread_stop(c->bgt); out_wbufs: free_wbufs(c); out_cbuf: kfree(c->cbuf); out_auth: ubifs_exit_authentication(c); out_free: kfree(c->write_reserve_buf); kfree(c->bu.buf); vfree(c->ileb_buf); vfree(c->sbuf); kfree(c->bottom_up_buf); kfree(c->sup_node); ubifs_sysfs_unregister(c); out_debugging: ubifs_debugging_exit(c); return err; } /** * ubifs_umount - un-mount UBIFS file-system. * @c: UBIFS file-system description object * * Note, this function is called to free allocated resourced when un-mounting, * as well as free resources when an error occurred while we were half way * through mounting (error path cleanup function). So it has to make sure the * resource was actually allocated before freeing it. */ static void ubifs_umount(struct ubifs_info *c) { dbg_gen("un-mounting UBI device %d, volume %d", c->vi.ubi_num, c->vi.vol_id); dbg_debugfs_exit_fs(c); spin_lock(&ubifs_infos_lock); list_del(&c->infos_list); spin_unlock(&ubifs_infos_lock); if (c->bgt) kthread_stop(c->bgt); destroy_journal(c); free_wbufs(c); free_orphans(c); ubifs_lpt_free(c, 0); ubifs_exit_authentication(c); ubifs_release_options(c); kfree(c->cbuf); kfree(c->rcvrd_mst_node); kfree(c->mst_node); kfree(c->write_reserve_buf); kfree(c->bu.buf); vfree(c->ileb_buf); vfree(c->sbuf); kfree(c->bottom_up_buf); kfree(c->sup_node); ubifs_debugging_exit(c); ubifs_sysfs_unregister(c); } /** * ubifs_remount_rw - re-mount in read-write mode. * @c: UBIFS file-system description object * * UBIFS avoids allocating many unnecessary resources when mounted in read-only * mode. This function allocates the needed resources and re-mounts UBIFS in * read-write mode. */ static int ubifs_remount_rw(struct ubifs_info *c) { int err, lnum; if (c->rw_incompat) { ubifs_err(c, "the file-system is not R/W-compatible"); ubifs_msg(c, "on-flash format version is w%d/r%d, but software only supports up to version w%d/r%d", c->fmt_version, c->ro_compat_version, UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION); return -EROFS; } mutex_lock(&c->umount_mutex); dbg_save_space_info(c); c->remounting_rw = 1; c->ro_mount = 0; if (c->space_fixup) { err = ubifs_fixup_free_space(c); if (err) goto out; } err = check_free_space(c); if (err) goto out; if (c->need_recovery) { ubifs_msg(c, "completing deferred recovery"); err = ubifs_write_rcvrd_mst_node(c); if (err) goto out; if (!ubifs_authenticated(c)) { err = ubifs_recover_size(c, true); if (err) goto out; } err = ubifs_clean_lebs(c, c->sbuf); if (err) goto out; err = ubifs_recover_inl_heads(c, c->sbuf); if (err) goto out; } else { /* A readonly mount is not allowed to have orphans */ ubifs_assert(c, c->tot_orphans == 0); err = ubifs_clear_orphans(c); if (err) goto out; } if (!(c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY))) { c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); err = ubifs_write_master(c); if (err) goto out; } if (c->superblock_need_write) { struct ubifs_sb_node *sup = c->sup_node; err = ubifs_write_sb_node(c, sup); if (err) goto out; c->superblock_need_write = 0; } c->ileb_buf = vmalloc(c->leb_size); if (!c->ileb_buf) { err = -ENOMEM; goto out; } c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ + \ UBIFS_CIPHER_BLOCK_SIZE, GFP_KERNEL); if (!c->write_reserve_buf) { err = -ENOMEM; goto out; } err = ubifs_lpt_init(c, 0, 1); if (err) goto out; /* Create background thread */ c->bgt = kthread_run(ubifs_bg_thread, c, "%s", c->bgt_name); if (IS_ERR(c->bgt)) { err = PTR_ERR(c->bgt); c->bgt = NULL; ubifs_err(c, "cannot spawn \"%s\", error %d", c->bgt_name, err); goto out; } c->orph_buf = vmalloc(c->leb_size); if (!c->orph_buf) { err = -ENOMEM; goto out; } /* Check for enough log space */ lnum = c->lhead_lnum + 1; if (lnum >= UBIFS_LOG_LNUM + c->log_lebs) lnum = UBIFS_LOG_LNUM; if (lnum == c->ltail_lnum) { err = ubifs_consolidate_log(c); if (err) goto out; } if (c->need_recovery) { err = ubifs_rcvry_gc_commit(c); if (err) goto out; if (ubifs_authenticated(c)) { err = ubifs_recover_size(c, false); if (err) goto out; } } else { err = ubifs_leb_unmap(c, c->gc_lnum); } if (err) goto out; dbg_gen("re-mounted read-write"); c->remounting_rw = 0; if (c->need_recovery) { c->need_recovery = 0; ubifs_msg(c, "deferred recovery completed"); } else { /* * Do not run the debugging space check if the were doing * recovery, because when we saved the information we had the * file-system in a state where the TNC and lprops has been * modified in memory, but all the I/O operations (including a * commit) were deferred. So the file-system was in * "non-committed" state. Now the file-system is in committed * state, and of course the amount of free space will change * because, for example, the old index size was imprecise. */ err = dbg_check_space_info(c); } mutex_unlock(&c->umount_mutex); return err; out: c->ro_mount = 1; vfree(c->orph_buf); c->orph_buf = NULL; if (c->bgt) { kthread_stop(c->bgt); c->bgt = NULL; } kfree(c->write_reserve_buf); c->write_reserve_buf = NULL; vfree(c->ileb_buf); c->ileb_buf = NULL; ubifs_lpt_free(c, 1); c->remounting_rw = 0; mutex_unlock(&c->umount_mutex); return err; } /** * ubifs_remount_ro - re-mount in read-only mode. * @c: UBIFS file-system description object * * We assume VFS has stopped writing. Possibly the background thread could be * running a commit, however kthread_stop will wait in that case. */ static void ubifs_remount_ro(struct ubifs_info *c) { int i, err; ubifs_assert(c, !c->need_recovery); ubifs_assert(c, !c->ro_mount); mutex_lock(&c->umount_mutex); if (c->bgt) { kthread_stop(c->bgt); c->bgt = NULL; } dbg_save_space_info(c); for (i = 0; i < c->jhead_cnt; i++) { err = ubifs_wbuf_sync(&c->jheads[i].wbuf); if (err) ubifs_ro_mode(c, err); } c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum); err = ubifs_write_master(c); if (err) ubifs_ro_mode(c, err); vfree(c->orph_buf); c->orph_buf = NULL; kfree(c->write_reserve_buf); c->write_reserve_buf = NULL; vfree(c->ileb_buf); c->ileb_buf = NULL; ubifs_lpt_free(c, 1); c->ro_mount = 1; err = dbg_check_space_info(c); if (err) ubifs_ro_mode(c, err); mutex_unlock(&c->umount_mutex); } static void ubifs_put_super(struct super_block *sb) { int i; struct ubifs_info *c = sb->s_fs_info; ubifs_msg(c, "un-mount UBI device %d", c->vi.ubi_num); /* * The following asserts are only valid if there has not been a failure * of the media. For example, there will be dirty inodes if we failed * to write them back because of I/O errors. */ if (!c->ro_error) { ubifs_assert(c, c->bi.idx_growth == 0); ubifs_assert(c, c->bi.dd_growth == 0); ubifs_assert(c, c->bi.data_growth == 0); } /* * The 'c->umount_lock' prevents races between UBIFS memory shrinker * and file system un-mount. Namely, it prevents the shrinker from * picking this superblock for shrinking - it will be just skipped if * the mutex is locked. */ mutex_lock(&c->umount_mutex); if (!c->ro_mount) { /* * First of all kill the background thread to make sure it does * not interfere with un-mounting and freeing resources. */ if (c->bgt) { kthread_stop(c->bgt); c->bgt = NULL; } /* * On fatal errors c->ro_error is set to 1, in which case we do * not write the master node. */ if (!c->ro_error) { int err; /* Synchronize write-buffers */ for (i = 0; i < c->jhead_cnt; i++) { err = ubifs_wbuf_sync(&c->jheads[i].wbuf); if (err) ubifs_ro_mode(c, err); } /* * We are being cleanly unmounted which means the * orphans were killed - indicate this in the master * node. Also save the reserved GC LEB number. */ c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum); err = ubifs_write_master(c); if (err) /* * Recovery will attempt to fix the master area * next mount, so we just print a message and * continue to unmount normally. */ ubifs_err(c, "failed to write master node, error %d", err); } else { for (i = 0; i < c->jhead_cnt; i++) /* Make sure write-buffer timers are canceled */ hrtimer_cancel(&c->jheads[i].wbuf.timer); } } ubifs_umount(c); ubi_close_volume(c->ubi); mutex_unlock(&c->umount_mutex); } static int ubifs_reconfigure(struct fs_context *fc) { struct ubifs_fs_context *ctx = fc->fs_private; struct super_block *sb = fc->root->d_sb; int err; struct ubifs_info *c = sb->s_fs_info; sync_filesystem(sb); dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, fc->sb_flags); /* * Apply the mount option changes. * auth_key_name and auth_hash_name are ignored on remount. */ c->mount_opts = ctx->mount_opts; c->bulk_read = ctx->bulk_read; c->no_chk_data_crc = ctx->no_chk_data_crc; c->default_compr = ctx->default_compr; c->assert_action = ctx->assert_action; if (c->ro_mount && !(fc->sb_flags & SB_RDONLY)) { if (c->ro_error) { ubifs_msg(c, "cannot re-mount R/W due to prior errors"); return -EROFS; } if (c->ro_media) { ubifs_msg(c, "cannot re-mount R/W - UBI volume is R/O"); return -EROFS; } err = ubifs_remount_rw(c); if (err) return err; } else if (!c->ro_mount && (fc->sb_flags & SB_RDONLY)) { if (c->ro_error) { ubifs_msg(c, "cannot re-mount R/O due to prior errors"); return -EROFS; } ubifs_remount_ro(c); } if (c->bulk_read == 1) bu_init(c); else { dbg_gen("disable bulk-read"); mutex_lock(&c->bu_mutex); kfree(c->bu.buf); c->bu.buf = NULL; mutex_unlock(&c->bu_mutex); } if (!c->need_recovery) ubifs_assert(c, c->lst.taken_empty_lebs > 0); return 0; } const struct super_operations ubifs_super_operations = { .alloc_inode = ubifs_alloc_inode, .free_inode = ubifs_free_inode, .put_super = ubifs_put_super, .write_inode = ubifs_write_inode, .drop_inode = ubifs_drop_inode, .evict_inode = ubifs_evict_inode, .statfs = ubifs_statfs, .dirty_inode = ubifs_dirty_inode, .show_options = ubifs_show_options, .sync_fs = ubifs_sync_fs, }; /** * open_ubi - parse UBI device name string and open the UBI device. * @fc: The filesystem context * @mode: UBI volume open mode * * The primary method of mounting UBIFS is by specifying the UBI volume * character device node path. However, UBIFS may also be mounted without any * character device node using one of the following methods: * * o ubiX_Y - mount UBI device number X, volume Y; * o ubiY - mount UBI device number 0, volume Y; * o ubiX:NAME - mount UBI device X, volume with name NAME; * o ubi:NAME - mount UBI device 0, volume with name NAME. * * Alternative '!' separator may be used instead of ':' (because some shells * like busybox may interpret ':' as an NFS host name separator). This function * returns UBI volume description object in case of success and a negative * error code in case of failure. */ static struct ubi_volume_desc *open_ubi(struct fs_context *fc, int mode) { struct ubi_volume_desc *ubi; const char *name = fc->source; int dev, vol; char *endptr; /* First, try to open using the device node path method */ ubi = ubi_open_volume_path(name, mode); if (!IS_ERR(ubi)) return ubi; /* Try the "nodev" method */ if (name[0] != 'u' || name[1] != 'b' || name[2] != 'i') goto invalid_source; /* ubi:NAME method */ if ((name[3] == ':' || name[3] == '!') && name[4] != '\0') return ubi_open_volume_nm(0, name + 4, mode); if (!isdigit(name[3])) goto invalid_source; dev = simple_strtoul(name + 3, &endptr, 0); /* ubiY method */ if (*endptr == '\0') return ubi_open_volume(0, dev, mode); /* ubiX_Y method */ if (*endptr == '_' && isdigit(endptr[1])) { vol = simple_strtoul(endptr + 1, &endptr, 0); if (*endptr != '\0') goto invalid_source; return ubi_open_volume(dev, vol, mode); } /* ubiX:NAME method */ if ((*endptr == ':' || *endptr == '!') && endptr[1] != '\0') return ubi_open_volume_nm(dev, ++endptr, mode); invalid_source: return ERR_PTR(invalf(fc, "Invalid source name")); } static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi) { struct ubifs_info *c; c = kzalloc_obj(struct ubifs_info); if (c) { spin_lock_init(&c->cnt_lock); spin_lock_init(&c->cs_lock); spin_lock_init(&c->buds_lock); spin_lock_init(&c->space_lock); spin_lock_init(&c->orphan_lock); init_rwsem(&c->commit_sem); mutex_init(&c->lp_mutex); mutex_init(&c->tnc_mutex); mutex_init(&c->log_mutex); mutex_init(&c->umount_mutex); mutex_init(&c->bu_mutex); mutex_init(&c->write_reserve_mutex); init_waitqueue_head(&c->cmt_wq); init_waitqueue_head(&c->reserve_space_wq); atomic_set(&c->need_wait_space, 0); c->buds = RB_ROOT; c->old_idx = RB_ROOT; c->size_tree = RB_ROOT; c->orph_tree = RB_ROOT; INIT_LIST_HEAD(&c->infos_list); INIT_LIST_HEAD(&c->idx_gc); INIT_LIST_HEAD(&c->replay_list); INIT_LIST_HEAD(&c->replay_buds); INIT_LIST_HEAD(&c->uncat_list); INIT_LIST_HEAD(&c->empty_list); INIT_LIST_HEAD(&c->freeable_list); INIT_LIST_HEAD(&c->frdi_idx_list); INIT_LIST_HEAD(&c->unclean_leb_list); INIT_LIST_HEAD(&c->old_buds); INIT_LIST_HEAD(&c->orph_list); INIT_LIST_HEAD(&c->orph_new); c->no_chk_data_crc = 1; c->assert_action = ASSACT_RO; c->highest_inum = UBIFS_FIRST_INO; c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM; ubi_get_volume_info(ubi, &c->vi); ubi_get_device_info(c->vi.ubi_num, &c->di); } return c; } static int ubifs_fill_super(struct super_block *sb, struct fs_context *fc) { struct ubifs_info *c = sb->s_fs_info; struct ubifs_fs_context *ctx = fc->fs_private; struct inode *root; int err; c->vfs_sb = sb; /* Re-open the UBI device in read-write mode */ c->ubi = ubi_open_volume(c->vi.ubi_num, c->vi.vol_id, UBI_READWRITE); if (IS_ERR(c->ubi)) { err = PTR_ERR(c->ubi); goto out; } /* Copy in parsed mount options */ c->mount_opts = ctx->mount_opts; c->auth_key_name = ctx->auth_key_name; c->auth_hash_name = ctx->auth_hash_name; c->no_chk_data_crc = ctx->no_chk_data_crc; c->bulk_read = ctx->bulk_read; c->default_compr = ctx->default_compr; c->assert_action = ctx->assert_action; /* ubifs_info owns auth strings now */ ctx->auth_key_name = NULL; ctx->auth_hash_name = NULL; /* * UBIFS provides 'backing_dev_info' in order to disable read-ahead. For * UBIFS, I/O is not deferred, it is done immediately in read_folio, * which means the user would have to wait not just for their own I/O * but the read-ahead I/O as well i.e. completely pointless. * * Read-ahead will be disabled because @sb->s_bdi->ra_pages is 0. Also * @sb->s_bdi->capabilities are initialized to 0 so there won't be any * writeback happening. */ err = super_setup_bdi_name(sb, "ubifs_%d_%d", c->vi.ubi_num, c->vi.vol_id); if (err) goto out_close; sb->s_bdi->ra_pages = 0; sb->s_bdi->io_pages = 0; sb->s_fs_info = c; sb->s_magic = UBIFS_SUPER_MAGIC; sb->s_blocksize = UBIFS_BLOCK_SIZE; sb->s_blocksize_bits = UBIFS_BLOCK_SHIFT; sb->s_maxbytes = c->max_inode_sz = key_max_inode_size(c); if (c->max_inode_sz > MAX_LFS_FILESIZE) sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE; sb->s_op = &ubifs_super_operations; sb->s_xattr = ubifs_xattr_handlers; fscrypt_set_ops(sb, &ubifs_crypt_operations); mutex_lock(&c->umount_mutex); err = mount_ubifs(c); if (err) { ubifs_assert(c, err < 0); goto out_unlock; } /* Read the root inode */ root = ubifs_iget(sb, UBIFS_ROOT_INO); if (IS_ERR(root)) { err = PTR_ERR(root); goto out_umount; } generic_set_sb_d_ops(sb); sb->s_root = d_make_root(root); if (!sb->s_root) { err = -ENOMEM; goto out_umount; } super_set_uuid(sb, c->uuid, sizeof(c->uuid)); super_set_sysfs_name_generic(sb, UBIFS_DFS_DIR_NAME, c->vi.ubi_num, c->vi.vol_id); mutex_unlock(&c->umount_mutex); return 0; out_umount: ubifs_umount(c); out_unlock: mutex_unlock(&c->umount_mutex); out_close: ubifs_release_options(c); ubi_close_volume(c->ubi); out: return err; } static int sb_test(struct super_block *sb, struct fs_context *fc) { struct ubifs_info *c1 = fc->s_fs_info; struct ubifs_info *c = sb->s_fs_info; return c->vi.cdev == c1->vi.cdev; } static int ubifs_get_tree(struct fs_context *fc) { struct ubi_volume_desc *ubi; struct ubifs_info *c; struct super_block *sb; int err; if (!fc->source || !*fc->source) return invalf(fc, "No source specified"); dbg_gen("name %s, flags %#x", fc->source, fc->sb_flags); /* * Get UBI device number and volume ID. Mount it read-only so far * because this might be a new mount point, and UBI allows only one * read-write user at a time. */ ubi = open_ubi(fc, UBI_READONLY); if (IS_ERR(ubi)) { err = PTR_ERR(ubi); if (!(fc->sb_flags & SB_SILENT)) pr_err("UBIFS error (pid: %d): cannot open \"%s\", error %d", current->pid, fc->source, err); return err; } c = alloc_ubifs_info(ubi); if (!c) { err = -ENOMEM; goto out_close; } fc->s_fs_info = c; dbg_gen("opened ubi%d_%d", c->vi.ubi_num, c->vi.vol_id); sb = sget_fc(fc, sb_test, set_anon_super_fc); if (IS_ERR(sb)) { err = PTR_ERR(sb); kfree(c); goto out_close; } if (sb->s_root) { struct ubifs_info *c1 = sb->s_fs_info; kfree(c); /* A new mount point for already mounted UBIFS */ dbg_gen("this ubi volume is already mounted"); if (!!(fc->sb_flags & SB_RDONLY) != c1->ro_mount) { err = -EBUSY; goto out_deact; } } else { err = ubifs_fill_super(sb, fc); if (err) goto out_deact; /* We do not support atime */ sb->s_flags |= SB_ACTIVE; if (IS_ENABLED(CONFIG_UBIFS_ATIME_SUPPORT)) ubifs_msg(c, "full atime support is enabled."); else sb->s_flags |= SB_NOATIME; } /* 'fill_super()' opens ubi again so we must close it here */ ubi_close_volume(ubi); fc->root = dget(sb->s_root); return 0; out_deact: deactivate_locked_super(sb); out_close: ubi_close_volume(ubi); return err; } static void kill_ubifs_super(struct super_block *s) { struct ubifs_info *c = s->s_fs_info; kill_anon_super(s); kfree(c); } static void ubifs_free_fc(struct fs_context *fc) { struct ubifs_fs_context *ctx = fc->fs_private; if (ctx) { kfree(ctx->auth_key_name); kfree(ctx->auth_hash_name); kfree(ctx); } } static const struct fs_context_operations ubifs_context_ops = { .free = ubifs_free_fc, .parse_param = ubifs_parse_param, .get_tree = ubifs_get_tree, .reconfigure = ubifs_reconfigure, }; static int ubifs_init_fs_context(struct fs_context *fc) { struct ubifs_fs_context *ctx; ctx = kzalloc_obj(struct ubifs_fs_context); if (!ctx) return -ENOMEM; if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE) { /* Iniitialize for first mount */ ctx->no_chk_data_crc = 1; ctx->assert_action = ASSACT_RO; } else { struct ubifs_info *c = fc->root->d_sb->s_fs_info; /* * Preserve existing options across remounts. * auth_key_name and auth_hash_name are not remountable. */ ctx->mount_opts = c->mount_opts; ctx->bulk_read = c->bulk_read; ctx->no_chk_data_crc = c->no_chk_data_crc; ctx->default_compr = c->default_compr; ctx->assert_action = c->assert_action; } fc->ops = &ubifs_context_ops; fc->fs_private = ctx; return 0; } static struct file_system_type ubifs_fs_type = { .name = "ubifs", .owner = THIS_MODULE, .init_fs_context = ubifs_init_fs_context, .parameters = ubifs_fs_param_spec, .kill_sb = kill_ubifs_super, }; MODULE_ALIAS_FS("ubifs"); /* * Inode slab cache constructor. */ static void inode_slab_ctor(void *obj) { struct ubifs_inode *ui = obj; inode_init_once(&ui->vfs_inode); } static int __init ubifs_init(void) { int err = -ENOMEM; BUILD_BUG_ON(sizeof(struct ubifs_ch) != 24); /* Make sure node sizes are 8-byte aligned */ BUILD_BUG_ON(UBIFS_CH_SZ & 7); BUILD_BUG_ON(UBIFS_INO_NODE_SZ & 7); BUILD_BUG_ON(UBIFS_DENT_NODE_SZ & 7); BUILD_BUG_ON(UBIFS_XENT_NODE_SZ & 7); BUILD_BUG_ON(UBIFS_DATA_NODE_SZ & 7); BUILD_BUG_ON(UBIFS_TRUN_NODE_SZ & 7); BUILD_BUG_ON(UBIFS_SB_NODE_SZ & 7); BUILD_BUG_ON(UBIFS_MST_NODE_SZ & 7); BUILD_BUG_ON(UBIFS_REF_NODE_SZ & 7); BUILD_BUG_ON(UBIFS_CS_NODE_SZ & 7); BUILD_BUG_ON(UBIFS_ORPH_NODE_SZ & 7); BUILD_BUG_ON(UBIFS_MAX_DENT_NODE_SZ & 7); BUILD_BUG_ON(UBIFS_MAX_XENT_NODE_SZ & 7); BUILD_BUG_ON(UBIFS_MAX_DATA_NODE_SZ & 7); BUILD_BUG_ON(UBIFS_MAX_INO_NODE_SZ & 7); BUILD_BUG_ON(UBIFS_MAX_NODE_SZ & 7); BUILD_BUG_ON(MIN_WRITE_SZ & 7); /* Check min. node size */ BUILD_BUG_ON(UBIFS_INO_NODE_SZ < MIN_WRITE_SZ); BUILD_BUG_ON(UBIFS_DENT_NODE_SZ < MIN_WRITE_SZ); BUILD_BUG_ON(UBIFS_XENT_NODE_SZ < MIN_WRITE_SZ); BUILD_BUG_ON(UBIFS_TRUN_NODE_SZ < MIN_WRITE_SZ); BUILD_BUG_ON(UBIFS_MAX_DENT_NODE_SZ > UBIFS_MAX_NODE_SZ); BUILD_BUG_ON(UBIFS_MAX_XENT_NODE_SZ > UBIFS_MAX_NODE_SZ); BUILD_BUG_ON(UBIFS_MAX_DATA_NODE_SZ > UBIFS_MAX_NODE_SZ); BUILD_BUG_ON(UBIFS_MAX_INO_NODE_SZ > UBIFS_MAX_NODE_SZ); /* Defined node sizes */ BUILD_BUG_ON(UBIFS_SB_NODE_SZ != 4096); BUILD_BUG_ON(UBIFS_MST_NODE_SZ != 512); BUILD_BUG_ON(UBIFS_INO_NODE_SZ != 160); BUILD_BUG_ON(UBIFS_REF_NODE_SZ != 64); /* * We use 2 bit wide bit-fields to store compression type, which should * be amended if more compressors are added. The bit-fields are: * @compr_type in 'struct ubifs_inode', @default_compr in * 'struct ubifs_info' and @compr_type in 'struct ubifs_mount_opts'. */ BUILD_BUG_ON(UBIFS_COMPR_TYPES_CNT > 4); /* * We require that PAGE_SIZE is greater-than-or-equal-to * UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2. */ if (PAGE_SIZE < UBIFS_BLOCK_SIZE) { pr_err("UBIFS error (pid %d): VFS page cache size is %u bytes, but UBIFS requires at least 4096 bytes", current->pid, (unsigned int)PAGE_SIZE); return -EINVAL; } ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab", sizeof(struct ubifs_inode), 0, SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, &inode_slab_ctor); if (!ubifs_inode_slab) return -ENOMEM; ubifs_shrinker_info = shrinker_alloc(0, "ubifs-slab"); if (!ubifs_shrinker_info) goto out_slab; ubifs_shrinker_info->count_objects = ubifs_shrink_count; ubifs_shrinker_info->scan_objects = ubifs_shrink_scan; shrinker_register(ubifs_shrinker_info); err = ubifs_compressors_init(); if (err) goto out_shrinker; dbg_debugfs_init(); err = ubifs_sysfs_init(); if (err) goto out_dbg; err = register_filesystem(&ubifs_fs_type); if (err) { pr_err("UBIFS error (pid %d): cannot register file system, error %d", current->pid, err); goto out_sysfs; } return 0; out_sysfs: ubifs_sysfs_exit(); out_dbg: dbg_debugfs_exit(); ubifs_compressors_exit(); out_shrinker: shrinker_free(ubifs_shrinker_info); out_slab: kmem_cache_destroy(ubifs_inode_slab); return err; } /* late_initcall to let compressors initialize first */ late_initcall(ubifs_init); static void __exit ubifs_exit(void) { WARN_ON(!list_empty(&ubifs_infos)); WARN_ON(atomic_long_read(&ubifs_clean_zn_cnt) != 0); dbg_debugfs_exit(); ubifs_sysfs_exit(); ubifs_compressors_exit(); shrinker_free(ubifs_shrinker_info); /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(ubifs_inode_slab); unregister_filesystem(&ubifs_fs_type); } module_exit(ubifs_exit); MODULE_LICENSE("GPL"); MODULE_VERSION(__stringify(UBIFS_VERSION)); MODULE_AUTHOR("Artem Bityutskiy, Adrian Hunter"); MODULE_DESCRIPTION("UBIFS - UBI File System"); |
| 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 | /* * Copyright © 2014 Intel Corporation * Daniel Vetter <daniel.vetter@ffwll.ch> * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ #ifndef __DRM_INTERNAL_H__ #define __DRM_INTERNAL_H__ #include <linux/kthread.h> #include <linux/types.h> #include <drm/drm_ioctl.h> #include <drm/drm_vblank.h> #define DRM_IF_MAJOR 1 #define DRM_IF_MINOR 4 #define DRM_IF_VERSION(maj, min) (maj << 16 | min) struct dentry; struct dma_buf; struct iosys_map; struct drm_connector; struct drm_crtc; struct drm_framebuffer; struct drm_gem_object; struct drm_master; struct drm_minor; struct drm_prime_file_private; struct drm_printer; struct drm_vblank_crtc; /* drm_client_event.c */ #if defined(CONFIG_DRM_CLIENT) void drm_client_debugfs_init(struct drm_device *dev); #else static inline void drm_client_debugfs_init(struct drm_device *dev) { } #endif /* drm_client_sysrq.c */ #if defined(CONFIG_DRM_CLIENT) && defined(CONFIG_MAGIC_SYSRQ) void drm_client_sysrq_register(struct drm_device *dev); void drm_client_sysrq_unregister(struct drm_device *dev); #else static inline void drm_client_sysrq_register(struct drm_device *dev) { } static inline void drm_client_sysrq_unregister(struct drm_device *dev) { } #endif /* drm_file.c */ extern struct mutex drm_global_mutex; bool drm_dev_needs_global_mutex(struct drm_device *dev); struct drm_file *drm_file_alloc(struct drm_minor *minor); void drm_file_free(struct drm_file *file); #ifdef CONFIG_PCI /* drm_pci.c */ int drm_pci_set_busid(struct drm_device *dev, struct drm_master *master); #else static inline int drm_pci_set_busid(struct drm_device *dev, struct drm_master *master) { return -EINVAL; } #endif /* drm_prime.c */ int drm_prime_handle_to_fd_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_prime_fd_to_handle_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); void drm_prime_init_file_private(struct drm_prime_file_private *prime_fpriv); void drm_prime_destroy_file_private(struct drm_prime_file_private *prime_fpriv); int drm_prime_add_buf_handle(struct drm_prime_file_private *prime_fpriv, struct dma_buf *dma_buf, uint32_t handle); void drm_prime_remove_buf_handle(struct drm_prime_file_private *prime_fpriv, uint32_t handle); /* drm_managed.c */ void drm_managed_release(struct drm_device *dev); void drmm_add_final_kfree(struct drm_device *dev, void *container); /* drm_vblank.c */ static inline bool drm_vblank_passed(u64 seq, u64 ref) { return (seq - ref) <= (1 << 23); } void drm_vblank_disable_and_save(struct drm_device *dev, unsigned int pipe); int drm_vblank_get(struct drm_device *dev, unsigned int pipe); void drm_vblank_put(struct drm_device *dev, unsigned int pipe); u64 drm_vblank_count(struct drm_device *dev, unsigned int pipe); /* drm_vblank_work.c */ static inline void drm_vblank_flush_worker(struct drm_vblank_crtc *vblank) { kthread_flush_worker(vblank->worker); } static inline void drm_vblank_destroy_worker(struct drm_vblank_crtc *vblank) { if (vblank->worker) kthread_destroy_worker(vblank->worker); } int drm_vblank_worker_init(struct drm_vblank_crtc *vblank); void drm_vblank_cancel_pending_works(struct drm_vblank_crtc *vblank); void drm_handle_vblank_works(struct drm_vblank_crtc *vblank); /* IOCTLS */ int drm_wait_vblank_ioctl(struct drm_device *dev, void *data, struct drm_file *filp); /* drm_irq.c */ /* IOCTLS */ int drm_crtc_get_sequence_ioctl(struct drm_device *dev, void *data, struct drm_file *filp); int drm_crtc_queue_sequence_ioctl(struct drm_device *dev, void *data, struct drm_file *filp); /* drm_auth.c */ int drm_getmagic(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_authmagic(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_setmaster_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_dropmaster_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_master_open(struct drm_file *file_priv); void drm_master_release(struct drm_file *file_priv); bool drm_master_internal_acquire(struct drm_device *dev); void drm_master_internal_release(struct drm_device *dev); /* drm_sysfs.c */ extern struct class *drm_class; int drm_sysfs_init(void); void drm_sysfs_destroy(void); struct device *drm_sysfs_minor_alloc(struct drm_minor *minor); int drm_sysfs_connector_add(struct drm_connector *connector); int drm_sysfs_connector_add_late(struct drm_connector *connector); void drm_sysfs_connector_remove_early(struct drm_connector *connector); void drm_sysfs_connector_remove(struct drm_connector *connector); void drm_sysfs_lease_event(struct drm_device *dev); /* drm_gem.c */ int drm_gem_init(struct drm_device *dev); bool drm_gem_object_handle_get_if_exists_unlocked(struct drm_gem_object *obj); void drm_gem_object_handle_put_unlocked(struct drm_gem_object *obj); int drm_gem_handle_create_tail(struct drm_file *file_priv, struct drm_gem_object *obj, u32 *handlep); int drm_gem_close_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_gem_flink_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_gem_change_handle_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_gem_open_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); void drm_gem_open(struct drm_device *dev, struct drm_file *file_private); void drm_gem_release(struct drm_device *dev, struct drm_file *file_private); void drm_gem_print_info(struct drm_printer *p, unsigned int indent, const struct drm_gem_object *obj); int drm_gem_vmap_locked(struct drm_gem_object *obj, struct iosys_map *map); void drm_gem_vunmap_locked(struct drm_gem_object *obj, struct iosys_map *map); /* drm_debugfs.c drm_debugfs_crc.c */ #if defined(CONFIG_DEBUG_FS) void drm_debugfs_dev_fini(struct drm_device *dev); void drm_debugfs_dev_register(struct drm_device *dev); int drm_debugfs_register(struct drm_minor *minor, int minor_id); void drm_debugfs_unregister(struct drm_minor *minor); void drm_debugfs_connector_add(struct drm_connector *connector); void drm_debugfs_connector_remove(struct drm_connector *connector); void drm_debugfs_crtc_add(struct drm_crtc *crtc); void drm_debugfs_crtc_remove(struct drm_crtc *crtc); void drm_debugfs_crtc_crc_add(struct drm_crtc *crtc); void drm_debugfs_encoder_add(struct drm_encoder *encoder); void drm_debugfs_encoder_remove(struct drm_encoder *encoder); #else static inline void drm_debugfs_dev_fini(struct drm_device *dev) { } static inline void drm_debugfs_dev_register(struct drm_device *dev) { } static inline int drm_debugfs_register(struct drm_minor *minor, int minor_id) { return 0; } static inline void drm_debugfs_unregister(struct drm_minor *minor) { } static inline void drm_debugfs_connector_add(struct drm_connector *connector) { } static inline void drm_debugfs_connector_remove(struct drm_connector *connector) { } static inline void drm_debugfs_crtc_add(struct drm_crtc *crtc) { } static inline void drm_debugfs_crtc_remove(struct drm_crtc *crtc) { } static inline void drm_debugfs_crtc_crc_add(struct drm_crtc *crtc) { } static inline void drm_debugfs_encoder_add(struct drm_encoder *encoder) { } static inline void drm_debugfs_encoder_remove(struct drm_encoder *encoder) { } #endif drm_ioctl_t drm_version; drm_ioctl_t drm_getunique; drm_ioctl_t drm_getclient; /* drm_syncobj.c */ void drm_syncobj_open(struct drm_file *file_private); void drm_syncobj_release(struct drm_file *file_private); int drm_syncobj_create_ioctl(struct drm_device *dev, void *data, struct drm_file *file_private); int drm_syncobj_destroy_ioctl(struct drm_device *dev, void *data, struct drm_file *file_private); int drm_syncobj_handle_to_fd_ioctl(struct drm_device *dev, void *data, struct drm_file *file_private); int drm_syncobj_fd_to_handle_ioctl(struct drm_device *dev, void *data, struct drm_file *file_private); int drm_syncobj_transfer_ioctl(struct drm_device *dev, void *data, struct drm_file *file_private); int drm_syncobj_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file_private); int drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file_private); int drm_syncobj_eventfd_ioctl(struct drm_device *dev, void *data, struct drm_file *file_private); int drm_syncobj_reset_ioctl(struct drm_device *dev, void *data, struct drm_file *file_private); int drm_syncobj_signal_ioctl(struct drm_device *dev, void *data, struct drm_file *file_private); int drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data, struct drm_file *file_private); int drm_syncobj_query_ioctl(struct drm_device *dev, void *data, struct drm_file *file_private); /* drm_framebuffer.c */ void drm_framebuffer_print_info(struct drm_printer *p, unsigned int indent, const struct drm_framebuffer *fb); void drm_framebuffer_debugfs_init(struct drm_device *dev); #endif /* __DRM_INTERNAL_H__ */ |
| 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 | // SPDX-License-Identifier: GPL-2.0-only /* * Support for polling mode for input devices. */ #include <linux/device.h> #include <linux/export.h> #include <linux/input.h> #include <linux/jiffies.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/workqueue.h> #include "input-poller.h" struct input_dev_poller { void (*poll)(struct input_dev *dev); unsigned int poll_interval; /* msec */ unsigned int poll_interval_max; /* msec */ unsigned int poll_interval_min; /* msec */ struct input_dev *input; struct delayed_work work; }; static void input_dev_poller_queue_work(struct input_dev_poller *poller) { unsigned long delay; delay = msecs_to_jiffies(poller->poll_interval); if (delay >= HZ) delay = round_jiffies_relative(delay); queue_delayed_work(system_freezable_wq, &poller->work, delay); } static void input_dev_poller_work(struct work_struct *work) { struct input_dev_poller *poller = container_of(work, struct input_dev_poller, work.work); poller->poll(poller->input); input_dev_poller_queue_work(poller); } void input_dev_poller_finalize(struct input_dev_poller *poller) { if (!poller->poll_interval) poller->poll_interval = 500; if (!poller->poll_interval_max) poller->poll_interval_max = poller->poll_interval; } void input_dev_poller_start(struct input_dev_poller *poller) { /* Only start polling if polling is enabled */ if (poller->poll_interval > 0) { poller->poll(poller->input); input_dev_poller_queue_work(poller); } } void input_dev_poller_stop(struct input_dev_poller *poller) { cancel_delayed_work_sync(&poller->work); } int input_setup_polling(struct input_dev *dev, void (*poll_fn)(struct input_dev *dev)) { struct input_dev_poller *poller; poller = kzalloc_obj(*poller); if (!poller) { /* * We want to show message even though kzalloc() may have * printed backtrace as knowing what instance of input * device we were dealing with is helpful. */ dev_err(dev->dev.parent ?: &dev->dev, "%s: unable to allocate poller structure\n", __func__); return -ENOMEM; } INIT_DELAYED_WORK(&poller->work, input_dev_poller_work); poller->input = dev; poller->poll = poll_fn; dev->poller = poller; return 0; } EXPORT_SYMBOL(input_setup_polling); static bool input_dev_ensure_poller(struct input_dev *dev) { if (!dev->poller) { dev_err(dev->dev.parent ?: &dev->dev, "poller structure has not been set up\n"); return false; } return true; } void input_set_poll_interval(struct input_dev *dev, unsigned int interval) { if (input_dev_ensure_poller(dev)) dev->poller->poll_interval = interval; } EXPORT_SYMBOL(input_set_poll_interval); void input_set_min_poll_interval(struct input_dev *dev, unsigned int interval) { if (input_dev_ensure_poller(dev)) dev->poller->poll_interval_min = interval; } EXPORT_SYMBOL(input_set_min_poll_interval); void input_set_max_poll_interval(struct input_dev *dev, unsigned int interval) { if (input_dev_ensure_poller(dev)) dev->poller->poll_interval_max = interval; } EXPORT_SYMBOL(input_set_max_poll_interval); int input_get_poll_interval(struct input_dev *dev) { if (!dev->poller) return -EINVAL; return dev->poller->poll_interval; } EXPORT_SYMBOL(input_get_poll_interval); /* SYSFS interface */ static ssize_t input_dev_get_poll_interval(struct device *dev, struct device_attribute *attr, char *buf) { struct input_dev *input = to_input_dev(dev); return sprintf(buf, "%d\n", input->poller->poll_interval); } static ssize_t input_dev_set_poll_interval(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct input_dev *input = to_input_dev(dev); struct input_dev_poller *poller = input->poller; unsigned int interval; int err; err = kstrtouint(buf, 0, &interval); if (err) return err; if (interval < poller->poll_interval_min) return -EINVAL; if (interval > poller->poll_interval_max) return -EINVAL; guard(mutex)(&input->mutex); poller->poll_interval = interval; if (input_device_enabled(input)) { cancel_delayed_work_sync(&poller->work); if (poller->poll_interval > 0) input_dev_poller_queue_work(poller); } return count; } static DEVICE_ATTR(poll, 0644, input_dev_get_poll_interval, input_dev_set_poll_interval); static ssize_t input_dev_get_poll_max(struct device *dev, struct device_attribute *attr, char *buf) { struct input_dev *input = to_input_dev(dev); return sprintf(buf, "%d\n", input->poller->poll_interval_max); } static DEVICE_ATTR(max, 0444, input_dev_get_poll_max, NULL); static ssize_t input_dev_get_poll_min(struct device *dev, struct device_attribute *attr, char *buf) { struct input_dev *input = to_input_dev(dev); return sprintf(buf, "%d\n", input->poller->poll_interval_min); } static DEVICE_ATTR(min, 0444, input_dev_get_poll_min, NULL); static umode_t input_poller_attrs_visible(struct kobject *kobj, struct attribute *attr, int n) { struct device *dev = kobj_to_dev(kobj); struct input_dev *input = to_input_dev(dev); return input->poller ? attr->mode : 0; } static struct attribute *input_poller_attrs[] = { &dev_attr_poll.attr, &dev_attr_max.attr, &dev_attr_min.attr, NULL }; struct attribute_group input_poller_attribute_group = { .is_visible = input_poller_attrs_visible, .attrs = input_poller_attrs, }; |
| 1 1 1 1 1 3 2 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 | // SPDX-License-Identifier: GPL-2.0 /* * thermal.c - sysfs interface of thermal devices * * Copyright (C) 2016 Eduardo Valentin <edubezval@gmail.com> * * Highly based on original thermal_core.c * Copyright (C) 2008 Intel Corp * Copyright (C) 2008 Zhang Rui <rui.zhang@intel.com> * Copyright (C) 2008 Sujith Thomas <sujith.thomas@intel.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/container_of.h> #include <linux/sysfs.h> #include <linux/device.h> #include <linux/err.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/string_choices.h> #include <linux/jiffies.h> #include "thermal_core.h" /* sys I/F for thermal zone */ static ssize_t type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_zone_device *tz = to_thermal_zone(dev); return sysfs_emit(buf, "%s\n", tz->type); } static ssize_t temp_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_zone_device *tz = to_thermal_zone(dev); int temperature, ret; ret = thermal_zone_get_temp(tz, &temperature); if (!ret) return sysfs_emit(buf, "%d\n", temperature); if (ret == -EAGAIN) return -ENODATA; return ret; } static ssize_t mode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_zone_device *tz = to_thermal_zone(dev); guard(thermal_zone)(tz); return sysfs_emit(buf, "%s\n", str_enabled_disabled(tz->mode == THERMAL_DEVICE_ENABLED)); } static ssize_t mode_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct thermal_zone_device *tz = to_thermal_zone(dev); int result; if (!strncmp(buf, "enabled", sizeof("enabled") - 1)) result = thermal_zone_device_enable(tz); else if (!strncmp(buf, "disabled", sizeof("disabled") - 1)) result = thermal_zone_device_disable(tz); else result = -EINVAL; if (result) return result; return count; } #define thermal_trip_of_attr(_ptr_, _attr_) \ ({ \ struct thermal_trip_desc *td; \ \ td = container_of(_ptr_, struct thermal_trip_desc, \ trip_attrs._attr_.attr); \ &td->trip; \ }) static ssize_t trip_point_type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_trip *trip = thermal_trip_of_attr(attr, type); return sysfs_emit(buf, "%s\n", thermal_trip_type_name(trip->type)); } static ssize_t trip_point_temp_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct thermal_trip *trip = thermal_trip_of_attr(attr, temp); struct thermal_zone_device *tz = to_thermal_zone(dev); int temp; if (kstrtoint(buf, 10, &temp)) return -EINVAL; guard(thermal_zone)(tz); if (temp == trip->temperature) return count; /* Arrange the condition to avoid integer overflows. */ if (temp != THERMAL_TEMP_INVALID && temp <= trip->hysteresis + THERMAL_TEMP_INVALID) return -EINVAL; if (tz->ops.set_trip_temp) { int ret; ret = tz->ops.set_trip_temp(tz, trip, temp); if (ret) return ret; } thermal_zone_set_trip_temp(tz, trip, temp); __thermal_zone_device_update(tz, THERMAL_TRIP_CHANGED); return count; } static ssize_t trip_point_temp_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_trip *trip = thermal_trip_of_attr(attr, temp); return sysfs_emit(buf, "%d\n", READ_ONCE(trip->temperature)); } static ssize_t trip_point_hyst_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct thermal_trip *trip = thermal_trip_of_attr(attr, hyst); struct thermal_zone_device *tz = to_thermal_zone(dev); int hyst; if (kstrtoint(buf, 10, &hyst) || hyst < 0) return -EINVAL; guard(thermal_zone)(tz); if (hyst == trip->hysteresis) return count; /* * Allow the hysteresis to be updated when the temperature is invalid * to allow user space to avoid having to adjust hysteresis after a * valid temperature has been set, but in that case just change the * value and do nothing else. */ if (trip->temperature == THERMAL_TEMP_INVALID) { WRITE_ONCE(trip->hysteresis, hyst); return count; } if (trip->temperature - hyst <= THERMAL_TEMP_INVALID) return -EINVAL; thermal_zone_set_trip_hyst(tz, trip, hyst); __thermal_zone_device_update(tz, THERMAL_TRIP_CHANGED); return count; } static ssize_t trip_point_hyst_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_trip *trip = thermal_trip_of_attr(attr, hyst); return sysfs_emit(buf, "%d\n", READ_ONCE(trip->hysteresis)); } static ssize_t policy_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct thermal_zone_device *tz = to_thermal_zone(dev); char name[THERMAL_NAME_LENGTH]; int ret; strscpy(name, buf); ret = thermal_zone_device_set_policy(tz, name); if (!ret) ret = count; return ret; } static ssize_t policy_show(struct device *dev, struct device_attribute *devattr, char *buf) { struct thermal_zone_device *tz = to_thermal_zone(dev); return sysfs_emit(buf, "%s\n", tz->governor->name); } static ssize_t available_policies_show(struct device *dev, struct device_attribute *devattr, char *buf) { return thermal_build_list_of_policies(buf); } #if (IS_ENABLED(CONFIG_THERMAL_EMULATION)) static ssize_t emul_temp_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct thermal_zone_device *tz = to_thermal_zone(dev); int temperature; if (kstrtoint(buf, 10, &temperature)) return -EINVAL; guard(thermal_zone)(tz); if (tz->ops.set_emul_temp) { int ret; ret = tz->ops.set_emul_temp(tz, temperature); if (ret) return ret; } else { tz->emul_temperature = temperature; } __thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED); return count; } static DEVICE_ATTR_WO(emul_temp); #endif static ssize_t sustainable_power_show(struct device *dev, struct device_attribute *devattr, char *buf) { struct thermal_zone_device *tz = to_thermal_zone(dev); if (tz->tzp) return sysfs_emit(buf, "%u\n", tz->tzp->sustainable_power); else return -EIO; } static ssize_t sustainable_power_store(struct device *dev, struct device_attribute *devattr, const char *buf, size_t count) { struct thermal_zone_device *tz = to_thermal_zone(dev); u32 sustainable_power; if (!tz->tzp) return -EIO; if (kstrtou32(buf, 10, &sustainable_power)) return -EINVAL; tz->tzp->sustainable_power = sustainable_power; return count; } #define create_s32_tzp_attr(name) \ static ssize_t \ name##_show(struct device *dev, struct device_attribute *devattr, \ char *buf) \ { \ struct thermal_zone_device *tz = to_thermal_zone(dev); \ \ if (tz->tzp) \ return sysfs_emit(buf, "%d\n", tz->tzp->name); \ else \ return -EIO; \ } \ \ static ssize_t \ name##_store(struct device *dev, struct device_attribute *devattr, \ const char *buf, size_t count) \ { \ struct thermal_zone_device *tz = to_thermal_zone(dev); \ s32 value; \ \ if (!tz->tzp) \ return -EIO; \ \ if (kstrtos32(buf, 10, &value)) \ return -EINVAL; \ \ tz->tzp->name = value; \ \ return count; \ } \ static DEVICE_ATTR_RW(name) create_s32_tzp_attr(k_po); create_s32_tzp_attr(k_pu); create_s32_tzp_attr(k_i); create_s32_tzp_attr(k_d); create_s32_tzp_attr(integral_cutoff); create_s32_tzp_attr(slope); create_s32_tzp_attr(offset); #undef create_s32_tzp_attr /* * These are thermal zone device attributes that will always be present. * All the attributes created for tzp (create_s32_tzp_attr) also are always * present on the sysfs interface. */ static DEVICE_ATTR_RO(type); static DEVICE_ATTR_RO(temp); static DEVICE_ATTR_RW(policy); static DEVICE_ATTR_RO(available_policies); static DEVICE_ATTR_RW(sustainable_power); /* These thermal zone device attributes are created based on conditions */ static DEVICE_ATTR_RW(mode); /* These attributes are unconditionally added to a thermal zone */ static struct attribute *thermal_zone_dev_attrs[] = { &dev_attr_type.attr, &dev_attr_temp.attr, #if (IS_ENABLED(CONFIG_THERMAL_EMULATION)) &dev_attr_emul_temp.attr, #endif &dev_attr_policy.attr, &dev_attr_available_policies.attr, &dev_attr_sustainable_power.attr, &dev_attr_k_po.attr, &dev_attr_k_pu.attr, &dev_attr_k_i.attr, &dev_attr_k_d.attr, &dev_attr_integral_cutoff.attr, &dev_attr_slope.attr, &dev_attr_offset.attr, NULL, }; static const struct attribute_group thermal_zone_attribute_group = { .attrs = thermal_zone_dev_attrs, }; static struct attribute *thermal_zone_mode_attrs[] = { &dev_attr_mode.attr, NULL, }; static const struct attribute_group thermal_zone_mode_attribute_group = { .attrs = thermal_zone_mode_attrs, }; static const struct attribute_group *thermal_zone_attribute_groups[] = { &thermal_zone_attribute_group, &thermal_zone_mode_attribute_group, /* This is not NULL terminated as we create the group dynamically */ }; /** * create_trip_attrs() - create attributes for trip points * @tz: the thermal zone device * * helper function to instantiate sysfs entries for every trip * point and its properties of a struct thermal_zone_device. * * Return: 0 on success, the proper error value otherwise. */ static int create_trip_attrs(struct thermal_zone_device *tz) { struct thermal_trip_desc *td; struct attribute **attrs; int i; attrs = kzalloc_objs(*attrs, tz->num_trips * 3 + 1); if (!attrs) return -ENOMEM; i = 0; for_each_trip_desc(tz, td) { struct thermal_trip_attrs *trip_attrs = &td->trip_attrs; /* create trip type attribute */ snprintf(trip_attrs->type.name, THERMAL_NAME_LENGTH, "trip_point_%d_type", i); sysfs_attr_init(&trip_attrs->type.attr.attr); trip_attrs->type.attr.attr.name = trip_attrs->type.name; trip_attrs->type.attr.attr.mode = S_IRUGO; trip_attrs->type.attr.show = trip_point_type_show; attrs[i] = &trip_attrs->type.attr.attr; /* create trip temp attribute */ snprintf(trip_attrs->temp.name, THERMAL_NAME_LENGTH, "trip_point_%d_temp", i); sysfs_attr_init(&trip_attrs->temp.attr.attr); trip_attrs->temp.attr.attr.name = trip_attrs->temp.name; trip_attrs->temp.attr.attr.mode = S_IRUGO; trip_attrs->temp.attr.show = trip_point_temp_show; if (td->trip.flags & THERMAL_TRIP_FLAG_RW_TEMP) { trip_attrs->temp.attr.attr.mode |= S_IWUSR; trip_attrs->temp.attr.store = trip_point_temp_store; } attrs[i + tz->num_trips] = &trip_attrs->temp.attr.attr; snprintf(trip_attrs->hyst.name, THERMAL_NAME_LENGTH, "trip_point_%d_hyst", i); sysfs_attr_init(&trip_attrs->hyst.attr.attr); trip_attrs->hyst.attr.attr.name = trip_attrs->hyst.name; trip_attrs->hyst.attr.attr.mode = S_IRUGO; trip_attrs->hyst.attr.show = trip_point_hyst_show; if (td->trip.flags & THERMAL_TRIP_FLAG_RW_HYST) { trip_attrs->hyst.attr.attr.mode |= S_IWUSR; trip_attrs->hyst.attr.store = trip_point_hyst_store; } attrs[i + 2 * tz->num_trips] = &trip_attrs->hyst.attr.attr; i++; } attrs[tz->num_trips * 3] = NULL; tz->trips_attribute_group.attrs = attrs; return 0; } /** * destroy_trip_attrs() - destroy attributes for trip points * @tz: the thermal zone device * * helper function to free resources allocated by create_trip_attrs() */ static void destroy_trip_attrs(struct thermal_zone_device *tz) { if (tz) kfree(tz->trips_attribute_group.attrs); } int thermal_zone_create_device_groups(struct thermal_zone_device *tz) { const struct attribute_group **groups; int i, size, result; /* we need one extra for trips and the NULL to terminate the array */ size = ARRAY_SIZE(thermal_zone_attribute_groups) + 2; /* This also takes care of API requirement to be NULL terminated */ groups = kzalloc_objs(*groups, size); if (!groups) return -ENOMEM; for (i = 0; i < size - 2; i++) groups[i] = thermal_zone_attribute_groups[i]; if (tz->num_trips) { result = create_trip_attrs(tz); if (result) { kfree(groups); return result; } groups[size - 2] = &tz->trips_attribute_group; } tz->device.groups = groups; return 0; } void thermal_zone_destroy_device_groups(struct thermal_zone_device *tz) { if (!tz) return; if (tz->num_trips) destroy_trip_attrs(tz); kfree(tz->device.groups); } /* sys I/F for cooling device */ static ssize_t cdev_type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_cooling_device *cdev = to_cooling_device(dev); return sysfs_emit(buf, "%s\n", cdev->type); } static ssize_t max_state_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_cooling_device *cdev = to_cooling_device(dev); return sysfs_emit(buf, "%ld\n", cdev->max_state); } static ssize_t cur_state_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_cooling_device *cdev = to_cooling_device(dev); unsigned long state; int ret; ret = cdev->ops->get_cur_state(cdev, &state); if (ret) return ret; return sysfs_emit(buf, "%ld\n", state); } static ssize_t cur_state_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct thermal_cooling_device *cdev = to_cooling_device(dev); unsigned long state; int result; if (sscanf(buf, "%ld\n", &state) != 1) return -EINVAL; if ((long)state < 0) return -EINVAL; /* Requested state should be less than max_state + 1 */ if (state > cdev->max_state) return -EINVAL; guard(cooling_dev)(cdev); result = cdev->ops->set_cur_state(cdev, state); if (result) return result; thermal_cooling_device_stats_update(cdev, state); return count; } static struct device_attribute dev_attr_cdev_type = __ATTR(type, 0444, cdev_type_show, NULL); static DEVICE_ATTR_RO(max_state); static DEVICE_ATTR_RW(cur_state); static struct attribute *cooling_device_attrs[] = { &dev_attr_cdev_type.attr, &dev_attr_max_state.attr, &dev_attr_cur_state.attr, NULL, }; static const struct attribute_group cooling_device_attr_group = { .attrs = cooling_device_attrs, }; static const struct attribute_group *cooling_device_attr_groups[] = { &cooling_device_attr_group, NULL, /* Space allocated for cooling_device_stats_attr_group */ NULL, }; #ifdef CONFIG_THERMAL_STATISTICS struct cooling_dev_stats { spinlock_t lock; unsigned int total_trans; unsigned long state; ktime_t last_time; ktime_t *time_in_state; unsigned int *trans_table; }; static void update_time_in_state(struct cooling_dev_stats *stats) { ktime_t now = ktime_get(), delta; delta = ktime_sub(now, stats->last_time); stats->time_in_state[stats->state] = ktime_add(stats->time_in_state[stats->state], delta); stats->last_time = now; } void thermal_cooling_device_stats_update(struct thermal_cooling_device *cdev, unsigned long new_state) { struct cooling_dev_stats *stats = cdev->stats; lockdep_assert_held(&cdev->lock); if (!stats) return; spin_lock(&stats->lock); if (stats->state == new_state) goto unlock; update_time_in_state(stats); stats->trans_table[stats->state * (cdev->max_state + 1) + new_state]++; stats->state = new_state; stats->total_trans++; unlock: spin_unlock(&stats->lock); } static ssize_t total_trans_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_cooling_device *cdev = to_cooling_device(dev); struct cooling_dev_stats *stats; int ret; guard(cooling_dev)(cdev); stats = cdev->stats; if (!stats) return 0; spin_lock(&stats->lock); ret = sysfs_emit(buf, "%u\n", stats->total_trans); spin_unlock(&stats->lock); return ret; } static ssize_t time_in_state_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_cooling_device *cdev = to_cooling_device(dev); struct cooling_dev_stats *stats; ssize_t len = 0; int i; guard(cooling_dev)(cdev); stats = cdev->stats; if (!stats) return 0; spin_lock(&stats->lock); update_time_in_state(stats); for (i = 0; i <= cdev->max_state; i++) { len += sysfs_emit_at(buf, len, "state%u\t%llu\n", i, ktime_to_ms(stats->time_in_state[i])); } spin_unlock(&stats->lock); return len; } static ssize_t reset_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct thermal_cooling_device *cdev = to_cooling_device(dev); struct cooling_dev_stats *stats; int i, states; guard(cooling_dev)(cdev); stats = cdev->stats; if (!stats) return count; states = cdev->max_state + 1; spin_lock(&stats->lock); stats->total_trans = 0; stats->last_time = ktime_get(); memset(stats->trans_table, 0, states * states * sizeof(*stats->trans_table)); for (i = 0; i < states; i++) stats->time_in_state[i] = ktime_set(0, 0); spin_unlock(&stats->lock); return count; } static ssize_t trans_table_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_cooling_device *cdev = to_cooling_device(dev); struct cooling_dev_stats *stats; ssize_t len = 0; int i, j; guard(cooling_dev)(cdev); stats = cdev->stats; if (!stats) return -ENODATA; len += snprintf(buf + len, PAGE_SIZE - len, " From : To\n"); len += snprintf(buf + len, PAGE_SIZE - len, " : "); for (i = 0; i <= cdev->max_state; i++) { if (len >= PAGE_SIZE) break; len += snprintf(buf + len, PAGE_SIZE - len, "state%2u ", i); } if (len >= PAGE_SIZE) return PAGE_SIZE; len += snprintf(buf + len, PAGE_SIZE - len, "\n"); for (i = 0; i <= cdev->max_state; i++) { if (len >= PAGE_SIZE) break; len += snprintf(buf + len, PAGE_SIZE - len, "state%2u:", i); for (j = 0; j <= cdev->max_state; j++) { if (len >= PAGE_SIZE) break; len += snprintf(buf + len, PAGE_SIZE - len, "%8u ", stats->trans_table[i * (cdev->max_state + 1) + j]); } if (len >= PAGE_SIZE) break; len += snprintf(buf + len, PAGE_SIZE - len, "\n"); } if (len >= PAGE_SIZE) { pr_warn_once("Thermal transition table exceeds PAGE_SIZE. Disabling\n"); len = -EFBIG; } return len; } static DEVICE_ATTR_RO(total_trans); static DEVICE_ATTR_RO(time_in_state_ms); static DEVICE_ATTR_WO(reset); static DEVICE_ATTR_RO(trans_table); static struct attribute *cooling_device_stats_attrs[] = { &dev_attr_total_trans.attr, &dev_attr_time_in_state_ms.attr, &dev_attr_reset.attr, &dev_attr_trans_table.attr, NULL }; static const struct attribute_group cooling_device_stats_attr_group = { .attrs = cooling_device_stats_attrs, .name = "stats" }; static void cooling_device_stats_setup(struct thermal_cooling_device *cdev) { const struct attribute_group *stats_attr_group = NULL; struct cooling_dev_stats *stats; /* Total number of states is highest state + 1 */ unsigned long states = cdev->max_state + 1; int var; var = sizeof(*stats); var += sizeof(*stats->time_in_state) * states; var += sizeof(*stats->trans_table) * states * states; stats = kzalloc(var, GFP_KERNEL); if (!stats) goto out; stats->time_in_state = (ktime_t *)(stats + 1); stats->trans_table = (unsigned int *)(stats->time_in_state + states); cdev->stats = stats; stats->last_time = ktime_get(); spin_lock_init(&stats->lock); stats_attr_group = &cooling_device_stats_attr_group; out: /* Fill the empty slot left in cooling_device_attr_groups */ var = ARRAY_SIZE(cooling_device_attr_groups) - 2; cooling_device_attr_groups[var] = stats_attr_group; } static void cooling_device_stats_destroy(struct thermal_cooling_device *cdev) { kfree(cdev->stats); cdev->stats = NULL; } #else static inline void cooling_device_stats_setup(struct thermal_cooling_device *cdev) {} static inline void cooling_device_stats_destroy(struct thermal_cooling_device *cdev) {} #endif /* CONFIG_THERMAL_STATISTICS */ void thermal_cooling_device_setup_sysfs(struct thermal_cooling_device *cdev) { cooling_device_stats_setup(cdev); cdev->device.groups = cooling_device_attr_groups; } void thermal_cooling_device_destroy_sysfs(struct thermal_cooling_device *cdev) { cooling_device_stats_destroy(cdev); } void thermal_cooling_device_stats_reinit(struct thermal_cooling_device *cdev) { lockdep_assert_held(&cdev->lock); cooling_device_stats_destroy(cdev); cooling_device_stats_setup(cdev); } /* these helper will be used only at the time of bindig */ ssize_t trip_point_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_zone_device *tz = to_thermal_zone(dev); struct thermal_instance *instance; instance = container_of(attr, struct thermal_instance, attr); return sysfs_emit(buf, "%d\n", thermal_zone_trip_id(tz, instance->trip)); } ssize_t weight_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_instance *instance; instance = container_of(attr, struct thermal_instance, weight_attr); return sysfs_emit(buf, "%d\n", instance->weight); } ssize_t weight_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct thermal_zone_device *tz = to_thermal_zone(dev); struct thermal_instance *instance; int ret, weight; ret = kstrtoint(buf, 0, &weight); if (ret) return ret; instance = container_of(attr, struct thermal_instance, weight_attr); /* Don't race with governors using the 'weight' value */ guard(thermal_zone)(tz); instance->weight = weight; thermal_governor_update_tz(tz, THERMAL_INSTANCE_WEIGHT_CHANGED); return count; } |
| 3 3 6 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 94 94 94 231 231 231 231 231 6 6 6 6 6 231 231 231 231 231 231 349 350 349 350 349 1 1 1 129 129 130 9 4 4 4 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 | // SPDX-License-Identifier: GPL-2.0 /* * kernel/cpuset.c * * Processor and Memory placement constraints for sets of tasks. * * Copyright (C) 2003 BULL SA. * Copyright (C) 2004-2007 Silicon Graphics, Inc. * Copyright (C) 2006 Google, Inc * * Portions derived from Patrick Mochel's sysfs code. * sysfs is Copyright (c) 2001-3 Patrick Mochel * * 2003-10-10 Written by Simon Derr. * 2003-10-22 Updates by Stephen Hemminger. * 2004 May-July Rework by Paul Jackson. * 2006 Rework by Paul Menage to use generic cgroups * 2008 Rework of the scheduler domains and CPU hotplug handling * by Max Krasnyansky */ #include "cpuset-internal.h" #include <linux/init.h> #include <linux/interrupt.h> #include <linux/kernel.h> #include <linux/mempolicy.h> #include <linux/mm.h> #include <linux/memory.h> #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/sched/deadline.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> #include <linux/security.h> #include <linux/oom.h> #include <linux/sched/isolation.h> #include <linux/wait.h> #include <linux/workqueue.h> #include <linux/task_work.h> DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key); DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); /* * There could be abnormal cpuset configurations for cpu or memory * node binding, add this key to provide a quick low-cost judgment * of the situation. */ DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key); static const char * const perr_strings[] = { [PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus.exclusive", [PERR_INVPARENT] = "Parent is an invalid partition root", [PERR_NOTPART] = "Parent is not a partition root", [PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive", [PERR_NOCPUS] = "Parent unable to distribute cpu downstream", [PERR_HOTPLUG] = "No cpu available due to hotplug", [PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty", [PERR_HKEEPING] = "partition config conflicts with housekeeping setup", [PERR_ACCESS] = "Enable partition not permitted", [PERR_REMOTE] = "Have remote partition underneath", }; /* * CPUSET Locking Convention * ------------------------- * * Below are the four global/local locks guarding cpuset structures in lock * acquisition order: * - cpuset_top_mutex * - cpu_hotplug_lock (cpus_read_lock/cpus_write_lock) * - cpuset_mutex * - callback_lock (raw spinlock) * * As cpuset will now indirectly flush a number of different workqueues in * housekeeping_update() to update housekeeping cpumasks when the set of * isolated CPUs is going to be changed, it may be vulnerable to deadlock * if we hold cpus_read_lock while calling into housekeeping_update(). * * The first cpuset_top_mutex will be held except when calling into * cpuset_handle_hotplug() from the CPU hotplug code where cpus_write_lock * and cpuset_mutex will be held instead. The main purpose of this mutex * is to prevent regular cpuset control file write actions from interfering * with the call to housekeeping_update(), though CPU hotplug operation can * still happen in parallel. This mutex also provides protection for some * internal variables. * * A task must hold all the remaining three locks to modify externally visible * or used fields of cpusets, though some of the internally used cpuset fields * and internal variables can be modified without holding callback_lock. If only * reliable read access of the externally used fields are needed, a task can * hold either cpuset_mutex or callback_lock which are exposed to other * external subsystems. * * If a task holds cpu_hotplug_lock and cpuset_mutex, it blocks others, * ensuring that it is the only task able to also acquire callback_lock and * be able to modify cpusets. It can perform various checks on the cpuset * structure first, knowing nothing will change. It can also allocate memory * without holding callback_lock. While it is performing these checks, various * callback routines can briefly acquire callback_lock to query cpusets. Once * it is ready to make the changes, it takes callback_lock, blocking everyone * else. * * Calls to the kernel memory allocator cannot be made while holding * callback_lock which is a spinlock, as the memory allocator may sleep or * call back into cpuset code and acquire callback_lock. * * Now, the task_struct fields mems_allowed and mempolicy may be changed * by other task, we use alloc_lock in the task_struct fields to protect * them. * * The cpuset_common_seq_show() handlers only hold callback_lock across * small pieces of code, such as when reading out possibly multi-word * cpumasks and nodemasks. */ static DEFINE_MUTEX(cpuset_top_mutex); static DEFINE_MUTEX(cpuset_mutex); /* * File level internal variables below follow one of the following exclusion * rules. * * RWCS: Read/write-able by holding either cpus_write_lock (and optionally * cpuset_mutex) or both cpus_read_lock and cpuset_mutex. * * CSCB: Readable by holding either cpuset_mutex or callback_lock. Writable * by holding both cpuset_mutex and callback_lock. * * T: Read/write-able by holding the cpuset_top_mutex. */ /* * For local partitions, update to subpartitions_cpus & isolated_cpus is done * in update_parent_effective_cpumask(). For remote partitions, it is done in * the remote_partition_*() and remote_cpus_update() helpers. */ /* * Exclusive CPUs distributed out to local or remote sub-partitions of * top_cpuset */ static cpumask_var_t subpartitions_cpus; /* RWCS */ /* * Exclusive CPUs in isolated partitions (shown in cpuset.cpus.isolated) */ static cpumask_var_t isolated_cpus; /* CSCB */ /* * Set if housekeeping cpumasks are to be updated. */ static bool update_housekeeping; /* RWCS */ /* * Copy of isolated_cpus to be passed to housekeeping_update() */ static cpumask_var_t isolated_hk_cpus; /* T */ /* * A flag to force sched domain rebuild at the end of an operation. * It can be set in * - update_partition_sd_lb() * - update_cpumasks_hier() * - cpuset_update_flag() * - cpuset_hotplug_update_tasks() * - cpuset_handle_hotplug() * * Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock. * * Note that update_relax_domain_level() in cpuset-v1.c can still call * rebuild_sched_domains_locked() directly without using this flag. */ static bool force_sd_rebuild; /* RWCS */ /* * Partition root states: * * 0 - member (not a partition root) * 1 - partition root * 2 - partition root without load balancing (isolated) * -1 - invalid partition root * -2 - invalid isolated partition root * * There are 2 types of partitions - local or remote. Local partitions are * those whose parents are partition root themselves. Setting of * cpuset.cpus.exclusive are optional in setting up local partitions. * Remote partitions are those whose parents are not partition roots. Passing * down exclusive CPUs by setting cpuset.cpus.exclusive along its ancestor * nodes are mandatory in creating a remote partition. * * For simplicity, a local partition can be created under a local or remote * partition but a remote partition cannot have any partition root in its * ancestor chain except the cgroup root. * * A valid partition can be formed by setting exclusive_cpus or cpus_allowed * if exclusive_cpus is not set. In the case of partition with empty * exclusive_cpus, all the conflicting exclusive CPUs specified in the * following cpumasks of sibling cpusets will be removed from its * cpus_allowed in determining its effective_xcpus. * - effective_xcpus * - exclusive_cpus * * The "cpuset.cpus.exclusive" control file should be used for setting up * partition if the users want to get as many CPUs as possible. */ #define PRS_MEMBER 0 #define PRS_ROOT 1 #define PRS_ISOLATED 2 #define PRS_INVALID_ROOT -1 #define PRS_INVALID_ISOLATED -2 /* * Temporary cpumasks for working with partitions that are passed among * functions to avoid memory allocation in inner functions. */ struct tmpmasks { cpumask_var_t addmask, delmask; /* For partition root */ cpumask_var_t new_cpus; /* For update_cpumasks_hier() */ }; void inc_dl_tasks_cs(struct task_struct *p) { struct cpuset *cs = task_cs(p); cs->nr_deadline_tasks++; } void dec_dl_tasks_cs(struct task_struct *p) { struct cpuset *cs = task_cs(p); cs->nr_deadline_tasks--; } static inline bool is_partition_valid(const struct cpuset *cs) { return cs->partition_root_state > 0; } static inline bool is_partition_invalid(const struct cpuset *cs) { return cs->partition_root_state < 0; } static inline bool cs_is_member(const struct cpuset *cs) { return cs->partition_root_state == PRS_MEMBER; } /* * Callers should hold callback_lock to modify partition_root_state. */ static inline void make_partition_invalid(struct cpuset *cs) { if (cs->partition_root_state > 0) cs->partition_root_state = -cs->partition_root_state; } /* * Send notification event of whenever partition_root_state changes. */ static inline void notify_partition_change(struct cpuset *cs, int old_prs) { if (old_prs == cs->partition_root_state) return; cgroup_file_notify(&cs->partition_file); /* Reset prs_err if not invalid */ if (is_partition_valid(cs)) WRITE_ONCE(cs->prs_err, PERR_NONE); } /* * The top_cpuset is always synchronized to cpu_active_mask and we should avoid * using cpu_online_mask as much as possible. An active CPU is always an online * CPU, but not vice versa. cpu_active_mask and cpu_online_mask can differ * during hotplug operations. A CPU is marked active at the last stage of CPU * bringup (CPUHP_AP_ACTIVE). It is also the stage where cpuset hotplug code * will be called to update the sched domains so that the scheduler can move * a normal task to a newly active CPU or remove tasks away from a newly * inactivated CPU. The online bit is set much earlier in the CPU bringup * process and cleared much later in CPU teardown. * * If cpu_online_mask is used while a hotunplug operation is happening in * parallel, we may leave an offline CPU in cpu_allowed or some other masks. */ struct cpuset top_cpuset = { .flags = BIT(CS_CPU_EXCLUSIVE) | BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), .partition_root_state = PRS_ROOT, .dl_bw_cpu = -1, }; /** * cpuset_lock - Acquire the global cpuset mutex * * This locks the global cpuset mutex to prevent modifications to cpuset * hierarchy and configurations. This helper is not enough to make modification. */ void cpuset_lock(void) { mutex_lock(&cpuset_mutex); } void cpuset_unlock(void) { mutex_unlock(&cpuset_mutex); } void lockdep_assert_cpuset_lock_held(void) { lockdep_assert_held(&cpuset_mutex); } /** * cpuset_full_lock - Acquire full protection for cpuset modification * * Takes both CPU hotplug read lock (cpus_read_lock()) and cpuset mutex * to safely modify cpuset data. */ void cpuset_full_lock(void) { mutex_lock(&cpuset_top_mutex); cpus_read_lock(); mutex_lock(&cpuset_mutex); } void cpuset_full_unlock(void) { mutex_unlock(&cpuset_mutex); cpus_read_unlock(); mutex_unlock(&cpuset_top_mutex); } #ifdef CONFIG_LOCKDEP bool lockdep_is_cpuset_held(void) { return lockdep_is_held(&cpuset_mutex) || lockdep_is_held(&cpuset_top_mutex); } #endif static DEFINE_SPINLOCK(callback_lock); void cpuset_callback_lock_irq(void) { spin_lock_irq(&callback_lock); } void cpuset_callback_unlock_irq(void) { spin_unlock_irq(&callback_lock); } static struct workqueue_struct *cpuset_migrate_mm_wq; static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); static inline void check_insane_mems_config(nodemask_t *nodes) { if (!cpusets_insane_config() && movable_only_nodes(nodes)) { static_branch_enable_cpuslocked(&cpusets_insane_config_key); pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n" "Cpuset allocations might fail even with a lot of memory available.\n", nodemask_pr_args(nodes)); } } /* * decrease cs->attach_in_progress. * wake_up cpuset_attach_wq if cs->attach_in_progress==0. */ static inline void dec_attach_in_progress_locked(struct cpuset *cs) { lockdep_assert_cpuset_lock_held(); cs->attach_in_progress--; if (!cs->attach_in_progress) wake_up(&cpuset_attach_wq); } static inline void dec_attach_in_progress(struct cpuset *cs) { mutex_lock(&cpuset_mutex); dec_attach_in_progress_locked(cs); mutex_unlock(&cpuset_mutex); } static inline bool cpuset_v2(void) { return !IS_ENABLED(CONFIG_CPUSETS_V1) || cgroup_subsys_on_dfl(cpuset_cgrp_subsys); } /* * Cgroup v2 behavior is used on the "cpus" and "mems" control files when * on default hierarchy or when the cpuset_v2_mode flag is set by mounting * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option. * With v2 behavior, "cpus" and "mems" are always what the users have * requested and won't be changed by hotplug events. Only the effective * cpus or mems will be affected. */ static inline bool is_in_v2_mode(void) { return cpuset_v2() || (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE); } /** * partition_is_populated - check if partition has tasks * @cs: partition root to be checked * @excluded_child: a child cpuset to be excluded in task checking * Return: true if there are tasks, false otherwise * * @cs should be a valid partition root or going to become a partition root. * @excluded_child should be non-NULL when this cpuset is going to become a * partition itself. * * Note that a remote partition is not allowed underneath a valid local * or remote partition. So if a non-partition root child is populated, * the whole partition is considered populated. */ static inline bool partition_is_populated(struct cpuset *cs, struct cpuset *excluded_child) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; /* * We cannot call cs_is_populated(cs) directly, as * nr_populated_domain_children may include populated * csets from descendants that are partitions. */ if (cs->css.cgroup->nr_populated_csets || cs->attach_in_progress) return true; rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, cs) { if (cp == cs || cp == excluded_child) continue; if (is_partition_valid(cp)) { pos_css = css_rightmost_descendant(pos_css); continue; } if (cpuset_is_populated(cp)) { rcu_read_unlock(); return true; } } rcu_read_unlock(); return false; } /* * Return in pmask the portion of a task's cpusets's cpus_allowed that * are online and are capable of running the task. If none are found, * walk up the cpuset hierarchy until we find one that does have some * appropriate cpus. * * One way or another, we guarantee to return some non-empty subset * of cpu_active_mask. * * Call with callback_lock or cpuset_mutex held. */ static void guarantee_active_cpus(struct task_struct *tsk, struct cpumask *pmask) { const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); struct cpuset *cs; if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_active_mask))) cpumask_copy(pmask, cpu_active_mask); rcu_read_lock(); cs = task_cs(tsk); while (!cpumask_intersects(cs->effective_cpus, pmask)) cs = parent_cs(cs); cpumask_and(pmask, pmask, cs->effective_cpus); rcu_read_unlock(); } /* * Return in *pmask the portion of a cpusets's mems_allowed that * are online, with memory. If none are online with memory, walk * up the cpuset hierarchy until we find one that does have some * online mems. The top cpuset always has some mems online. * * One way or another, we guarantee to return some non-empty subset * of node_states[N_MEMORY]. * * Call with callback_lock or cpuset_mutex held. */ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) { while (!nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY])) cs = parent_cs(cs); } /** * alloc_cpumasks - Allocate an array of cpumask variables * @pmasks: Pointer to array of cpumask_var_t pointers * @size: Number of cpumasks to allocate * Return: 0 if successful, -ENOMEM otherwise. * * Allocates @size cpumasks and initializes them to empty. Returns 0 on * success, -ENOMEM on allocation failure. On failure, any previously * allocated cpumasks are freed. */ static inline int alloc_cpumasks(cpumask_var_t *pmasks[], u32 size) { int i; for (i = 0; i < size; i++) { if (!zalloc_cpumask_var(pmasks[i], GFP_KERNEL)) { while (--i >= 0) free_cpumask_var(*pmasks[i]); return -ENOMEM; } } return 0; } /** * alloc_tmpmasks - Allocate temporary cpumasks for cpuset operations. * @tmp: Pointer to tmpmasks structure to populate * Return: 0 on success, -ENOMEM on allocation failure */ static inline int alloc_tmpmasks(struct tmpmasks *tmp) { /* * Array of pointers to the three cpumask_var_t fields in tmpmasks. * Note: Array size must match actual number of masks (3) */ cpumask_var_t *pmask[3] = { &tmp->new_cpus, &tmp->addmask, &tmp->delmask }; return alloc_cpumasks(pmask, ARRAY_SIZE(pmask)); } /** * free_tmpmasks - free cpumasks in a tmpmasks structure * @tmp: the tmpmasks structure pointer */ static inline void free_tmpmasks(struct tmpmasks *tmp) { if (!tmp) return; free_cpumask_var(tmp->new_cpus); free_cpumask_var(tmp->addmask); free_cpumask_var(tmp->delmask); } /** * dup_or_alloc_cpuset - Duplicate or allocate a new cpuset * @cs: Source cpuset to duplicate (NULL for a fresh allocation) * * Creates a new cpuset by either: * 1. Duplicating an existing cpuset (if @cs is non-NULL), or * 2. Allocating a fresh cpuset with zero-initialized masks (if @cs is NULL) * * Return: Pointer to newly allocated cpuset on success, NULL on failure */ static struct cpuset *dup_or_alloc_cpuset(struct cpuset *cs) { struct cpuset *trial; /* Allocate base structure */ trial = cs ? kmemdup(cs, sizeof(*cs), GFP_KERNEL) : kzalloc_obj(*cs); if (!trial) return NULL; trial->dl_bw_cpu = -1; /* Setup cpumask pointer array */ cpumask_var_t *pmask[4] = { &trial->cpus_allowed, &trial->effective_cpus, &trial->effective_xcpus, &trial->exclusive_cpus }; if (alloc_cpumasks(pmask, ARRAY_SIZE(pmask))) { kfree(trial); return NULL; } /* Copy masks if duplicating */ if (cs) { cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); cpumask_copy(trial->effective_cpus, cs->effective_cpus); cpumask_copy(trial->effective_xcpus, cs->effective_xcpus); cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus); } return trial; } /** * free_cpuset - free the cpuset * @cs: the cpuset to be freed */ static inline void free_cpuset(struct cpuset *cs) { free_cpumask_var(cs->cpus_allowed); free_cpumask_var(cs->effective_cpus); free_cpumask_var(cs->effective_xcpus); free_cpumask_var(cs->exclusive_cpus); kfree(cs); } /* Return user specified exclusive CPUs */ static inline struct cpumask *user_xcpus(struct cpuset *cs) { return cpumask_empty(cs->exclusive_cpus) ? cs->cpus_allowed : cs->exclusive_cpus; } static inline bool xcpus_empty(struct cpuset *cs) { return cpumask_empty(cs->cpus_allowed) && cpumask_empty(cs->exclusive_cpus); } /* * cpusets_are_exclusive() - check if two cpusets are exclusive * * Return true if exclusive, false if not */ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2) { struct cpumask *xcpus1 = user_xcpus(cs1); struct cpumask *xcpus2 = user_xcpus(cs2); if (cpumask_intersects(xcpus1, xcpus2)) return false; return true; } /** * cpus_excl_conflict - Check if two cpusets have exclusive CPU conflicts * @trial: the trial cpuset to be checked * @sibling: a sibling cpuset to be checked against * @xcpus_changed: set if exclusive_cpus has been set * * Returns: true if CPU exclusivity conflict exists, false otherwise * * Conflict detection rules: * o cgroup v1 * See cpuset1_cpus_excl_conflict() * o cgroup v2 * - The exclusive_cpus values cannot overlap. * - New exclusive_cpus cannot be a superset of a sibling's cpus_allowed. */ static inline bool cpus_excl_conflict(struct cpuset *trial, struct cpuset *sibling, bool xcpus_changed) { if (!cpuset_v2()) return cpuset1_cpus_excl_conflict(trial, sibling); /* The cpus_allowed of a sibling cpuset cannot be a subset of the new exclusive_cpus */ if (xcpus_changed && !cpumask_empty(sibling->cpus_allowed) && cpumask_subset(sibling->cpus_allowed, trial->exclusive_cpus)) return true; /* Exclusive_cpus cannot intersect */ return cpumask_intersects(trial->exclusive_cpus, sibling->exclusive_cpus); } static inline bool mems_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) { if ((is_mem_exclusive(cs1) || is_mem_exclusive(cs2))) return nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); return false; } /* * validate_change() - Used to validate that any proposed cpuset change * follows the structural rules for cpusets. * * If we replaced the flag and mask values of the current cpuset * (cur) with those values in the trial cpuset (trial), would * our various subset and exclusive rules still be valid? Presumes * cpuset_mutex held. * * 'cur' is the address of an actual, in-use cpuset. Operations * such as list traversal that depend on the actual address of the * cpuset in the list must use cur below, not trial. * * 'trial' is the address of bulk structure copy of cur, with * perhaps one or more of the fields cpus_allowed, mems_allowed, * or flags changed to new, trial values. * * Return 0 if valid, -errno if not. */ static int validate_change(struct cpuset *cur, struct cpuset *trial) { struct cgroup_subsys_state *css; struct cpuset *c, *par; bool xcpus_changed; int ret = 0; rcu_read_lock(); if (!is_in_v2_mode()) ret = cpuset1_validate_change(cur, trial); if (ret) goto out; /* Remaining checks don't apply to root cpuset */ if (cur == &top_cpuset) goto out; par = parent_cs(cur); /* * We can't shrink if we won't have enough room for SCHED_DEADLINE * tasks. This check is not done when scheduling is disabled as the * users should know what they are doing. * * For v1, effective_cpus == cpus_allowed & user_xcpus() returns * cpus_allowed. * * For v2, is_cpu_exclusive() & is_sched_load_balance() are true only * for non-isolated partition root. At this point, the target * effective_cpus isn't computed yet. user_xcpus() is the best * approximation. * * TBD: May need to precompute the real effective_cpus here in case * incorrect scheduling of SCHED_DEADLINE tasks in a partition * becomes an issue. */ ret = -EBUSY; if (is_cpu_exclusive(cur) && is_sched_load_balance(cur) && !cpuset_cpumask_can_shrink(cur->effective_cpus, user_xcpus(trial))) goto out; /* * If either I or some sibling (!= me) is exclusive, we can't * overlap. exclusive_cpus cannot overlap with each other if set. */ ret = -EINVAL; xcpus_changed = !cpumask_equal(cur->exclusive_cpus, trial->exclusive_cpus); cpuset_for_each_child(c, css, par) { if (c == cur) continue; if (cpus_excl_conflict(trial, c, xcpus_changed)) goto out; if (mems_excl_conflict(trial, c)) goto out; } ret = 0; out: rcu_read_unlock(); return ret; } #ifdef CONFIG_SMP /* * generate_sched_domains() * * This function builds a partial partition of the systems CPUs * A 'partial partition' is a set of non-overlapping subsets whose * union is a subset of that set. * The output of this function needs to be passed to kernel/sched/core.c * partition_sched_domains() routine, which will rebuild the scheduler's * load balancing domains (sched domains) as specified by that partial * partition. * * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst * for a background explanation of this. * * Does not return errors, on the theory that the callers of this * routine would rather not worry about failures to rebuild sched * domains when operating in the severe memory shortage situations * that could cause allocation failures below. * * Must be called with cpuset_mutex held. * * The three key local variables below are: * cp - cpuset pointer, used (together with pos_css) to perform a * top-down scan of all cpusets. For our purposes, rebuilding * the schedulers sched domains, we can ignore !is_sched_load_ * balance cpusets. * csa - (for CpuSet Array) Array of pointers to all the cpusets * that need to be load balanced, for convenient iterative * access by the subsequent code that finds the best partition, * i.e the set of domains (subsets) of CPUs such that the * cpus_allowed of every cpuset marked is_sched_load_balance * is a subset of one of these domains, while there are as * many such domains as possible, each as small as possible. * doms - Conversion of 'csa' to an array of cpumasks, for passing to * the kernel/sched/core.c routine partition_sched_domains() in a * convenient format, that can be easily compared to the prior * value to determine what partition elements (sched domains) * were changed (added or removed.) */ static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes) { struct cpuset *cp; /* top-down scan of cpusets */ struct cpuset **csa; /* array of all cpuset ptrs */ int i, j; /* indices for partition finding loops */ cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ struct cgroup_subsys_state *pos_css; if (!cpuset_v2()) return cpuset1_generate_sched_domains(domains, attributes); doms = NULL; dattr = NULL; csa = NULL; /* Special case for the 99% of systems with one, full, sched domain */ if (cpumask_empty(subpartitions_cpus)) { ndoms = 1; /* !csa will be checked and can be correctly handled */ goto generate_doms; } csa = kmalloc_objs(cp, nr_cpusets()); if (!csa) goto done; /* Find how many partitions and cache them to csa[] */ rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { /* * Only valid partition roots that are not isolated and with * non-empty effective_cpus will be saved into csa[]. */ if ((cp->partition_root_state == PRS_ROOT) && !cpumask_empty(cp->effective_cpus)) csa[ndoms++] = cp; /* * Skip @cp's subtree if not a partition root and has no * exclusive CPUs to be granted to child cpusets. */ if (!is_partition_valid(cp) && cpumask_empty(cp->exclusive_cpus)) pos_css = css_rightmost_descendant(pos_css); } rcu_read_unlock(); for (i = 0; i < ndoms; i++) { for (j = i + 1; j < ndoms; j++) { if (cpusets_overlap(csa[i], csa[j])) /* * Cgroup v2 shouldn't pass down overlapping * partition root cpusets. */ WARN_ON_ONCE(1); } } generate_doms: doms = alloc_sched_domains(ndoms); if (!doms) goto done; /* * The rest of the code, including the scheduler, can deal with * dattr==NULL case. No need to abort if alloc fails. */ dattr = kmalloc_objs(struct sched_domain_attr, ndoms); /* * Cgroup v2 doesn't support domain attributes, just set all of them * to SD_ATTR_INIT. Also non-isolating partition root CPUs are a * subset of HK_TYPE_DOMAIN_BOOT housekeeping CPUs. */ for (i = 0; i < ndoms; i++) { /* * The top cpuset may contain some boot time isolated * CPUs that need to be excluded from the sched domain. */ if (!csa || csa[i] == &top_cpuset) cpumask_and(doms[i], top_cpuset.effective_cpus, housekeeping_cpumask(HK_TYPE_DOMAIN_BOOT)); else cpumask_copy(doms[i], csa[i]->effective_cpus); if (dattr) dattr[i] = SD_ATTR_INIT; } done: kfree(csa); /* * Fallback to the default domain if kmalloc() failed. * See comments in partition_sched_domains(). */ if (doms == NULL) ndoms = 1; *domains = doms; *attributes = dattr; return ndoms; } static void dl_update_tasks_root_domain(struct cpuset *cs) { struct css_task_iter it; struct task_struct *task; if (cs->nr_deadline_tasks == 0) return; css_task_iter_start(&cs->css, 0, &it); while ((task = css_task_iter_next(&it))) dl_add_task_root_domain(task); css_task_iter_end(&it); } void dl_rebuild_rd_accounting(void) { struct cpuset *cs = NULL; struct cgroup_subsys_state *pos_css; int cpu; u64 cookie = ++dl_cookie; lockdep_assert_cpuset_lock_held(); lockdep_assert_cpus_held(); lockdep_assert_held(&sched_domains_mutex); rcu_read_lock(); for_each_possible_cpu(cpu) { if (dl_bw_visited(cpu, cookie)) continue; dl_clear_root_domain_cpu(cpu); } cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { if (cpumask_empty(cs->effective_cpus)) { pos_css = css_rightmost_descendant(pos_css); continue; } css_get(&cs->css); rcu_read_unlock(); dl_update_tasks_root_domain(cs); rcu_read_lock(); css_put(&cs->css); } rcu_read_unlock(); } /* * Rebuild scheduler domains. * * If the flag 'sched_load_balance' of any cpuset with non-empty * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset * which has that flag enabled, or if any cpuset with a non-empty * 'cpus' is removed, then call this routine to rebuild the * scheduler's dynamic sched domains. * * Call with cpuset_mutex held. Takes cpus_read_lock(). */ void rebuild_sched_domains_locked(void) { struct sched_domain_attr *attr; cpumask_var_t *doms; int ndoms; int i; lockdep_assert_cpus_held(); lockdep_assert_cpuset_lock_held(); force_sd_rebuild = false; /* Generate domain masks and attrs */ ndoms = generate_sched_domains(&doms, &attr); /* * cpuset_hotplug_workfn is invoked synchronously now, thus this * function should not race with CPU hotplug. And the effective CPUs * must not include any offline CPUs. Passing an offline CPU in the * doms to partition_sched_domains() will trigger a kernel panic. * * We perform a final check here: if the doms contains any * offline CPUs, a warning is emitted and we return directly to * prevent the panic. */ for (i = 0; doms && i < ndoms; i++) { if (WARN_ON_ONCE(!cpumask_subset(doms[i], cpu_active_mask))) return; } /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); } #else /* !CONFIG_SMP */ void rebuild_sched_domains_locked(void) { } #endif /* CONFIG_SMP */ static void rebuild_sched_domains_cpuslocked(void) { mutex_lock(&cpuset_mutex); rebuild_sched_domains_locked(); mutex_unlock(&cpuset_mutex); } void rebuild_sched_domains(void) { cpus_read_lock(); rebuild_sched_domains_cpuslocked(); cpus_read_unlock(); } void cpuset_reset_sched_domains(void) { mutex_lock(&cpuset_mutex); partition_sched_domains(1, NULL, NULL); mutex_unlock(&cpuset_mutex); } /** * cpuset_update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed * @new_cpus: the temp variable for the new effective_cpus mask * * Iterate through each task of @cs updating its cpus_allowed to the * effective cpuset's. As this function is called with cpuset_mutex held, * cpuset membership stays stable. * * For top_cpuset, task_cpu_possible_mask() is used instead of effective_cpus * to make sure all offline CPUs are also included as hotplug code won't * update cpumasks for tasks in top_cpuset. * * As task_cpu_possible_mask() can be task dependent in arm64, we have to * do cpu masking per task instead of doing it once for all. */ void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) { struct css_task_iter it; struct task_struct *task; bool top_cs = cs == &top_cpuset; css_task_iter_start(&cs->css, 0, &it); while ((task = css_task_iter_next(&it))) { const struct cpumask *possible_mask = task_cpu_possible_mask(task); if (top_cs) { /* * PF_KTHREAD tasks are handled by housekeeping. * PF_NO_SETAFFINITY tasks are ignored. */ if (task->flags & (PF_KTHREAD | PF_NO_SETAFFINITY)) continue; cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus); } else { cpumask_and(new_cpus, possible_mask, cs->effective_cpus); } set_cpus_allowed_ptr(task, new_cpus); } css_task_iter_end(&it); } /** * compute_effective_cpumask - Compute the effective cpumask of the cpuset * @new_cpus: the temp variable for the new effective_cpus mask * @cs: the cpuset the need to recompute the new effective_cpus mask * @parent: the parent cpuset * * The result is valid only if the given cpuset isn't a partition root. */ static void compute_effective_cpumask(struct cpumask *new_cpus, struct cpuset *cs, struct cpuset *parent) { cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus); } /* * Commands for update_parent_effective_cpumask */ enum partition_cmd { partcmd_enable, /* Enable partition root */ partcmd_enablei, /* Enable isolated partition root */ partcmd_disable, /* Disable partition root */ partcmd_update, /* Update parent's effective_cpus */ partcmd_invalidate, /* Make partition invalid */ }; static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, struct tmpmasks *tmp); /* * Update partition exclusive flag * * Return: 0 if successful, an error code otherwise */ static int update_partition_exclusive_flag(struct cpuset *cs, int new_prs) { bool exclusive = (new_prs > PRS_MEMBER); if (exclusive && !is_cpu_exclusive(cs)) { if (cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 1)) return PERR_NOTEXCL; } else if (!exclusive && is_cpu_exclusive(cs)) { /* Turning off CS_CPU_EXCLUSIVE will not return error */ cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 0); } return 0; } /* * Update partition load balance flag and/or rebuild sched domain * * Changing load balance flag will automatically call * rebuild_sched_domains_locked(). * This function is for cgroup v2 only. */ static void update_partition_sd_lb(struct cpuset *cs, int old_prs) { int new_prs = cs->partition_root_state; bool rebuild_domains = (new_prs > 0) || (old_prs > 0); bool new_lb; /* * If cs is not a valid partition root, the load balance state * will follow its parent. */ if (new_prs > 0) { new_lb = (new_prs != PRS_ISOLATED); } else { new_lb = is_sched_load_balance(parent_cs(cs)); } if (new_lb != !!is_sched_load_balance(cs)) { rebuild_domains = true; if (new_lb) set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); else clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); } if (rebuild_domains) cpuset_force_rebuild(); } /* * tasks_nocpu_error - Return true if tasks will have no effective_cpus */ static bool tasks_nocpu_error(struct cpuset *parent, struct cpuset *cs, struct cpumask *xcpus) { /* * A populated partition (cs or parent) can't have empty effective_cpus */ return (cpumask_subset(parent->effective_cpus, xcpus) && partition_is_populated(parent, cs)) || (!cpumask_intersects(xcpus, cpu_active_mask) && partition_is_populated(cs, NULL)); } static void reset_partition_data(struct cpuset *cs) { struct cpuset *parent = parent_cs(cs); if (!cpuset_v2()) return; lockdep_assert_held(&callback_lock); if (cpumask_empty(cs->exclusive_cpus)) { cpumask_clear(cs->effective_xcpus); if (is_cpu_exclusive(cs)) clear_bit(CS_CPU_EXCLUSIVE, &cs->flags); } if (!cpumask_and(cs->effective_cpus, parent->effective_cpus, cs->cpus_allowed)) cpumask_copy(cs->effective_cpus, parent->effective_cpus); } /* * isolated_cpus_update - Update the isolated_cpus mask * @old_prs: old partition_root_state * @new_prs: new partition_root_state * @xcpus: exclusive CPUs with state change */ static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus) { WARN_ON_ONCE(old_prs == new_prs); lockdep_assert_held(&callback_lock); lockdep_assert_held(&cpuset_mutex); if (new_prs == PRS_ISOLATED) { if (cpumask_subset(xcpus, isolated_cpus)) return; cpumask_or(isolated_cpus, isolated_cpus, xcpus); } else { if (!cpumask_intersects(xcpus, isolated_cpus)) return; cpumask_andnot(isolated_cpus, isolated_cpus, xcpus); } update_housekeeping = true; } /* * partition_xcpus_add - Add new exclusive CPUs to partition * @new_prs: new partition_root_state * @parent: parent cpuset * @xcpus: exclusive CPUs to be added * * Remote partition if parent == NULL */ static void partition_xcpus_add(int new_prs, struct cpuset *parent, struct cpumask *xcpus) { WARN_ON_ONCE(new_prs < 0); lockdep_assert_held(&callback_lock); if (!parent) parent = &top_cpuset; if (parent == &top_cpuset) cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus); if (new_prs != parent->partition_root_state) isolated_cpus_update(parent->partition_root_state, new_prs, xcpus); cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus); } /* * partition_xcpus_del - Remove exclusive CPUs from partition * @old_prs: old partition_root_state * @parent: parent cpuset * @xcpus: exclusive CPUs to be removed * * Remote partition if parent == NULL */ static void partition_xcpus_del(int old_prs, struct cpuset *parent, struct cpumask *xcpus) { WARN_ON_ONCE(old_prs < 0); lockdep_assert_held(&callback_lock); if (!parent) parent = &top_cpuset; if (parent == &top_cpuset) cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus); if (old_prs != parent->partition_root_state) isolated_cpus_update(old_prs, parent->partition_root_state, xcpus); cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus); cpumask_and(parent->effective_cpus, parent->effective_cpus, cpu_active_mask); } /* * isolated_cpus_can_update - check for isolated & nohz_full conflicts * @add_cpus: cpu mask for cpus that are going to be isolated * @del_cpus: cpu mask for cpus that are no longer isolated, can be NULL * Return: false if there is conflict, true otherwise * * If nohz_full is enabled and we have isolated CPUs, their combination must * still leave housekeeping CPUs. * * TBD: Should consider merging this function into * prstate_housekeeping_conflict(). */ static bool isolated_cpus_can_update(struct cpumask *add_cpus, struct cpumask *del_cpus) { cpumask_var_t full_hk_cpus; int res = true; if (!housekeeping_enabled(HK_TYPE_KERNEL_NOISE)) return true; if (del_cpus && cpumask_weight_and(del_cpus, housekeeping_cpumask(HK_TYPE_KERNEL_NOISE))) return true; if (!alloc_cpumask_var(&full_hk_cpus, GFP_KERNEL)) return false; cpumask_and(full_hk_cpus, housekeeping_cpumask(HK_TYPE_KERNEL_NOISE), housekeeping_cpumask(HK_TYPE_DOMAIN)); cpumask_andnot(full_hk_cpus, full_hk_cpus, isolated_cpus); cpumask_and(full_hk_cpus, full_hk_cpus, cpu_active_mask); if (!cpumask_weight_andnot(full_hk_cpus, add_cpus)) res = false; free_cpumask_var(full_hk_cpus); return res; } /* * prstate_housekeeping_conflict - check for partition & housekeeping conflicts * @prstate: partition root state to be checked * @new_cpus: cpu mask * Return: true if there is conflict, false otherwise * * CPUs outside of HK_TYPE_DOMAIN_BOOT, if defined, can only be used in an * isolated partition. */ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) { if (!housekeeping_enabled(HK_TYPE_DOMAIN_BOOT)) return false; if ((prstate != PRS_ISOLATED) && !cpumask_subset(new_cpus, housekeeping_cpumask(HK_TYPE_DOMAIN_BOOT))) return true; return false; } /* * cpuset_update_sd_hk_unlock - Rebuild sched domains, update HK & unlock * * Update housekeeping cpumasks and rebuild sched domains if necessary and * then do a cpuset_full_unlock(). * This should be called at the end of cpuset operation. */ static void cpuset_update_sd_hk_unlock(void) __releases(&cpuset_mutex) __releases(&cpuset_top_mutex) { /* force_sd_rebuild will be cleared in rebuild_sched_domains_locked() */ if (force_sd_rebuild) rebuild_sched_domains_locked(); if (update_housekeeping) { update_housekeeping = false; cpumask_copy(isolated_hk_cpus, isolated_cpus); /* * housekeeping_update() is now called without holding * cpus_read_lock and cpuset_mutex. Only cpuset_top_mutex * is still being held for mutual exclusion. */ mutex_unlock(&cpuset_mutex); cpus_read_unlock(); WARN_ON_ONCE(housekeeping_update(isolated_hk_cpus)); mutex_unlock(&cpuset_top_mutex); } else { cpuset_full_unlock(); } } /* * Work function to invoke cpuset_update_sd_hk_unlock() */ static void hk_sd_workfn(struct work_struct *work) { cpuset_full_lock(); cpuset_update_sd_hk_unlock(); } /** * rm_siblings_excl_cpus - Remove exclusive CPUs that are used by sibling cpusets * @parent: Parent cpuset containing all siblings * @cs: Current cpuset (will be skipped) * @excpus: exclusive effective CPU mask to modify * * This function ensures the given @excpus mask doesn't include any CPUs that * are exclusively allocated to sibling cpusets. It walks through all siblings * of @cs under @parent and removes their exclusive CPUs from @excpus. */ static int rm_siblings_excl_cpus(struct cpuset *parent, struct cpuset *cs, struct cpumask *excpus) { struct cgroup_subsys_state *css; struct cpuset *sibling; int retval = 0; if (cpumask_empty(excpus)) return 0; /* * Remove exclusive CPUs from siblings */ rcu_read_lock(); cpuset_for_each_child(sibling, css, parent) { struct cpumask *sibling_xcpus; if (sibling == cs) continue; /* * If exclusive_cpus is defined, effective_xcpus will always * be a subset. Otherwise, effective_xcpus will only be set * in a valid partition root. */ sibling_xcpus = cpumask_empty(sibling->exclusive_cpus) ? sibling->effective_xcpus : sibling->exclusive_cpus; if (cpumask_intersects(excpus, sibling_xcpus)) { cpumask_andnot(excpus, excpus, sibling_xcpus); retval++; } } rcu_read_unlock(); return retval; } /* * compute_excpus - compute effective exclusive CPUs * @cs: cpuset * @xcpus: effective exclusive CPUs value to be set * Return: 0 if there is no sibling conflict, > 0 otherwise * * If exclusive_cpus isn't explicitly set , we have to scan the sibling cpusets * and exclude their exclusive_cpus or effective_xcpus as well. */ static int compute_excpus(struct cpuset *cs, struct cpumask *excpus) { struct cpuset *parent = parent_cs(cs); cpumask_and(excpus, user_xcpus(cs), parent->effective_xcpus); if (!cpumask_empty(cs->exclusive_cpus)) return 0; return rm_siblings_excl_cpus(parent, cs, excpus); } /* * compute_trialcs_excpus - Compute effective exclusive CPUs for a trial cpuset * @trialcs: The trial cpuset containing the proposed new configuration * @cs: The original cpuset that the trial configuration is based on * Return: 0 if successful with no sibling conflict, >0 if a conflict is found * * Computes the effective_xcpus for a trial configuration. @cs is provided to represent * the real cs. */ static int compute_trialcs_excpus(struct cpuset *trialcs, struct cpuset *cs) { struct cpuset *parent = parent_cs(trialcs); struct cpumask *excpus = trialcs->effective_xcpus; /* trialcs is member, cpuset.cpus has no impact to excpus */ if (cs_is_member(cs)) cpumask_and(excpus, trialcs->exclusive_cpus, parent->effective_xcpus); else cpumask_and(excpus, user_xcpus(trialcs), parent->effective_xcpus); return rm_siblings_excl_cpus(parent, cs, excpus); } static inline bool is_remote_partition(struct cpuset *cs) { return cs->remote_partition; } static inline bool is_local_partition(struct cpuset *cs) { return is_partition_valid(cs) && !is_remote_partition(cs); } /* * remote_partition_enable - Enable current cpuset as a remote partition root * @cs: the cpuset to update * @new_prs: new partition_root_state * @tmp: temporary masks * Return: 0 if successful, errcode if error * * Enable the current cpuset to become a remote partition root taking CPUs * directly from the top cpuset. cpuset_mutex must be held by the caller. */ static int remote_partition_enable(struct cpuset *cs, int new_prs, struct tmpmasks *tmp) { /* * The user must have sysadmin privilege. */ if (!capable(CAP_SYS_ADMIN)) return PERR_ACCESS; /* * The requested exclusive_cpus must not be allocated to other * partitions and it can't use up all the root's effective_cpus. * * The effective_xcpus mask can contain offline CPUs, but there must * be at least one or more online CPUs present before it can be enabled. * * Note that creating a remote partition with any local partition root * above it or remote partition root underneath it is not allowed. */ compute_excpus(cs, tmp->new_cpus); WARN_ON_ONCE(cpumask_intersects(tmp->new_cpus, subpartitions_cpus)); if (!cpumask_intersects(tmp->new_cpus, cpu_active_mask) || cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus)) return PERR_INVCPUS; if (((new_prs == PRS_ISOLATED) && !isolated_cpus_can_update(tmp->new_cpus, NULL)) || prstate_housekeeping_conflict(new_prs, tmp->new_cpus)) return PERR_HKEEPING; spin_lock_irq(&callback_lock); partition_xcpus_add(new_prs, NULL, tmp->new_cpus); cs->remote_partition = true; cpumask_copy(cs->effective_xcpus, tmp->new_cpus); spin_unlock_irq(&callback_lock); cpuset_force_rebuild(); cs->prs_err = 0; /* * Propagate changes in top_cpuset's effective_cpus down the hierarchy. */ cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus); update_sibling_cpumasks(&top_cpuset, NULL, tmp); return 0; } /* * remote_partition_disable - Remove current cpuset from remote partition list * @cs: the cpuset to update * @tmp: temporary masks * * The effective_cpus is also updated. * * cpuset_mutex must be held by the caller. */ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) { WARN_ON_ONCE(!is_remote_partition(cs)); /* * When a CPU is offlined, top_cpuset may end up with no available CPUs, * which should clear subpartitions_cpus. We should not emit a warning for this * scenario: the hierarchy is updated from top to bottom, so subpartitions_cpus * may already be cleared when disabling the partition. */ WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus) && !cpumask_empty(subpartitions_cpus)); spin_lock_irq(&callback_lock); cs->remote_partition = false; partition_xcpus_del(cs->partition_root_state, NULL, cs->effective_xcpus); if (cs->prs_err) cs->partition_root_state = -cs->partition_root_state; else cs->partition_root_state = PRS_MEMBER; /* effective_xcpus may need to be changed */ compute_excpus(cs, cs->effective_xcpus); reset_partition_data(cs); spin_unlock_irq(&callback_lock); cpuset_force_rebuild(); /* * Propagate changes in top_cpuset's effective_cpus down the hierarchy. */ cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus); update_sibling_cpumasks(&top_cpuset, NULL, tmp); } /* * remote_cpus_update - cpus_exclusive change of remote partition * @cs: the cpuset to be updated * @xcpus: the new exclusive_cpus mask, if non-NULL * @excpus: the new effective_xcpus mask * @tmp: temporary masks * * top_cpuset and subpartitions_cpus will be updated or partition can be * invalidated. */ static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus, struct cpumask *excpus, struct tmpmasks *tmp) { bool adding, deleting; int prs = cs->partition_root_state; if (WARN_ON_ONCE(!is_remote_partition(cs))) return; WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus)); if (cpumask_empty(excpus)) { cs->prs_err = PERR_CPUSEMPTY; goto invalidate; } adding = cpumask_andnot(tmp->addmask, excpus, cs->effective_xcpus); deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, excpus); /* * Additions of remote CPUs is only allowed if those CPUs are * not allocated to other partitions and there are effective_cpus * left in the top cpuset. */ if (adding) { WARN_ON_ONCE(cpumask_intersects(tmp->addmask, subpartitions_cpus)); if (!capable(CAP_SYS_ADMIN)) cs->prs_err = PERR_ACCESS; else if (cpumask_intersects(tmp->addmask, subpartitions_cpus) || cpumask_subset(top_cpuset.effective_cpus, tmp->addmask)) cs->prs_err = PERR_NOCPUS; else if ((prs == PRS_ISOLATED) && !isolated_cpus_can_update(tmp->addmask, tmp->delmask)) cs->prs_err = PERR_HKEEPING; if (cs->prs_err) goto invalidate; } spin_lock_irq(&callback_lock); if (adding) partition_xcpus_add(prs, NULL, tmp->addmask); if (deleting) partition_xcpus_del(prs, NULL, tmp->delmask); /* * Need to update effective_xcpus and exclusive_cpus now as * update_sibling_cpumasks() below may iterate back to the same cs. */ cpumask_copy(cs->effective_xcpus, excpus); if (xcpus) cpumask_copy(cs->exclusive_cpus, xcpus); spin_unlock_irq(&callback_lock); if (adding || deleting) cpuset_force_rebuild(); /* * Propagate changes in top_cpuset's effective_cpus down the hierarchy. */ cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus); update_sibling_cpumasks(&top_cpuset, NULL, tmp); return; invalidate: remote_partition_disable(cs, tmp); } /** * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset * @cs: The cpuset that requests change in partition root state * @cmd: Partition root state change command * @newmask: Optional new cpumask for partcmd_update * @tmp: Temporary addmask and delmask * Return: 0 or a partition root state error code * * For partcmd_enable*, the cpuset is being transformed from a non-partition * root to a partition root. The effective_xcpus (cpus_allowed if * effective_xcpus not set) mask of the given cpuset will be taken away from * parent's effective_cpus. The function will return 0 if all the CPUs listed * in effective_xcpus can be granted or an error code will be returned. * * For partcmd_disable, the cpuset is being transformed from a partition * root back to a non-partition root. Any CPUs in effective_xcpus will be * given back to parent's effective_cpus. 0 will always be returned. * * For partcmd_update, if the optional newmask is specified, the cpu list is * to be changed from effective_xcpus to newmask. Otherwise, effective_xcpus is * assumed to remain the same. The cpuset should either be a valid or invalid * partition root. The partition root state may change from valid to invalid * or vice versa. An error code will be returned if transitioning from * invalid to valid violates the exclusivity rule. * * For partcmd_invalidate, the current partition will be made invalid. * * The partcmd_enable* and partcmd_disable commands are used by * update_prstate(). An error code may be returned and the caller will check * for error. * * The partcmd_update command is used by update_cpumasks_hier() with newmask * NULL and update_cpumask() with newmask set. The partcmd_invalidate is used * by update_cpumask() with NULL newmask. In both cases, the callers won't * check for error and so partition_root_state and prs_err will be updated * directly. */ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, struct cpumask *newmask, struct tmpmasks *tmp) { struct cpuset *parent = parent_cs(cs); int adding; /* Adding cpus to parent's effective_cpus */ int deleting; /* Deleting cpus from parent's effective_cpus */ int old_prs, new_prs; int part_error = PERR_NONE; /* Partition error? */ struct cpumask *xcpus = user_xcpus(cs); int parent_prs = parent->partition_root_state; bool nocpu; lockdep_assert_cpuset_lock_held(); WARN_ON_ONCE(is_remote_partition(cs)); /* For local partition only */ /* * new_prs will only be changed for the partcmd_update and * partcmd_invalidate commands. */ adding = deleting = false; old_prs = new_prs = cs->partition_root_state; if (cmd == partcmd_invalidate) { if (is_partition_invalid(cs)) return 0; /* * Make the current partition invalid. */ if (is_partition_valid(parent)) adding = cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus); if (old_prs > 0) new_prs = -old_prs; goto write_error; } /* * The parent must be a partition root. * The new cpumask, if present, or the current cpus_allowed must * not be empty. */ if (!is_partition_valid(parent)) { return is_partition_invalid(parent) ? PERR_INVPARENT : PERR_NOTPART; } if (!newmask && xcpus_empty(cs)) return PERR_CPUSEMPTY; nocpu = tasks_nocpu_error(parent, cs, xcpus); if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) { /* * Need to call compute_excpus() in case * exclusive_cpus not set. Sibling conflict should only happen * if exclusive_cpus isn't set. */ xcpus = tmp->delmask; if (compute_excpus(cs, xcpus)) WARN_ON_ONCE(!cpumask_empty(cs->exclusive_cpus)); new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED; /* * Enabling partition root is not allowed if its * effective_xcpus is empty. */ if (cpumask_empty(xcpus)) return PERR_INVCPUS; if (prstate_housekeeping_conflict(new_prs, xcpus)) return PERR_HKEEPING; if ((new_prs == PRS_ISOLATED) && (new_prs != parent_prs) && !isolated_cpus_can_update(xcpus, NULL)) return PERR_HKEEPING; if (tasks_nocpu_error(parent, cs, xcpus)) return PERR_NOCPUS; /* * This function will only be called when all the preliminary * checks have passed. At this point, the following condition * should hold. * * (cs->effective_xcpus & cpu_active_mask) ⊆ parent->effective_cpus * * Warn if it is not the case. */ cpumask_and(tmp->new_cpus, xcpus, cpu_active_mask); WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus)); deleting = true; } else if (cmd == partcmd_disable) { /* * May need to add cpus back to parent's effective_cpus * (and maybe removed from subpartitions_cpus/isolated_cpus) * for valid partition root. xcpus may contain CPUs that * shouldn't be removed from the two global cpumasks. */ if (is_partition_valid(cs)) { cpumask_copy(tmp->addmask, cs->effective_xcpus); adding = true; } new_prs = PRS_MEMBER; } else if (newmask) { /* * Empty cpumask is not allowed */ if (cpumask_empty(newmask)) { part_error = PERR_CPUSEMPTY; goto write_error; } /* Check newmask again, whether cpus are available for parent/cs */ nocpu |= tasks_nocpu_error(parent, cs, newmask); /* * partcmd_update with newmask: * * Compute add/delete mask to/from effective_cpus * * For valid partition: * addmask = exclusive_cpus & ~newmask * & parent->effective_xcpus * delmask = newmask & ~exclusive_cpus * & parent->effective_xcpus * * For invalid partition: * delmask = newmask & parent->effective_xcpus * The partition may become valid soon. */ if (is_partition_invalid(cs)) { adding = false; deleting = cpumask_and(tmp->delmask, newmask, parent->effective_xcpus); } else { cpumask_andnot(tmp->addmask, xcpus, newmask); adding = cpumask_and(tmp->addmask, tmp->addmask, parent->effective_xcpus); cpumask_andnot(tmp->delmask, newmask, xcpus); deleting = cpumask_and(tmp->delmask, tmp->delmask, parent->effective_xcpus); } /* * TBD: Invalidate a currently valid child root partition may * still break isolated_cpus_can_update() rule if parent is an * isolated partition. */ if (is_partition_valid(cs) && (old_prs != parent_prs)) { if ((parent_prs == PRS_ROOT) && /* Adding to parent means removing isolated CPUs */ !isolated_cpus_can_update(tmp->delmask, tmp->addmask)) part_error = PERR_HKEEPING; if ((parent_prs == PRS_ISOLATED) && /* Adding to parent means adding isolated CPUs */ !isolated_cpus_can_update(tmp->addmask, tmp->delmask)) part_error = PERR_HKEEPING; } /* * The new CPUs to be removed from parent's effective CPUs * must be present. */ if (deleting) { cpumask_and(tmp->new_cpus, tmp->delmask, cpu_active_mask); WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus)); } /* * Make partition invalid if parent's effective_cpus could * become empty and there are tasks in the parent. */ if (nocpu && (!adding || !cpumask_intersects(tmp->addmask, cpu_active_mask))) { part_error = PERR_NOCPUS; deleting = false; adding = cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus); } } else { /* * partcmd_update w/o newmask * * delmask = effective_xcpus & parent->effective_cpus * * This can be called from: * 1) update_cpumasks_hier() * 2) cpuset_hotplug_update_tasks() * * Check to see if it can be transitioned from valid to * invalid partition or vice versa. * * A partition error happens when parent has tasks and all * its effective CPUs will have to be distributed out. */ if (nocpu) { part_error = PERR_NOCPUS; if (is_partition_valid(cs)) adding = cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus); } else if (is_partition_invalid(cs) && !cpumask_empty(xcpus) && cpumask_subset(xcpus, parent->effective_xcpus)) { struct cgroup_subsys_state *css; struct cpuset *child; bool exclusive = true; /* * Convert invalid partition to valid has to * pass the cpu exclusivity test. */ rcu_read_lock(); cpuset_for_each_child(child, css, parent) { if (child == cs) continue; if (!cpusets_are_exclusive(cs, child)) { exclusive = false; break; } } rcu_read_unlock(); if (exclusive) deleting = cpumask_and(tmp->delmask, xcpus, parent->effective_cpus); else part_error = PERR_NOTEXCL; } } write_error: if (part_error) WRITE_ONCE(cs->prs_err, part_error); if (cmd == partcmd_update) { /* * Check for possible transition between valid and invalid * partition root. */ switch (cs->partition_root_state) { case PRS_ROOT: case PRS_ISOLATED: if (part_error) new_prs = -old_prs; break; case PRS_INVALID_ROOT: case PRS_INVALID_ISOLATED: if (!part_error) new_prs = -old_prs; break; } } if (!adding && !deleting && (new_prs == old_prs)) return 0; /* * Transitioning between invalid to valid or vice versa may require * changing CS_CPU_EXCLUSIVE. In the case of partcmd_update, * validate_change() has already been successfully called and * CPU lists in cs haven't been updated yet. So defer it to later. */ if ((old_prs != new_prs) && (cmd != partcmd_update)) { int err = update_partition_exclusive_flag(cs, new_prs); if (err) return err; } /* * Change the parent's effective_cpus & effective_xcpus (top cpuset * only). * * Newly added CPUs will be removed from effective_cpus and * newly deleted ones will be added back to effective_cpus. */ spin_lock_irq(&callback_lock); if (old_prs != new_prs) cs->partition_root_state = new_prs; /* * Adding to parent's effective_cpus means deletion CPUs from cs * and vice versa. */ if (adding) partition_xcpus_del(old_prs, parent, tmp->addmask); if (deleting) partition_xcpus_add(new_prs, parent, tmp->delmask); spin_unlock_irq(&callback_lock); if ((old_prs != new_prs) && (cmd == partcmd_update)) update_partition_exclusive_flag(cs, new_prs); if (adding || deleting) { cpuset_update_tasks_cpumask(parent, tmp->addmask); update_sibling_cpumasks(parent, cs, tmp); } /* * For partcmd_update without newmask, it is being called from * cpuset_handle_hotplug(). Update the load balance flag and * scheduling domain accordingly. */ if ((cmd == partcmd_update) && !newmask) update_partition_sd_lb(cs, old_prs); notify_partition_change(cs, old_prs); return 0; } /** * compute_partition_effective_cpumask - compute effective_cpus for partition * @cs: partition root cpuset * @new_ecpus: previously computed effective_cpus to be updated * * Compute the effective_cpus of a partition root by scanning effective_xcpus * of child partition roots and excluding their effective_xcpus. * * This has the side effect of invalidating valid child partition roots, * if necessary. Since it is called from either cpuset_hotplug_update_tasks() * or update_cpumasks_hier() where parent and children are modified * successively, we don't need to call update_parent_effective_cpumask() * and the child's effective_cpus will be updated in later iterations. * * Note that rcu_read_lock() is assumed to be held. */ static void compute_partition_effective_cpumask(struct cpuset *cs, struct cpumask *new_ecpus) { struct cgroup_subsys_state *css; struct cpuset *child; bool populated = partition_is_populated(cs, NULL); /* * Check child partition roots to see if they should be * invalidated when * 1) child effective_xcpus not a subset of new * excluisve_cpus * 2) All the effective_cpus will be used up and cp * has tasks */ compute_excpus(cs, new_ecpus); cpumask_and(new_ecpus, new_ecpus, cpu_active_mask); rcu_read_lock(); cpuset_for_each_child(child, css, cs) { if (!is_partition_valid(child)) continue; /* * There shouldn't be a remote partition underneath another * partition root. */ WARN_ON_ONCE(is_remote_partition(child)); child->prs_err = 0; if (!cpumask_subset(child->effective_xcpus, cs->effective_xcpus)) child->prs_err = PERR_INVCPUS; else if (populated && cpumask_subset(new_ecpus, child->effective_xcpus)) child->prs_err = PERR_NOCPUS; if (child->prs_err) { int old_prs = child->partition_root_state; /* * Invalidate child partition */ spin_lock_irq(&callback_lock); make_partition_invalid(child); spin_unlock_irq(&callback_lock); notify_partition_change(child, old_prs); continue; } cpumask_andnot(new_ecpus, new_ecpus, child->effective_xcpus); } rcu_read_unlock(); } /* * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree * @cs: the cpuset to consider * @tmp: temp variables for calculating effective_cpus & partition setup * @force: don't skip any descendant cpusets if set * * When configured cpumask is changed, the effective cpumasks of this cpuset * and all its descendants need to be updated. * * On legacy hierarchy, effective_cpus will be the same with cpu_allowed. * * Called with cpuset_mutex held */ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, bool force) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; int old_prs, new_prs; rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, cs) { struct cpuset *parent = parent_cs(cp); bool remote = is_remote_partition(cp); bool update_parent = false; old_prs = new_prs = cp->partition_root_state; /* * For child remote partition root (!= cs), we need to call * remote_cpus_update() if effective_xcpus will be changed. * Otherwise, we can skip the whole subtree. * * remote_cpus_update() will reuse tmp->new_cpus only after * its value is being processed. */ if (remote && (cp != cs)) { compute_excpus(cp, tmp->new_cpus); if (cpumask_equal(cp->effective_xcpus, tmp->new_cpus)) { pos_css = css_rightmost_descendant(pos_css); continue; } rcu_read_unlock(); remote_cpus_update(cp, NULL, tmp->new_cpus, tmp); rcu_read_lock(); /* Remote partition may be invalidated */ new_prs = cp->partition_root_state; remote = (new_prs == old_prs); } if (remote || (is_partition_valid(parent) && is_partition_valid(cp))) compute_partition_effective_cpumask(cp, tmp->new_cpus); else compute_effective_cpumask(tmp->new_cpus, cp, parent); if (remote) goto get_css; /* Ready to update cpuset data */ /* * A partition with no effective_cpus is allowed as long as * there is no task associated with it. Call * update_parent_effective_cpumask() to check it. */ if (is_partition_valid(cp) && cpumask_empty(tmp->new_cpus)) { update_parent = true; goto update_parent_effective; } /* * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some CPUs unless * it is a partition root that has explicitly distributed * out all its CPUs. */ if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus)) cpumask_copy(tmp->new_cpus, parent->effective_cpus); /* * Skip the whole subtree if * 1) the cpumask remains the same, * 2) has no partition root state, * 3) force flag not set, and * 4) for v2 load balance state same as its parent. */ if (!cp->partition_root_state && !force && cpumask_equal(tmp->new_cpus, cp->effective_cpus) && (!cpuset_v2() || (is_sched_load_balance(parent) == is_sched_load_balance(cp)))) { pos_css = css_rightmost_descendant(pos_css); continue; } update_parent_effective: /* * update_parent_effective_cpumask() should have been called * for cs already in update_cpumask(). We should also call * cpuset_update_tasks_cpumask() again for tasks in the parent * cpuset if the parent's effective_cpus changes. */ if ((cp != cs) && old_prs) { switch (parent->partition_root_state) { case PRS_ROOT: case PRS_ISOLATED: update_parent = true; break; default: /* * When parent is not a partition root or is * invalid, child partition roots become * invalid too. */ if (is_partition_valid(cp)) new_prs = -cp->partition_root_state; WRITE_ONCE(cp->prs_err, is_partition_invalid(parent) ? PERR_INVPARENT : PERR_NOTPART); break; } } get_css: if (!css_tryget_online(&cp->css)) continue; rcu_read_unlock(); if (update_parent) { update_parent_effective_cpumask(cp, partcmd_update, NULL, tmp); /* * The cpuset partition_root_state may become * invalid. Capture it. */ new_prs = cp->partition_root_state; } spin_lock_irq(&callback_lock); cpumask_copy(cp->effective_cpus, tmp->new_cpus); cp->partition_root_state = new_prs; /* * Need to compute effective_xcpus if either exclusive_cpus * is non-empty or it is a valid partition root. */ if ((new_prs > 0) || !cpumask_empty(cp->exclusive_cpus)) compute_excpus(cp, cp->effective_xcpus); if (new_prs <= 0) reset_partition_data(cp); spin_unlock_irq(&callback_lock); notify_partition_change(cp, old_prs); WARN_ON(!is_in_v2_mode() && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); cpuset_update_tasks_cpumask(cp, tmp->new_cpus); /* * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE * from parent if current cpuset isn't a valid partition root * and their load balance states differ. */ if (cpuset_v2() && !is_partition_valid(cp) && (is_sched_load_balance(parent) != is_sched_load_balance(cp))) { if (is_sched_load_balance(parent)) set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags); else clear_bit(CS_SCHED_LOAD_BALANCE, &cp->flags); } /* * On legacy hierarchy, if the effective cpumask of any non- * empty cpuset is changed, we need to rebuild sched domains. * On default hierarchy, the cpuset needs to be a partition * root as well. */ if (!cpumask_empty(cp->cpus_allowed) && is_sched_load_balance(cp) && (!cpuset_v2() || is_partition_valid(cp))) cpuset_force_rebuild(); rcu_read_lock(); css_put(&cp->css); } rcu_read_unlock(); } /** * update_sibling_cpumasks - Update siblings cpumasks * @parent: Parent cpuset * @cs: Current cpuset * @tmp: Temp variables */ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, struct tmpmasks *tmp) { struct cpuset *sibling; struct cgroup_subsys_state *pos_css; lockdep_assert_cpuset_lock_held(); /* * Check all its siblings and call update_cpumasks_hier() * if their effective_cpus will need to be changed. * * It is possible a change in parent's effective_cpus * due to a change in a child partition's effective_xcpus will impact * its siblings even if they do not inherit parent's effective_cpus * directly. It should not impact valid partition. * * The update_cpumasks_hier() function may sleep. So we have to * release the RCU read lock before calling it. */ rcu_read_lock(); cpuset_for_each_child(sibling, pos_css, parent) { if (sibling == cs || is_partition_valid(sibling)) continue; compute_effective_cpumask(tmp->new_cpus, sibling, parent); if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus)) continue; if (!css_tryget_online(&sibling->css)) continue; rcu_read_unlock(); update_cpumasks_hier(sibling, tmp, false); rcu_read_lock(); css_put(&sibling->css); } rcu_read_unlock(); } static int parse_cpuset_cpulist(const char *buf, struct cpumask *out_mask) { int retval; retval = cpulist_parse(buf, out_mask); if (retval < 0) return retval; if (!cpumask_subset(out_mask, top_cpuset.cpus_allowed)) return -EINVAL; return 0; } /** * validate_partition - Validate a cpuset partition configuration * @cs: The cpuset to validate * @trialcs: The trial cpuset containing proposed configuration changes * * If any validation check fails, the appropriate error code is set in the * cpuset's prs_err field. * * Return: PRS error code (0 if valid, non-zero error code if invalid) */ static enum prs_errcode validate_partition(struct cpuset *cs, struct cpuset *trialcs) { struct cpuset *parent = parent_cs(cs); if (cs_is_member(trialcs)) return PERR_NONE; if (cpumask_empty(trialcs->effective_xcpus)) return PERR_INVCPUS; if (prstate_housekeeping_conflict(trialcs->partition_root_state, trialcs->effective_xcpus)) return PERR_HKEEPING; if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) return PERR_NOCPUS; return PERR_NONE; } /** * partition_cpus_change - Handle partition state changes due to CPU mask updates * @cs: The target cpuset being modified * @trialcs: The trial cpuset containing proposed configuration changes * @tmp: Temporary masks for intermediate calculations * * This function handles partition state transitions triggered by CPU mask changes. * CPU modifications may cause a partition to be disabled or require state updates. */ static void partition_cpus_change(struct cpuset *cs, struct cpuset *trialcs, struct tmpmasks *tmp) { enum prs_errcode prs_err; if (cs_is_member(cs)) return; prs_err = validate_partition(cs, trialcs); if (prs_err) trialcs->prs_err = cs->prs_err = prs_err; if (is_remote_partition(cs)) { if (trialcs->prs_err) remote_partition_disable(cs, tmp); else remote_cpus_update(cs, trialcs->exclusive_cpus, trialcs->effective_xcpus, tmp); } else { if (trialcs->prs_err) update_parent_effective_cpumask(cs, partcmd_invalidate, NULL, tmp); else update_parent_effective_cpumask(cs, partcmd_update, trialcs->effective_xcpus, tmp); } } /** * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it * @cs: the cpuset to consider * @trialcs: trial cpuset * @buf: buffer of cpu numbers written to this cpuset */ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, const char *buf) { int retval; struct tmpmasks tmp; bool force = false; int old_prs = cs->partition_root_state; retval = parse_cpuset_cpulist(buf, trialcs->cpus_allowed); if (retval < 0) return retval; /* Nothing to do if the cpus didn't change */ if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) return 0; compute_trialcs_excpus(trialcs, cs); trialcs->prs_err = PERR_NONE; retval = validate_change(cs, trialcs); if (retval < 0) return retval; if (alloc_tmpmasks(&tmp)) return -ENOMEM; /* * Check all the descendants in update_cpumasks_hier() if * effective_xcpus is to be changed. */ force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus); partition_cpus_change(cs, trialcs, &tmp); spin_lock_irq(&callback_lock); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus); if ((old_prs > 0) && !is_partition_valid(cs)) reset_partition_data(cs); spin_unlock_irq(&callback_lock); /* effective_cpus/effective_xcpus will be updated here */ update_cpumasks_hier(cs, &tmp, force); /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ if (cs->partition_root_state) update_partition_sd_lb(cs, old_prs); free_tmpmasks(&tmp); return retval; } /** * update_exclusive_cpumask - update the exclusive_cpus mask of a cpuset * @cs: the cpuset to consider * @trialcs: trial cpuset * @buf: buffer of cpu numbers written to this cpuset * * The tasks' cpumask will be updated if cs is a valid partition root. */ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, const char *buf) { int retval; struct tmpmasks tmp; bool force = false; int old_prs = cs->partition_root_state; retval = parse_cpuset_cpulist(buf, trialcs->exclusive_cpus); if (retval < 0) return retval; /* Nothing to do if the CPUs didn't change */ if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus)) return 0; /* * Reject the change if there is exclusive CPUs conflict with * the siblings. */ if (compute_trialcs_excpus(trialcs, cs)) return -EINVAL; /* * Check all the descendants in update_cpumasks_hier() if * effective_xcpus is to be changed. */ force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus); retval = validate_change(cs, trialcs); if (retval) return retval; if (alloc_tmpmasks(&tmp)) return -ENOMEM; trialcs->prs_err = PERR_NONE; partition_cpus_change(cs, trialcs, &tmp); spin_lock_irq(&callback_lock); cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus); cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus); if ((old_prs > 0) && !is_partition_valid(cs)) reset_partition_data(cs); spin_unlock_irq(&callback_lock); /* * Call update_cpumasks_hier() to update effective_cpus/effective_xcpus * of the subtree when it is a valid partition root or effective_xcpus * is updated. */ if (is_partition_valid(cs) || force) update_cpumasks_hier(cs, &tmp, force); /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ if (cs->partition_root_state) update_partition_sd_lb(cs, old_prs); free_tmpmasks(&tmp); return 0; } /* * Migrate memory region from one set of nodes to another. This is * performed asynchronously as it can be called from process migration path * holding locks involved in process management. All mm migrations are * performed in the queued order and can be waited for by flushing * cpuset_migrate_mm_wq. */ struct cpuset_migrate_mm_work { struct work_struct work; struct mm_struct *mm; nodemask_t from; nodemask_t to; }; static void cpuset_migrate_mm_workfn(struct work_struct *work) { struct cpuset_migrate_mm_work *mwork = container_of(work, struct cpuset_migrate_mm_work, work); /* on a wq worker, no need to worry about %current's mems_allowed */ do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL); mmput(mwork->mm); kfree(mwork); } static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to) { struct cpuset_migrate_mm_work *mwork; if (nodes_equal(*from, *to)) { mmput(mm); return; } mwork = kzalloc_obj(*mwork); if (mwork) { mwork->mm = mm; mwork->from = *from; mwork->to = *to; INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn); queue_work(cpuset_migrate_mm_wq, &mwork->work); } else { mmput(mm); } } static void flush_migrate_mm_task_workfn(struct callback_head *head) { flush_workqueue(cpuset_migrate_mm_wq); kfree(head); } static void schedule_flush_migrate_mm(void) { struct callback_head *flush_cb; flush_cb = kzalloc_obj(struct callback_head); if (!flush_cb) return; init_task_work(flush_cb, flush_migrate_mm_task_workfn); if (task_work_add(current, flush_cb, TWA_RESUME)) kfree(flush_cb); } /* * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy * @tsk: the task to change * @newmems: new nodes that the task will be set * * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed * and rebind an eventual tasks' mempolicy. If the task is allocating in * parallel, it might temporarily see an empty intersection, which results in * a seqlock check and retry before OOM or allocation failure. */ static void cpuset_change_task_nodemask(struct task_struct *tsk, nodemask_t *newmems) { task_lock(tsk); local_irq_disable(); write_seqcount_begin(&tsk->mems_allowed_seq); nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); mpol_rebind_task(tsk, newmems); tsk->mems_allowed = *newmems; write_seqcount_end(&tsk->mems_allowed_seq); local_irq_enable(); task_unlock(tsk); } static void *cpuset_being_rebound; /** * cpuset_update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. * @cs: the cpuset in which each task's mems_allowed mask needs to be changed * * Iterate through each task of @cs updating its mems_allowed to the * effective cpuset's. As this function is called with cpuset_mutex held, * cpuset membership stays stable. */ void cpuset_update_tasks_nodemask(struct cpuset *cs) { static nodemask_t newmems; /* protected by cpuset_mutex */ struct css_task_iter it; struct task_struct *task; cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ guarantee_online_mems(cs, &newmems); /* * The mpol_rebind_mm() call takes mmap_lock, which we couldn't * take while holding tasklist_lock. Forks can happen - the * mpol_dup() cpuset_being_rebound check will catch such forks, * and rebind their vma mempolicies too. Because we still hold * the global cpuset_mutex, we know that no other rebind effort * will be contending for the global variable cpuset_being_rebound. * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. */ css_task_iter_start(&cs->css, 0, &it); while ((task = css_task_iter_next(&it))) { struct mm_struct *mm; bool migrate; cpuset_change_task_nodemask(task, &newmems); mm = get_task_mm(task); if (!mm) continue; migrate = is_memory_migrate(cs); mpol_rebind_mm(mm, &cs->mems_allowed); if (migrate) cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems); else mmput(mm); } css_task_iter_end(&it); /* * All the tasks' nodemasks have been updated, update * cs->old_mems_allowed. */ cs->old_mems_allowed = newmems; /* We're done rebinding vmas to this cpuset's new mems_allowed. */ cpuset_being_rebound = NULL; } /* * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree * @cs: the cpuset to consider * @new_mems: a temp variable for calculating new effective_mems * * When configured nodemask is changed, the effective nodemasks of this cpuset * and all its descendants need to be updated. * * On legacy hierarchy, effective_mems will be the same with mems_allowed. * * Called with cpuset_mutex held */ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, cs) { struct cpuset *parent = parent_cs(cp); bool has_mems = nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems); /* * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some MEMs. */ if (is_in_v2_mode() && !has_mems) *new_mems = parent->effective_mems; /* Skip the whole subtree if the nodemask remains the same. */ if (nodes_equal(*new_mems, cp->effective_mems)) { pos_css = css_rightmost_descendant(pos_css); continue; } if (!css_tryget_online(&cp->css)) continue; rcu_read_unlock(); spin_lock_irq(&callback_lock); cp->effective_mems = *new_mems; spin_unlock_irq(&callback_lock); WARN_ON(!is_in_v2_mode() && !nodes_equal(cp->mems_allowed, cp->effective_mems)); cpuset_update_tasks_nodemask(cp); rcu_read_lock(); css_put(&cp->css); } rcu_read_unlock(); } /* * Handle user request to change the 'mems' memory placement * of a cpuset. Needs to validate the request, update the * cpusets mems_allowed, and for each task in the cpuset, * update mems_allowed and rebind task's mempolicy and any vma * mempolicies and if the cpuset is marked 'memory_migrate', * migrate the tasks pages to the new memory. * * Call with cpuset_mutex held. May take callback_lock during call. * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, * lock each such tasks mm->mmap_lock, scan its vma's and rebind * their mempolicies to the cpusets new mems_allowed. */ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, const char *buf) { int retval; /* * An empty mems_allowed is ok iff there are no tasks in the cpuset. * The validate_change() call ensures that cpusets with tasks have memory. */ retval = nodelist_parse(buf, trialcs->mems_allowed); if (retval < 0) return retval; if (!nodes_subset(trialcs->mems_allowed, top_cpuset.mems_allowed)) return -EINVAL; /* No change? nothing to do */ if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) return 0; retval = validate_change(cs, trialcs); if (retval < 0) return retval; check_insane_mems_config(&trialcs->mems_allowed); spin_lock_irq(&callback_lock); cs->mems_allowed = trialcs->mems_allowed; spin_unlock_irq(&callback_lock); /* use trialcs->mems_allowed as a temp variable */ update_nodemasks_hier(cs, &trialcs->mems_allowed); return 0; } bool current_cpuset_is_being_rebound(void) { bool ret; rcu_read_lock(); ret = task_cs(current) == cpuset_being_rebound; rcu_read_unlock(); return ret; } /* * cpuset_update_flag - read a 0 or a 1 in a file and update associated flag * bit: the bit to update (see cpuset_flagbits_t) * cs: the cpuset to update * turning_on: whether the flag is being set or cleared * * Call with cpuset_mutex held. */ int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on) { struct cpuset *trialcs; int balance_flag_changed; int spread_flag_changed; int err; trialcs = dup_or_alloc_cpuset(cs); if (!trialcs) return -ENOMEM; if (turning_on) set_bit(bit, &trialcs->flags); else clear_bit(bit, &trialcs->flags); err = validate_change(cs, trialcs); if (err < 0) goto out; balance_flag_changed = (is_sched_load_balance(cs) != is_sched_load_balance(trialcs)); spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs)) || (is_spread_page(cs) != is_spread_page(trialcs))); spin_lock_irq(&callback_lock); cs->flags = trialcs->flags; spin_unlock_irq(&callback_lock); if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) { if (cpuset_v2()) cpuset_force_rebuild(); else rebuild_sched_domains_locked(); } if (spread_flag_changed) cpuset1_update_tasks_flags(cs); out: free_cpuset(trialcs); return err; } /** * update_prstate - update partition_root_state * @cs: the cpuset to update * @new_prs: new partition root state * Return: 0 if successful, != 0 if error * * Call with cpuset_mutex held. */ static int update_prstate(struct cpuset *cs, int new_prs) { int err = PERR_NONE, old_prs = cs->partition_root_state; struct cpuset *parent = parent_cs(cs); struct tmpmasks tmpmask; bool isolcpus_updated = false; if (old_prs == new_prs) return 0; /* * Treat a previously invalid partition root as if it is a "member". */ if (new_prs && is_partition_invalid(cs)) old_prs = PRS_MEMBER; if (alloc_tmpmasks(&tmpmask)) return -ENOMEM; err = update_partition_exclusive_flag(cs, new_prs); if (err) goto out; if (!old_prs) { /* * cpus_allowed and exclusive_cpus cannot be both empty. */ if (xcpus_empty(cs)) { err = PERR_CPUSEMPTY; goto out; } /* * We don't support the creation of a new local partition with * a remote partition underneath it. This unsupported * setting can happen only if parent is the top_cpuset because * a remote partition cannot be created underneath an existing * local or remote partition. */ if ((parent == &top_cpuset) && cpumask_intersects(cs->exclusive_cpus, subpartitions_cpus)) { err = PERR_REMOTE; goto out; } /* * If parent is valid partition, enable local partiion. * Otherwise, enable a remote partition. */ if (is_partition_valid(parent)) { enum partition_cmd cmd = (new_prs == PRS_ROOT) ? partcmd_enable : partcmd_enablei; err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask); } else { err = remote_partition_enable(cs, new_prs, &tmpmask); } } else if (old_prs && new_prs) { /* * A change in load balance state only, no change in cpumasks. * Need to update isolated_cpus. */ if (((new_prs == PRS_ISOLATED) && !isolated_cpus_can_update(cs->effective_xcpus, NULL)) || prstate_housekeeping_conflict(new_prs, cs->effective_xcpus)) err = PERR_HKEEPING; else isolcpus_updated = true; } else { /* * Switching back to member is always allowed even if it * disables child partitions. */ if (is_remote_partition(cs)) remote_partition_disable(cs, &tmpmask); else update_parent_effective_cpumask(cs, partcmd_disable, NULL, &tmpmask); /* * Invalidation of child partitions will be done in * update_cpumasks_hier(). */ } out: /* * Make partition invalid & disable CS_CPU_EXCLUSIVE if an error * happens. */ if (err) { new_prs = -new_prs; update_partition_exclusive_flag(cs, new_prs); } spin_lock_irq(&callback_lock); cs->partition_root_state = new_prs; WRITE_ONCE(cs->prs_err, err); if (!is_partition_valid(cs)) reset_partition_data(cs); else if (isolcpus_updated) isolated_cpus_update(old_prs, new_prs, cs->effective_xcpus); spin_unlock_irq(&callback_lock); /* Force update if switching back to member & update effective_xcpus */ update_cpumasks_hier(cs, &tmpmask, !new_prs); /* A newly created partition must have effective_xcpus set */ WARN_ON_ONCE(!old_prs && (new_prs > 0) && cpumask_empty(cs->effective_xcpus)); /* Update sched domains and load balance flag */ update_partition_sd_lb(cs, old_prs); notify_partition_change(cs, old_prs); if (force_sd_rebuild) rebuild_sched_domains_locked(); free_tmpmasks(&tmpmask); return 0; } static struct cpuset *cpuset_attach_old_cs; /* * Check to see if a cpuset can accept a new task * For v1, cpus_allowed and mems_allowed can't be empty. * For v2, effective_cpus can't be empty. * Note that in v1, effective_cpus = cpus_allowed. */ static int cpuset_can_attach_check(struct cpuset *cs) { if (cpumask_empty(cs->effective_cpus) || (!is_in_v2_mode() && nodes_empty(cs->mems_allowed))) return -ENOSPC; return 0; } static void reset_migrate_dl_data(struct cpuset *cs) { cs->nr_migrate_dl_tasks = 0; cs->sum_migrate_dl_bw = 0; cs->dl_bw_cpu = -1; } /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ static int cpuset_can_attach(struct cgroup_taskset *tset) { struct cgroup_subsys_state *css; struct cpuset *cs, *oldcs; struct task_struct *task; bool setsched_check; int ret; /* used later by cpuset_attach() */ cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); oldcs = cpuset_attach_old_cs; cs = css_cs(css); mutex_lock(&cpuset_mutex); /* Check to see if task is allowed in the cpuset */ ret = cpuset_can_attach_check(cs); if (ret) goto out_unlock; /* * Skip rights over task setsched check in v2 when nothing changes, * migration permission derives from hierarchy ownership in * cgroup_procs_write_permission()). */ setsched_check = !cpuset_v2() || !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus) || !nodes_equal(cs->effective_mems, oldcs->effective_mems); /* * A v1 cpuset with tasks will have no CPU left only when CPU hotplug * brings the last online CPU offline as users are not allowed to empty * cpuset.cpus when there are active tasks inside. When that happens, * we should allow tasks to migrate out without security check to make * sure they will be able to run after migration. */ if (!is_in_v2_mode() && cpumask_empty(oldcs->effective_cpus)) setsched_check = false; cgroup_taskset_for_each(task, css, tset) { ret = task_can_attach(task); if (ret) goto out_unlock; if (setsched_check) { ret = security_task_setscheduler(task); if (ret) goto out_unlock; } if (dl_task(task)) { cs->nr_migrate_dl_tasks++; cs->sum_migrate_dl_bw += task->dl.dl_bw; } } if (!cs->nr_migrate_dl_tasks) goto out_success; if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) { int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus); if (unlikely(cpu >= nr_cpu_ids)) { reset_migrate_dl_data(cs); ret = -EINVAL; goto out_unlock; } ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw); if (ret) { reset_migrate_dl_data(cs); goto out_unlock; } cs->dl_bw_cpu = cpu; } out_success: /* * Mark attach is in progress. This makes validate_change() fail * changes which zero cpus/mems_allowed. */ cs->attach_in_progress++; out_unlock: mutex_unlock(&cpuset_mutex); return ret; } static void cpuset_cancel_attach(struct cgroup_taskset *tset) { struct cgroup_subsys_state *css; struct cpuset *cs; cgroup_taskset_first(tset, &css); cs = css_cs(css); mutex_lock(&cpuset_mutex); dec_attach_in_progress_locked(cs); if (cs->dl_bw_cpu >= 0) dl_bw_free(cs->dl_bw_cpu, cs->sum_migrate_dl_bw); if (cs->nr_migrate_dl_tasks) reset_migrate_dl_data(cs); mutex_unlock(&cpuset_mutex); } /* * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach_task() * but we can't allocate it dynamically there. Define it global and * allocate from cpuset_init(). */ static cpumask_var_t cpus_attach; static nodemask_t cpuset_attach_nodemask_to; static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) { lockdep_assert_cpuset_lock_held(); if (cs != &top_cpuset) guarantee_active_cpus(task, cpus_attach); else cpumask_andnot(cpus_attach, task_cpu_possible_mask(task), subpartitions_cpus); /* * can_attach beforehand should guarantee that this doesn't * fail. TODO: have a better way to handle failure here */ WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); cpuset1_update_task_spread_flags(cs, task); } static void cpuset_attach(struct cgroup_taskset *tset) { struct task_struct *task; struct task_struct *leader; struct cgroup_subsys_state *css; struct cpuset *cs; struct cpuset *oldcs = cpuset_attach_old_cs; bool cpus_updated, mems_updated; bool queue_task_work = false; cgroup_taskset_first(tset, &css); cs = css_cs(css); lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */ mutex_lock(&cpuset_mutex); cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus); mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems); /* * In the default hierarchy, enabling cpuset in the child cgroups * will trigger a number of cpuset_attach() calls with no change * in effective cpus and mems. In that case, we can optimize out * by skipping the task iteration and update. */ if (cpuset_v2() && !cpus_updated && !mems_updated) { cpuset_attach_nodemask_to = cs->effective_mems; goto out; } guarantee_online_mems(cs, &cpuset_attach_nodemask_to); cgroup_taskset_for_each(task, css, tset) cpuset_attach_task(cs, task); /* * Change mm for all threadgroup leaders. This is expensive and may * sleep and should be moved outside migration path proper. Skip it * if there is no change in effective_mems and CS_MEMORY_MIGRATE is * not set. */ cpuset_attach_nodemask_to = cs->effective_mems; if (!is_memory_migrate(cs) && !mems_updated) goto out; cgroup_taskset_for_each_leader(leader, css, tset) { struct mm_struct *mm = get_task_mm(leader); if (mm) { mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); /* * old_mems_allowed is the same with mems_allowed * here, except if this task is being moved * automatically due to hotplug. In that case * @mems_allowed has been updated and is empty, so * @old_mems_allowed is the right nodesets that we * migrate mm from. */ if (is_memory_migrate(cs)) { cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, &cpuset_attach_nodemask_to); queue_task_work = true; } else mmput(mm); } } out: if (queue_task_work) schedule_flush_migrate_mm(); cs->old_mems_allowed = cpuset_attach_nodemask_to; if (cs->nr_migrate_dl_tasks) { cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks; oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks; reset_migrate_dl_data(cs); } dec_attach_in_progress_locked(cs); mutex_unlock(&cpuset_mutex); } /* * Common handling for a write to a "cpus" or "mems" file. */ ssize_t cpuset_write_resmask(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cpuset *cs = css_cs(of_css(of)); struct cpuset *trialcs; int retval = -ENODEV; /* root is read-only */ if (cs == &top_cpuset) return -EACCES; buf = strstrip(buf); cpuset_full_lock(); if (!is_cpuset_online(cs)) goto out_unlock; trialcs = dup_or_alloc_cpuset(cs); if (!trialcs) { retval = -ENOMEM; goto out_unlock; } switch (of_cft(of)->private) { case FILE_CPULIST: retval = update_cpumask(cs, trialcs, buf); break; case FILE_EXCLUSIVE_CPULIST: retval = update_exclusive_cpumask(cs, trialcs, buf); break; case FILE_MEMLIST: retval = update_nodemask(cs, trialcs, buf); break; default: retval = -EINVAL; break; } free_cpuset(trialcs); out_unlock: cpuset_update_sd_hk_unlock(); if (of_cft(of)->private == FILE_MEMLIST) schedule_flush_migrate_mm(); return retval ?: nbytes; } /* * These ascii lists should be read in a single call, by using a user * buffer large enough to hold the entire map. If read in smaller * chunks, there is no guarantee of atomicity. Since the display format * used, list of ranges of sequential numbers, is variable length, * and since these maps can change value dynamically, one could read * gibberish by doing partial reads while a list was changing. */ int cpuset_common_seq_show(struct seq_file *sf, void *v) { struct cpuset *cs = css_cs(seq_css(sf)); cpuset_filetype_t type = seq_cft(sf)->private; int ret = 0; spin_lock_irq(&callback_lock); switch (type) { case FILE_CPULIST: seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed)); break; case FILE_MEMLIST: seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed)); break; case FILE_EFFECTIVE_CPULIST: seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus)); break; case FILE_EFFECTIVE_MEMLIST: seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems)); break; case FILE_EXCLUSIVE_CPULIST: seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->exclusive_cpus)); break; case FILE_EFFECTIVE_XCPULIST: seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_xcpus)); break; case FILE_SUBPARTS_CPULIST: seq_printf(sf, "%*pbl\n", cpumask_pr_args(subpartitions_cpus)); break; case FILE_ISOLATED_CPULIST: seq_printf(sf, "%*pbl\n", cpumask_pr_args(isolated_cpus)); break; default: ret = -EINVAL; } spin_unlock_irq(&callback_lock); return ret; } static int cpuset_partition_show(struct seq_file *seq, void *v) { struct cpuset *cs = css_cs(seq_css(seq)); const char *err, *type = NULL; switch (cs->partition_root_state) { case PRS_ROOT: seq_puts(seq, "root\n"); break; case PRS_ISOLATED: seq_puts(seq, "isolated\n"); break; case PRS_MEMBER: seq_puts(seq, "member\n"); break; case PRS_INVALID_ROOT: type = "root"; fallthrough; case PRS_INVALID_ISOLATED: if (!type) type = "isolated"; err = perr_strings[READ_ONCE(cs->prs_err)]; if (err) seq_printf(seq, "%s invalid (%s)\n", type, err); else seq_printf(seq, "%s invalid\n", type); break; } return 0; } static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cpuset *cs = css_cs(of_css(of)); int val; int retval = -ENODEV; buf = strstrip(buf); if (!strcmp(buf, "root")) val = PRS_ROOT; else if (!strcmp(buf, "member")) val = PRS_MEMBER; else if (!strcmp(buf, "isolated")) val = PRS_ISOLATED; else return -EINVAL; cpuset_full_lock(); if (is_cpuset_online(cs)) retval = update_prstate(cs, val); cpuset_update_sd_hk_unlock(); return retval ?: nbytes; } /* * This is currently a minimal set for the default hierarchy. It can be * expanded later on by migrating more features and control files from v1. */ static struct cftype dfl_files[] = { { .name = "cpus", .seq_show = cpuset_common_seq_show, .write = cpuset_write_resmask, .max_write_len = (100U + 6 * NR_CPUS), .private = FILE_CPULIST, .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "mems", .seq_show = cpuset_common_seq_show, .write = cpuset_write_resmask, .max_write_len = (100U + 6 * MAX_NUMNODES), .private = FILE_MEMLIST, .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "cpus.effective", .seq_show = cpuset_common_seq_show, .private = FILE_EFFECTIVE_CPULIST, }, { .name = "mems.effective", .seq_show = cpuset_common_seq_show, .private = FILE_EFFECTIVE_MEMLIST, }, { .name = "cpus.partition", .seq_show = cpuset_partition_show, .write = cpuset_partition_write, .private = FILE_PARTITION_ROOT, .flags = CFTYPE_NOT_ON_ROOT, .file_offset = offsetof(struct cpuset, partition_file), }, { .name = "cpus.exclusive", .seq_show = cpuset_common_seq_show, .write = cpuset_write_resmask, .max_write_len = (100U + 6 * NR_CPUS), .private = FILE_EXCLUSIVE_CPULIST, .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "cpus.exclusive.effective", .seq_show = cpuset_common_seq_show, .private = FILE_EFFECTIVE_XCPULIST, .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "cpus.subpartitions", .seq_show = cpuset_common_seq_show, .private = FILE_SUBPARTS_CPULIST, .flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG, }, { .name = "cpus.isolated", .seq_show = cpuset_common_seq_show, .private = FILE_ISOLATED_CPULIST, .flags = CFTYPE_ONLY_ON_ROOT, }, { } /* terminate */ }; /** * cpuset_css_alloc - Allocate a cpuset css * @parent_css: Parent css of the control group that the new cpuset will be * part of * Return: cpuset css on success, -ENOMEM on failure. * * Allocate and initialize a new cpuset css, for non-NULL @parent_css, return * top cpuset css otherwise. */ static struct cgroup_subsys_state * cpuset_css_alloc(struct cgroup_subsys_state *parent_css) { struct cpuset *cs; if (!parent_css) return &top_cpuset.css; cs = dup_or_alloc_cpuset(NULL); if (!cs) return ERR_PTR(-ENOMEM); __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cpuset1_init(cs); /* Set CS_MEMORY_MIGRATE for default hierarchy */ if (cpuset_v2()) __set_bit(CS_MEMORY_MIGRATE, &cs->flags); return &cs->css; } static int cpuset_css_online(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); struct cpuset *parent = parent_cs(cs); if (!parent) return 0; cpuset_full_lock(); /* * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated */ if (cpuset_v2() && !is_sched_load_balance(parent)) clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cpuset_inc(); spin_lock_irq(&callback_lock); if (is_in_v2_mode()) { cpumask_copy(cs->effective_cpus, parent->effective_cpus); cs->effective_mems = parent->effective_mems; } spin_unlock_irq(&callback_lock); cpuset1_online_css(css); cpuset_full_unlock(); return 0; } /* * If the cpuset being removed has its flag 'sched_load_balance' * enabled, then simulate turning sched_load_balance off, which * will call rebuild_sched_domains_locked(). That is not needed * in the default hierarchy where only changes in partition * will cause repartitioning. */ static void cpuset_css_offline(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); cpuset_full_lock(); if (!cpuset_v2() && is_sched_load_balance(cs)) cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); cpuset_dec(); cpuset_full_unlock(); } /* * If a dying cpuset has the 'cpus.partition' enabled, turn it off by * changing it back to member to free its exclusive CPUs back to the pool to * be used by other online cpusets. */ static void cpuset_css_killed(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); cpuset_full_lock(); /* Reset valid partition back to member */ if (is_partition_valid(cs)) update_prstate(cs, PRS_MEMBER); cpuset_update_sd_hk_unlock(); } static void cpuset_css_free(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); free_cpuset(cs); } static void cpuset_bind(struct cgroup_subsys_state *root_css) { mutex_lock(&cpuset_mutex); spin_lock_irq(&callback_lock); if (is_in_v2_mode()) { cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); cpumask_copy(top_cpuset.effective_xcpus, cpu_possible_mask); top_cpuset.mems_allowed = node_possible_map; } else { cpumask_copy(top_cpuset.cpus_allowed, top_cpuset.effective_cpus); top_cpuset.mems_allowed = top_cpuset.effective_mems; } spin_unlock_irq(&callback_lock); mutex_unlock(&cpuset_mutex); } /* * In case the child is cloned into a cpuset different from its parent, * additional checks are done to see if the move is allowed. */ static int cpuset_can_fork(struct task_struct *task, struct css_set *cset) { struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]); bool same_cs; int ret; rcu_read_lock(); same_cs = (cs == task_cs(current)); rcu_read_unlock(); if (same_cs) return 0; lockdep_assert_held(&cgroup_mutex); mutex_lock(&cpuset_mutex); /* Check to see if task is allowed in the cpuset */ ret = cpuset_can_attach_check(cs); if (ret) goto out_unlock; ret = task_can_attach(task); if (ret) goto out_unlock; ret = security_task_setscheduler(task); if (ret) goto out_unlock; /* * Mark attach is in progress. This makes validate_change() fail * changes which zero cpus/mems_allowed. */ cs->attach_in_progress++; out_unlock: mutex_unlock(&cpuset_mutex); return ret; } static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset) { struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]); bool same_cs; rcu_read_lock(); same_cs = (cs == task_cs(current)); rcu_read_unlock(); if (same_cs) return; dec_attach_in_progress(cs); } /* * Make sure the new task conform to the current state of its parent, * which could have been changed by cpuset just after it inherits the * state from the parent and before it sits on the cgroup's task list. */ static void cpuset_fork(struct task_struct *task) { struct cpuset *cs; bool same_cs; rcu_read_lock(); cs = task_cs(task); same_cs = (cs == task_cs(current)); rcu_read_unlock(); if (same_cs) { if (cs == &top_cpuset) return; set_cpus_allowed_ptr(task, current->cpus_ptr); task->mems_allowed = current->mems_allowed; return; } /* CLONE_INTO_CGROUP */ mutex_lock(&cpuset_mutex); guarantee_online_mems(cs, &cpuset_attach_nodemask_to); cpuset_attach_task(cs, task); dec_attach_in_progress_locked(cs); mutex_unlock(&cpuset_mutex); } struct cgroup_subsys cpuset_cgrp_subsys = { .css_alloc = cpuset_css_alloc, .css_online = cpuset_css_online, .css_offline = cpuset_css_offline, .css_killed = cpuset_css_killed, .css_free = cpuset_css_free, .can_attach = cpuset_can_attach, .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, .bind = cpuset_bind, .can_fork = cpuset_can_fork, .cancel_fork = cpuset_cancel_fork, .fork = cpuset_fork, #ifdef CONFIG_CPUSETS_V1 .legacy_cftypes = cpuset1_files, #endif .dfl_cftypes = dfl_files, .early_init = true, .threaded = true, }; /** * cpuset_init - initialize cpusets at system boot * * Description: Initialize top_cpuset **/ int __init cpuset_init(void) { BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&isolated_hk_cpus, GFP_KERNEL)); cpumask_setall(top_cpuset.cpus_allowed); nodes_setall(top_cpuset.mems_allowed); cpumask_setall(top_cpuset.effective_cpus); cpumask_setall(top_cpuset.effective_xcpus); cpumask_setall(top_cpuset.exclusive_cpus); nodes_setall(top_cpuset.effective_mems); cpuset1_init(&top_cpuset); BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); if (housekeeping_enabled(HK_TYPE_DOMAIN_BOOT)) cpumask_andnot(isolated_cpus, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN_BOOT)); return 0; } static void hotplug_update_tasks(struct cpuset *cs, struct cpumask *new_cpus, nodemask_t *new_mems, bool cpus_updated, bool mems_updated) { /* A partition root is allowed to have empty effective cpus */ if (cpumask_empty(new_cpus) && !is_partition_valid(cs)) cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus); if (nodes_empty(*new_mems)) *new_mems = parent_cs(cs)->effective_mems; spin_lock_irq(&callback_lock); cpumask_copy(cs->effective_cpus, new_cpus); cs->effective_mems = *new_mems; spin_unlock_irq(&callback_lock); if (cpus_updated) cpuset_update_tasks_cpumask(cs, new_cpus); if (mems_updated) cpuset_update_tasks_nodemask(cs); } void cpuset_force_rebuild(void) { force_sd_rebuild = true; } /** * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug * @cs: cpuset in interest * @tmp: the tmpmasks structure pointer * * Compare @cs's cpu and mem masks against top_cpuset and if some have gone * offline, update @cs accordingly. If @cs ends up with no CPU or memory, * all its tasks are moved to the nearest ancestor with both resources. */ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) { static cpumask_t new_cpus; static nodemask_t new_mems; bool cpus_updated; bool mems_updated; bool remote; int partcmd = -1; struct cpuset *parent; retry: wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); mutex_lock(&cpuset_mutex); /* * We have raced with task attaching. We wait until attaching * is finished, so we won't attach a task to an empty cpuset. */ if (cs->attach_in_progress) { mutex_unlock(&cpuset_mutex); goto retry; } parent = parent_cs(cs); compute_effective_cpumask(&new_cpus, cs, parent); nodes_and(new_mems, cs->mems_allowed, parent->effective_mems); if (!tmp || !cs->partition_root_state) goto update_tasks; /* * Compute effective_cpus for valid partition root, may invalidate * child partition roots if necessary. */ remote = is_remote_partition(cs); if (remote || (is_partition_valid(cs) && is_partition_valid(parent))) compute_partition_effective_cpumask(cs, &new_cpus); if (remote && (cpumask_empty(subpartitions_cpus) || (cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)))) { cs->prs_err = PERR_HOTPLUG; remote_partition_disable(cs, tmp); compute_effective_cpumask(&new_cpus, cs, parent); remote = false; } /* * Force the partition to become invalid if either one of * the following conditions hold: * 1) empty effective cpus but not valid empty partition. * 2) parent is invalid or doesn't grant any cpus to child * partitions. * 3) subpartitions_cpus is empty. */ if (is_local_partition(cs) && (!is_partition_valid(parent) || tasks_nocpu_error(parent, cs, &new_cpus) || cpumask_empty(subpartitions_cpus))) partcmd = partcmd_invalidate; /* * On the other hand, an invalid partition root may be transitioned * back to a regular one with a non-empty effective xcpus. */ else if (is_partition_valid(parent) && is_partition_invalid(cs) && !cpumask_empty(cs->effective_xcpus)) partcmd = partcmd_update; if (partcmd >= 0) { update_parent_effective_cpumask(cs, partcmd, NULL, tmp); if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) { compute_partition_effective_cpumask(cs, &new_cpus); cpuset_force_rebuild(); } } update_tasks: cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); mems_updated = !nodes_equal(new_mems, cs->effective_mems); if (!cpus_updated && !mems_updated) goto unlock; /* Hotplug doesn't affect this cpuset */ if (mems_updated) check_insane_mems_config(&new_mems); if (is_in_v2_mode()) hotplug_update_tasks(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); else cpuset1_hotplug_update_tasks(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); unlock: mutex_unlock(&cpuset_mutex); } /** * cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset * * This function is called after either CPU or memory configuration has * changed and updates cpuset accordingly. The top_cpuset is always * synchronized to cpu_active_mask and N_MEMORY, which is necessary in * order to make cpusets transparent (of no affect) on systems that are * actively using CPU hotplug but making no active use of cpusets. * * Non-root cpusets are only affected by offlining. If any CPUs or memory * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on * all descendants. * * Note that CPU offlining during suspend is ignored. We don't modify * cpusets across suspend/resume cycles at all. * * CPU / memory hotplug is handled synchronously. */ static void cpuset_handle_hotplug(void) { static DECLARE_WORK(hk_sd_work, hk_sd_workfn); static cpumask_t new_cpus; static nodemask_t new_mems; bool cpus_updated, mems_updated; bool on_dfl = is_in_v2_mode(); struct tmpmasks tmp, *ptmp = NULL; if (on_dfl && !alloc_tmpmasks(&tmp)) ptmp = &tmp; lockdep_assert_cpus_held(); mutex_lock(&cpuset_mutex); /* fetch the available cpus/mems and find out which changed how */ cpumask_copy(&new_cpus, cpu_active_mask); new_mems = node_states[N_MEMORY]; /* * If subpartitions_cpus is populated, it is likely that the check * below will produce a false positive on cpus_updated when the cpu * list isn't changed. It is extra work, but it is better to be safe. */ cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus) || !cpumask_empty(subpartitions_cpus); mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems); /* For v1, synchronize cpus_allowed to cpu_active_mask */ if (cpus_updated) { cpuset_force_rebuild(); spin_lock_irq(&callback_lock); if (!on_dfl) cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); /* * Make sure that CPUs allocated to child partitions * do not show up in effective_cpus. If no CPU is left, * we clear the subpartitions_cpus & let the child partitions * fight for the CPUs again. */ if (!cpumask_empty(subpartitions_cpus)) { if (cpumask_subset(&new_cpus, subpartitions_cpus)) { cpumask_clear(subpartitions_cpus); } else { cpumask_andnot(&new_cpus, &new_cpus, subpartitions_cpus); } } cpumask_copy(top_cpuset.effective_cpus, &new_cpus); spin_unlock_irq(&callback_lock); /* we don't mess with cpumasks of tasks in top_cpuset */ } /* synchronize mems_allowed to N_MEMORY */ if (mems_updated) { spin_lock_irq(&callback_lock); if (!on_dfl) top_cpuset.mems_allowed = new_mems; top_cpuset.effective_mems = new_mems; spin_unlock_irq(&callback_lock); cpuset_update_tasks_nodemask(&top_cpuset); } mutex_unlock(&cpuset_mutex); /* if cpus or mems changed, we need to propagate to descendants */ if (cpus_updated || mems_updated) { struct cpuset *cs; struct cgroup_subsys_state *pos_css; rcu_read_lock(); cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { if (cs == &top_cpuset || !css_tryget_online(&cs->css)) continue; rcu_read_unlock(); cpuset_hotplug_update_tasks(cs, ptmp); rcu_read_lock(); css_put(&cs->css); } rcu_read_unlock(); } /* * rebuild_sched_domains() will always be called directly if needed * to make sure that newly added or removed CPU will be reflected in * the sched domains. However, if isolated partition invalidation * or recreation is being done (update_housekeeping set), a work item * will be queued to call housekeeping_update() to update the * corresponding housekeeping cpumasks after some slight delay. * * We rely on WORK_STRUCT_PENDING_BIT to not requeue a work item that * is still pending. Before the pending bit is cleared, the work data * is copied out and work item dequeued. So it is possible to queue * the work again before the hk_sd_workfn() is invoked to process the * previously queued work. Since hk_sd_workfn() doesn't use the work * item at all, this is not a problem. */ if (force_sd_rebuild) rebuild_sched_domains_cpuslocked(); if (update_housekeeping) queue_work(system_dfl_wq, &hk_sd_work); free_tmpmasks(ptmp); } void cpuset_update_active_cpus(void) { /* * We're inside cpu hotplug critical region which usually nests * inside cgroup synchronization. Bounce actual hotplug processing * to a work item to avoid reverse locking order. */ cpuset_handle_hotplug(); } /* * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. * Call this routine anytime after node_states[N_MEMORY] changes. * See cpuset_update_active_cpus() for CPU hotplug handling. */ static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) { cpuset_handle_hotplug(); return NOTIFY_OK; } /** * cpuset_init_smp - initialize cpus_allowed * * Description: Finish top cpuset after cpu, node maps are initialized */ void __init cpuset_init_smp(void) { /* * cpus_allowd/mems_allowed set to v2 values in the initial * cpuset_bind() call will be reset to v1 values in another * cpuset_bind() call when v1 cpuset is mounted. */ top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); top_cpuset.effective_mems = node_states[N_MEMORY]; hotplug_node_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI); cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0); BUG_ON(!cpuset_migrate_mm_wq); } /* * Return cpus_allowed mask from a task's cpuset. */ static void __cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask) { struct cpuset *cs; cs = task_cs(tsk); if (cs != &top_cpuset) guarantee_active_cpus(tsk, pmask); /* * Tasks in the top cpuset won't get update to their cpumasks * when a hotplug online/offline event happens. So we include all * offline cpus in the allowed cpu list. */ if ((cs == &top_cpuset) || cpumask_empty(pmask)) { const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); /* * We first exclude cpus allocated to partitions. If there is no * allowable online cpu left, we fall back to all possible cpus. */ cpumask_andnot(pmask, possible_mask, subpartitions_cpus); if (!cpumask_intersects(pmask, cpu_active_mask)) cpumask_copy(pmask, possible_mask); } } /** * cpuset_cpus_allowed_locked - return cpus_allowed mask from a task's cpuset. * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. * @pmask: pointer to struct cpumask variable to receive cpus_allowed set. * * Similir to cpuset_cpus_allowed() except that the caller must have acquired * cpuset_mutex. */ void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask) { lockdep_assert_cpuset_lock_held(); __cpuset_cpus_allowed_locked(tsk, pmask); } /** * cpuset_cpus_allowed - return cpus_allowed mask from a task's cpuset. * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. * @pmask: pointer to struct cpumask variable to receive cpus_allowed set. * * Description: Returns the cpumask_var_t cpus_allowed of the cpuset * attached to the specified @tsk. Guaranteed to return some non-empty * subset of cpu_active_mask, even if this means going outside the * tasks cpuset, except when the task is in the top cpuset. **/ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) { unsigned long flags; spin_lock_irqsave(&callback_lock, flags); __cpuset_cpus_allowed_locked(tsk, pmask); spin_unlock_irqrestore(&callback_lock, flags); } /** * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe. * @tsk: pointer to task_struct with which the scheduler is struggling * * Description: In the case that the scheduler cannot find an allowed cpu in * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy * mode however, this value is the same as task_cs(tsk)->effective_cpus, * which will not contain a sane cpumask during cases such as cpu hotplugging. * This is the absolute last resort for the scheduler and it is only used if * _every_ other avenue has been traveled. * * Returns true if the affinity of @tsk was changed, false otherwise. **/ bool cpuset_cpus_allowed_fallback(struct task_struct *tsk) { const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); const struct cpumask *cs_mask; bool changed = false; rcu_read_lock(); cs_mask = task_cs(tsk)->cpus_allowed; if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) { set_cpus_allowed_force(tsk, cs_mask); changed = true; } rcu_read_unlock(); /* * We own tsk->cpus_allowed, nobody can change it under us. * * But we used cs && cs->cpus_allowed lockless and thus can * race with cgroup_attach_task() or update_cpumask() and get * the wrong tsk->cpus_allowed. However, both cases imply the * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr() * which takes task_rq_lock(). * * If we are called after it dropped the lock we must see all * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary * set any mask even if it is not right from task_cs() pov, * the pending set_cpus_allowed_ptr() will fix things. * * select_fallback_rq() will fix things ups and set cpu_possible_mask * if required. */ return changed; } void __init cpuset_init_current_mems_allowed(void) { nodes_setall(current->mems_allowed); } /** * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset. * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed. * * Description: Returns the nodemask_t mems_allowed of the cpuset * attached to the specified @tsk. Guaranteed to return some non-empty * subset of node_states[N_MEMORY], even if this means going outside the * tasks cpuset. **/ nodemask_t cpuset_mems_allowed(struct task_struct *tsk) { nodemask_t mask; unsigned long flags; spin_lock_irqsave(&callback_lock, flags); guarantee_online_mems(task_cs(tsk), &mask); spin_unlock_irqrestore(&callback_lock, flags); return mask; } /** * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed * @nodemask: the nodemask to be checked * * Are any of the nodes in the nodemask allowed in current->mems_allowed? */ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) { return nodes_intersects(*nodemask, current->mems_allowed); } /* * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or * mem_hardwall ancestor to the specified cpuset. Call holding * callback_lock. If no ancestor is mem_exclusive or mem_hardwall * (an unusual configuration), then returns the root cpuset. */ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) { while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) cs = parent_cs(cs); return cs; } /* * cpuset_current_node_allowed - Can current task allocate on a memory node? * @node: is this an allowed node? * @gfp_mask: memory allocation flags * * If we're in interrupt, yes, we can always allocate. If @node is set in * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this * node is set in the nearest hardwalled cpuset ancestor to current's cpuset, * yes. If current has access to memory reserves as an oom victim, yes. * Otherwise, no. * * GFP_USER allocations are marked with the __GFP_HARDWALL bit, * and do not allow allocations outside the current tasks cpuset * unless the task has been OOM killed. * GFP_KERNEL allocations are not so marked, so can escape to the * nearest enclosing hardwalled ancestor cpuset. * * Scanning up parent cpusets requires callback_lock. The * __alloc_pages() routine only calls here with __GFP_HARDWALL bit * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the * current tasks mems_allowed came up empty on the first pass over * the zonelist. So only GFP_KERNEL allocations, if all nodes in the * cpuset are short of memory, might require taking the callback_lock. * * The first call here from mm/page_alloc:get_page_from_freelist() * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, * so no allocation on a node outside the cpuset is allowed (unless * in interrupt, of course). * * The second pass through get_page_from_freelist() doesn't even call * here for GFP_ATOMIC calls. For those calls, the __alloc_pages() * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set * in alloc_flags. That logic and the checks below have the combined * affect that: * in_interrupt - any node ok (current task context irrelevant) * GFP_ATOMIC - any node ok * tsk_is_oom_victim - any node ok * GFP_KERNEL - any node in enclosing hardwalled cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. */ bool cpuset_current_node_allowed(int node, gfp_t gfp_mask) { struct cpuset *cs; /* current cpuset ancestors */ bool allowed; /* is allocation in zone z allowed? */ unsigned long flags; if (in_interrupt()) return true; if (node_isset(node, current->mems_allowed)) return true; /* * Allow tasks that have access to memory reserves because they have * been OOM killed to get memory anywhere. */ if (unlikely(tsk_is_oom_victim(current))) return true; if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ return false; if (current->flags & PF_EXITING) /* Let dying task have memory */ return true; /* Not hardwall and node outside mems_allowed: scan up cpusets */ spin_lock_irqsave(&callback_lock, flags); cs = nearest_hardwall_ancestor(task_cs(current)); allowed = node_isset(node, cs->mems_allowed); spin_unlock_irqrestore(&callback_lock, flags); return allowed; } /** * cpuset_nodes_allowed - return effective_mems mask from a cgroup cpuset. * @cgroup: pointer to struct cgroup. * @mask: pointer to struct nodemask_t to be returned. * * Returns effective_mems mask from a cgroup cpuset if it is cgroup v2 and * has cpuset subsys. Otherwise, returns node_states[N_MEMORY]. * * This function intentionally avoids taking the cpuset_mutex or callback_lock * when accessing effective_mems. This is because the obtained effective_mems * is stale immediately after the query anyway (e.g., effective_mems is updated * immediately after releasing the lock but before returning). * * As a result, returned @mask may be empty because cs->effective_mems can be * rebound during this call. Besides, nodes in @mask are not guaranteed to be * online due to hot plugins. Callers should check the mask for validity on * return based on its subsequent use. **/ void cpuset_nodes_allowed(struct cgroup *cgroup, nodemask_t *mask) { struct cgroup_subsys_state *css; struct cpuset *cs; /* * In v1, mem_cgroup and cpuset are unlikely in the same hierarchy * and mems_allowed is likely to be empty even if we could get to it, * so return directly to avoid taking a global lock on the empty check. */ if (!cgroup || !cpuset_v2()) { nodes_copy(*mask, node_states[N_MEMORY]); return; } css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys); if (!css) { nodes_copy(*mask, node_states[N_MEMORY]); return; } /* * The reference taken via cgroup_get_e_css is sufficient to * protect css, but it does not imply safe accesses to effective_mems. * * Normally, accessing effective_mems would require the cpuset_mutex * or callback_lock - but the correctness of this information is stale * immediately after the query anyway. We do not acquire the lock * during this process to save lock contention in exchange for racing * against mems_allowed rebinds. */ cs = container_of(css, struct cpuset, css); nodes_copy(*mask, cs->effective_mems); css_put(css); } /** * cpuset_spread_node() - On which node to begin search for a page * @rotor: round robin rotor * * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for * tasks in a cpuset with is_spread_page or is_spread_slab set), * and if the memory allocation used cpuset_mem_spread_node() * to determine on which node to start looking, as it will for * certain page cache or slab cache pages such as used for file * system buffers and inode caches, then instead of starting on the * local node to look for a free page, rather spread the starting * node around the tasks mems_allowed nodes. * * We don't have to worry about the returned node being offline * because "it can't happen", and even if it did, it would be ok. * * The routines calling guarantee_online_mems() are careful to * only set nodes in task->mems_allowed that are online. So it * should not be possible for the following code to return an * offline node. But if it did, that would be ok, as this routine * is not returning the node where the allocation must be, only * the node where the search should start. The zonelist passed to * __alloc_pages() will include all nodes. If the slab allocator * is passed an offline node, it will fall back to the local node. * See kmem_cache_alloc_node(). */ static int cpuset_spread_node(int *rotor) { return *rotor = next_node_in(*rotor, current->mems_allowed); } /** * cpuset_mem_spread_node() - On which node to begin search for a file page */ int cpuset_mem_spread_node(void) { if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE) current->cpuset_mem_spread_rotor = node_random(¤t->mems_allowed); return cpuset_spread_node(¤t->cpuset_mem_spread_rotor); } /** * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's? * @tsk1: pointer to task_struct of some task. * @tsk2: pointer to task_struct of some other task. * * Description: Return true if @tsk1's mems_allowed intersects the * mems_allowed of @tsk2. Used by the OOM killer to determine if * one of the task's memory usage might impact the memory available * to the other. **/ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, const struct task_struct *tsk2) { return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); } /** * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed * * Description: Prints current's name, cpuset name, and cached copy of its * mems_allowed to the kernel log. */ void cpuset_print_current_mems_allowed(void) { struct cgroup *cgrp; rcu_read_lock(); cgrp = task_cs(current)->css.cgroup; pr_cont(",cpuset="); pr_cont_cgroup_name(cgrp); pr_cont(",mems_allowed=%*pbl", nodemask_pr_args(¤t->mems_allowed)); rcu_read_unlock(); } /* Display task mems_allowed in /proc/<pid>/status file. */ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) { seq_printf(m, "Mems_allowed:\t%*pb\n", nodemask_pr_args(&task->mems_allowed)); seq_printf(m, "Mems_allowed_list:\t%*pbl\n", nodemask_pr_args(&task->mems_allowed)); } |
| 5 4 4 4 4 4 8 8 8 8 8 8 8 8 15 13 13 2 2 2 2 2 13 13 13 13 13 2 15 15 15 15 15 15 15 15 14 15 15 15 25 27 23 23 23 23 23 22 23 14 15 27 15 22 21 22 22 22 21 22 22 22 26 26 27 27 13 25 13 12 13 13 20 25 21 21 20 22 4 22 22 22 22 22 21 22 22 22 22 22 22 22 22 22 22 22 14 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 9 9 9 9 9 9 9 9 9 9 9 52 52 52 52 52 4 4 4 4 4 4 4 1 1 22 22 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015-2017 Intel Deutschland GmbH * Copyright (C) 2018-2026 Intel Corporation * * utilities for mac80211 */ #include <net/mac80211.h> #include <linux/netdevice.h> #include <linux/export.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/skbuff.h> #include <linux/etherdevice.h> #include <linux/if_arp.h> #include <linux/bitmap.h> #include <linux/crc32.h> #include <net/net_namespace.h> #include <net/cfg80211.h> #include <net/rtnetlink.h> #include <kunit/visibility.h> #include "ieee80211_i.h" #include "driver-ops.h" #include "rate.h" #include "mesh.h" #include "wme.h" #include "led.h" #include "wep.h" /* privid for wiphys to determine whether they belong to us or not */ const void *const mac80211_wiphy_privid = &mac80211_wiphy_privid; struct ieee80211_hw *wiphy_to_ieee80211_hw(struct wiphy *wiphy) { struct ieee80211_local *local; local = wiphy_priv(wiphy); return &local->hw; } EXPORT_SYMBOL(wiphy_to_ieee80211_hw); const struct ieee80211_conn_settings ieee80211_conn_settings_unlimited = { .mode = IEEE80211_CONN_MODE_EHT, .bw_limit = IEEE80211_CONN_BW_LIMIT_320, }; u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len, enum nl80211_iftype type) { __le16 fc = hdr->frame_control; if (ieee80211_is_data(fc)) { if (len < 24) /* drop incorrect hdr len (data) */ return NULL; if (ieee80211_has_a4(fc)) return NULL; if (ieee80211_has_tods(fc)) return hdr->addr1; if (ieee80211_has_fromds(fc)) return hdr->addr2; return hdr->addr3; } if (ieee80211_is_s1g_beacon(fc)) { struct ieee80211_ext *ext = (void *) hdr; return ext->u.s1g_beacon.sa; } if (ieee80211_is_mgmt(fc)) { if (len < 24) /* drop incorrect hdr len (mgmt) */ return NULL; return hdr->addr3; } if (ieee80211_is_ctl(fc)) { if (ieee80211_is_pspoll(fc)) return hdr->addr1; if (ieee80211_is_back_req(fc)) { switch (type) { case NL80211_IFTYPE_STATION: return hdr->addr2; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: return hdr->addr1; default: break; /* fall through to the return */ } } } return NULL; } void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx) { struct sk_buff *skb; struct ieee80211_hdr *hdr; skb_queue_walk(&tx->skbs, skb) { hdr = (struct ieee80211_hdr *) skb->data; hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); } } int ieee80211_frame_duration(enum nl80211_band band, size_t len, int rate, int erp, int short_preamble) { int dur; /* calculate duration (in microseconds, rounded up to next higher * integer if it includes a fractional microsecond) to send frame of * len bytes (does not include FCS) at the given rate. Duration will * also include SIFS. * * rate is in 100 kbps, so divident is multiplied by 10 in the * DIV_ROUND_UP() operations. */ if (band == NL80211_BAND_5GHZ || erp) { /* * OFDM: * * N_DBPS = DATARATE x 4 * N_SYM = Ceiling((16+8xLENGTH+6) / N_DBPS) * (16 = SIGNAL time, 6 = tail bits) * TXTIME = T_PREAMBLE + T_SIGNAL + T_SYM x N_SYM + Signal Ext * * T_SYM = 4 usec * 802.11a - 18.5.2: aSIFSTime = 16 usec * 802.11g - 19.8.4: aSIFSTime = 10 usec + * signal ext = 6 usec */ dur = 16; /* SIFS + signal ext */ dur += 16; /* IEEE 802.11-2012 18.3.2.4: T_PREAMBLE = 16 usec */ dur += 4; /* IEEE 802.11-2012 18.3.2.4: T_SIGNAL = 4 usec */ /* rates should already consider the channel bandwidth, * don't apply divisor again. */ dur += 4 * DIV_ROUND_UP((16 + 8 * (len + 4) + 6) * 10, 4 * rate); /* T_SYM x N_SYM */ } else { /* * 802.11b or 802.11g with 802.11b compatibility: * 18.3.4: TXTIME = PreambleLength + PLCPHeaderTime + * Ceiling(((LENGTH+PBCC)x8)/DATARATE). PBCC=0. * * 802.11 (DS): 15.3.3, 802.11b: 18.3.4 * aSIFSTime = 10 usec * aPreambleLength = 144 usec or 72 usec with short preamble * aPLCPHeaderLength = 48 usec or 24 usec with short preamble */ dur = 10; /* aSIFSTime = 10 usec */ dur += short_preamble ? (72 + 24) : (144 + 48); dur += DIV_ROUND_UP(8 * (len + 4) * 10, rate); } return dur; } /* Exported duration function for driver use */ __le16 ieee80211_generic_frame_duration(struct ieee80211_hw *hw, struct ieee80211_vif *vif, enum nl80211_band band, size_t frame_len, struct ieee80211_rate *rate) { struct ieee80211_sub_if_data *sdata; u16 dur; int erp; bool short_preamble = false; erp = 0; if (vif) { sdata = vif_to_sdata(vif); short_preamble = sdata->vif.bss_conf.use_short_preamble; if (sdata->deflink.operating_11g_mode) erp = rate->flags & IEEE80211_RATE_ERP_G; } dur = ieee80211_frame_duration(band, frame_len, rate->bitrate, erp, short_preamble); return cpu_to_le16(dur); } EXPORT_SYMBOL(ieee80211_generic_frame_duration); __le16 ieee80211_rts_duration(struct ieee80211_hw *hw, struct ieee80211_vif *vif, size_t frame_len, const struct ieee80211_tx_info *frame_txctl) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_rate *rate; struct ieee80211_sub_if_data *sdata; bool short_preamble; int erp, bitrate; u16 dur; struct ieee80211_supported_band *sband; sband = local->hw.wiphy->bands[frame_txctl->band]; short_preamble = false; rate = &sband->bitrates[frame_txctl->control.rts_cts_rate_idx]; erp = 0; if (vif) { sdata = vif_to_sdata(vif); short_preamble = sdata->vif.bss_conf.use_short_preamble; if (sdata->deflink.operating_11g_mode) erp = rate->flags & IEEE80211_RATE_ERP_G; } bitrate = rate->bitrate; /* CTS duration */ dur = ieee80211_frame_duration(sband->band, 10, bitrate, erp, short_preamble); /* Data frame duration */ dur += ieee80211_frame_duration(sband->band, frame_len, bitrate, erp, short_preamble); /* ACK duration */ dur += ieee80211_frame_duration(sband->band, 10, bitrate, erp, short_preamble); return cpu_to_le16(dur); } EXPORT_SYMBOL(ieee80211_rts_duration); __le16 ieee80211_ctstoself_duration(struct ieee80211_hw *hw, struct ieee80211_vif *vif, size_t frame_len, const struct ieee80211_tx_info *frame_txctl) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_rate *rate; struct ieee80211_sub_if_data *sdata; bool short_preamble; int erp, bitrate; u16 dur; struct ieee80211_supported_band *sband; sband = local->hw.wiphy->bands[frame_txctl->band]; short_preamble = false; rate = &sband->bitrates[frame_txctl->control.rts_cts_rate_idx]; erp = 0; if (vif) { sdata = vif_to_sdata(vif); short_preamble = sdata->vif.bss_conf.use_short_preamble; if (sdata->deflink.operating_11g_mode) erp = rate->flags & IEEE80211_RATE_ERP_G; } bitrate = rate->bitrate; /* Data frame duration */ dur = ieee80211_frame_duration(sband->band, frame_len, bitrate, erp, short_preamble); if (!(frame_txctl->flags & IEEE80211_TX_CTL_NO_ACK)) { /* ACK duration */ dur += ieee80211_frame_duration(sband->band, 10, bitrate, erp, short_preamble); } return cpu_to_le16(dur); } EXPORT_SYMBOL(ieee80211_ctstoself_duration); static void wake_tx_push_queue(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_txq *queue) { struct ieee80211_tx_control control = { .sta = queue->sta, }; struct sk_buff *skb; while (1) { skb = ieee80211_tx_dequeue(&local->hw, queue); if (!skb) break; drv_tx(local, &control, skb); } } /* wake_tx_queue handler for driver not implementing a custom one*/ void ieee80211_handle_wake_tx_queue(struct ieee80211_hw *hw, struct ieee80211_txq *txq) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_sub_if_data *sdata = vif_to_sdata(txq->vif); struct ieee80211_txq *queue; spin_lock(&local->handle_wake_tx_queue_lock); /* Use ieee80211_next_txq() for airtime fairness accounting */ ieee80211_txq_schedule_start(hw, txq->ac); while ((queue = ieee80211_next_txq(hw, txq->ac))) { wake_tx_push_queue(local, sdata, queue); ieee80211_return_txq(hw, queue, false); } ieee80211_txq_schedule_end(hw, txq->ac); spin_unlock(&local->handle_wake_tx_queue_lock); } EXPORT_SYMBOL(ieee80211_handle_wake_tx_queue); static void __ieee80211_wake_txqs(struct ieee80211_sub_if_data *sdata, int ac) { struct ieee80211_local *local = sdata->local; struct ieee80211_vif *vif = &sdata->vif; struct fq *fq = &local->fq; struct ps_data *ps = NULL; struct txq_info *txqi = NULL; struct sta_info *sta; int i; local_bh_disable(); spin_lock(&fq->lock); if (!test_bit(SDATA_STATE_RUNNING, &sdata->state)) goto out; if (sdata->vif.type == NL80211_IFTYPE_AP) ps = &sdata->bss->ps; list_for_each_entry_rcu(sta, &local->sta_list, list) { if (sdata != sta->sdata) continue; for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { struct ieee80211_txq *txq = sta->sta.txq[i]; struct txq_info *sta_txqi; if (!txq) continue; sta_txqi = to_txq_info(txq); if (ac != txq->ac) continue; if (!test_and_clear_bit(IEEE80211_TXQ_DIRTY, &sta_txqi->flags)) continue; spin_unlock(&fq->lock); drv_wake_tx_queue(local, sta_txqi); spin_lock(&fq->lock); } } if (vif->txq) { txqi = to_txq_info(vif->txq); /* txq and txq_mgmt are mutually exclusive */ WARN_ON_ONCE(vif->txq_mgmt); if (!test_and_clear_bit(IEEE80211_TXQ_DIRTY, &txqi->flags) || (ps && atomic_read(&ps->num_sta_ps)) || ac != vif->txq->ac) txqi = NULL; } else if (vif->txq_mgmt) { txqi = to_txq_info(vif->txq_mgmt); if (!test_and_clear_bit(IEEE80211_TXQ_DIRTY, &txqi->flags) || ac != vif->txq_mgmt->ac) txqi = NULL; } spin_unlock(&fq->lock); if (txqi) drv_wake_tx_queue(local, txqi); local_bh_enable(); return; out: spin_unlock(&fq->lock); local_bh_enable(); } static void __releases(&local->queue_stop_reason_lock) __acquires(&local->queue_stop_reason_lock) _ieee80211_wake_txqs(struct ieee80211_local *local, unsigned long *flags) { struct ieee80211_sub_if_data *sdata; int n_acs = IEEE80211_NUM_ACS; int i; rcu_read_lock(); if (local->hw.queues < IEEE80211_NUM_ACS) n_acs = 1; for (i = 0; i < local->hw.queues; i++) { if (local->queue_stop_reasons[i]) continue; spin_unlock_irqrestore(&local->queue_stop_reason_lock, *flags); list_for_each_entry_rcu(sdata, &local->interfaces, list) { int ac; for (ac = 0; ac < n_acs; ac++) { int ac_queue = sdata->vif.hw_queue[ac]; if (ac_queue == i || sdata->vif.cab_queue == i) __ieee80211_wake_txqs(sdata, ac); } } spin_lock_irqsave(&local->queue_stop_reason_lock, *flags); } rcu_read_unlock(); } void ieee80211_wake_txqs(struct tasklet_struct *t) { struct ieee80211_local *local = from_tasklet(local, t, wake_txqs_tasklet); unsigned long flags; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); _ieee80211_wake_txqs(local, &flags); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue, enum queue_stop_reason reason, bool refcounted, unsigned long *flags) { struct ieee80211_local *local = hw_to_local(hw); if (WARN_ON(queue >= hw->queues)) return; if (!test_bit(reason, &local->queue_stop_reasons[queue])) return; if (!refcounted) { local->q_stop_reasons[queue][reason] = 0; } else { local->q_stop_reasons[queue][reason]--; if (WARN_ON(local->q_stop_reasons[queue][reason] < 0)) local->q_stop_reasons[queue][reason] = 0; } if (local->q_stop_reasons[queue][reason] == 0) __clear_bit(reason, &local->queue_stop_reasons[queue]); trace_wake_queue(local, queue, reason, local->q_stop_reasons[queue][reason]); if (local->queue_stop_reasons[queue] != 0) /* someone still has this queue stopped */ return; if (!skb_queue_empty(&local->pending[queue])) tasklet_schedule(&local->tx_pending_tasklet); /* * Calling _ieee80211_wake_txqs here can be a problem because it may * release queue_stop_reason_lock which has been taken by * __ieee80211_wake_queue's caller. It is certainly not very nice to * release someone's lock, but it is fine because all the callers of * __ieee80211_wake_queue call it right before releasing the lock. */ if (reason == IEEE80211_QUEUE_STOP_REASON_DRIVER) tasklet_schedule(&local->wake_txqs_tasklet); else _ieee80211_wake_txqs(local, flags); } void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue, enum queue_stop_reason reason, bool refcounted) { struct ieee80211_local *local = hw_to_local(hw); unsigned long flags; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); __ieee80211_wake_queue(hw, queue, reason, refcounted, &flags); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } void ieee80211_wake_queue(struct ieee80211_hw *hw, int queue) { ieee80211_wake_queue_by_reason(hw, queue, IEEE80211_QUEUE_STOP_REASON_DRIVER, false); } EXPORT_SYMBOL(ieee80211_wake_queue); static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue, enum queue_stop_reason reason, bool refcounted) { struct ieee80211_local *local = hw_to_local(hw); if (WARN_ON(queue >= hw->queues)) return; if (!refcounted) local->q_stop_reasons[queue][reason] = 1; else local->q_stop_reasons[queue][reason]++; trace_stop_queue(local, queue, reason, local->q_stop_reasons[queue][reason]); set_bit(reason, &local->queue_stop_reasons[queue]); } void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue, enum queue_stop_reason reason, bool refcounted) { struct ieee80211_local *local = hw_to_local(hw); unsigned long flags; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); __ieee80211_stop_queue(hw, queue, reason, refcounted); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } void ieee80211_stop_queue(struct ieee80211_hw *hw, int queue) { ieee80211_stop_queue_by_reason(hw, queue, IEEE80211_QUEUE_STOP_REASON_DRIVER, false); } EXPORT_SYMBOL(ieee80211_stop_queue); void ieee80211_add_pending_skb(struct ieee80211_local *local, struct sk_buff *skb) { struct ieee80211_hw *hw = &local->hw; unsigned long flags; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); int queue = info->hw_queue; if (WARN_ON(!info->control.vif)) { ieee80211_free_txskb(&local->hw, skb); return; } spin_lock_irqsave(&local->queue_stop_reason_lock, flags); __ieee80211_stop_queue(hw, queue, IEEE80211_QUEUE_STOP_REASON_SKB_ADD, false); __skb_queue_tail(&local->pending[queue], skb); __ieee80211_wake_queue(hw, queue, IEEE80211_QUEUE_STOP_REASON_SKB_ADD, false, &flags); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } void ieee80211_add_pending_skbs(struct ieee80211_local *local, struct sk_buff_head *skbs) { struct ieee80211_hw *hw = &local->hw; struct sk_buff *skb; unsigned long flags; int queue, i; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); while ((skb = skb_dequeue(skbs))) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); if (WARN_ON(!info->control.vif)) { ieee80211_free_txskb(&local->hw, skb); continue; } queue = info->hw_queue; __ieee80211_stop_queue(hw, queue, IEEE80211_QUEUE_STOP_REASON_SKB_ADD, false); __skb_queue_tail(&local->pending[queue], skb); } for (i = 0; i < hw->queues; i++) __ieee80211_wake_queue(hw, i, IEEE80211_QUEUE_STOP_REASON_SKB_ADD, false, &flags); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } void ieee80211_stop_queues_by_reason(struct ieee80211_hw *hw, unsigned long queues, enum queue_stop_reason reason, bool refcounted) { struct ieee80211_local *local = hw_to_local(hw); unsigned long flags; int i; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); for_each_set_bit(i, &queues, hw->queues) __ieee80211_stop_queue(hw, i, reason, refcounted); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } void ieee80211_stop_queues(struct ieee80211_hw *hw) { ieee80211_stop_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_DRIVER, false); } EXPORT_SYMBOL(ieee80211_stop_queues); int ieee80211_queue_stopped(struct ieee80211_hw *hw, int queue) { struct ieee80211_local *local = hw_to_local(hw); unsigned long flags; int ret; if (WARN_ON(queue >= hw->queues)) return true; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); ret = test_bit(IEEE80211_QUEUE_STOP_REASON_DRIVER, &local->queue_stop_reasons[queue]); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); return ret; } EXPORT_SYMBOL(ieee80211_queue_stopped); void ieee80211_wake_queues_by_reason(struct ieee80211_hw *hw, unsigned long queues, enum queue_stop_reason reason, bool refcounted) { struct ieee80211_local *local = hw_to_local(hw); unsigned long flags; int i; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); for_each_set_bit(i, &queues, hw->queues) __ieee80211_wake_queue(hw, i, reason, refcounted, &flags); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } void ieee80211_wake_queues(struct ieee80211_hw *hw) { ieee80211_wake_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_DRIVER, false); } EXPORT_SYMBOL(ieee80211_wake_queues); unsigned int ieee80211_get_vif_queues(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { unsigned int queues; if (sdata && ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) { int ac; queues = 0; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) if (sdata->vif.hw_queue[ac] != IEEE80211_INVAL_HW_QUEUE) queues |= BIT(sdata->vif.hw_queue[ac]); if (sdata->vif.cab_queue != IEEE80211_INVAL_HW_QUEUE) queues |= BIT(sdata->vif.cab_queue); } else { /* all queues */ queues = BIT(local->hw.queues) - 1; } return queues; } void __ieee80211_flush_queues(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, unsigned int queues, bool drop) { if (!local->ops->flush && !drop) return; /* * If no queue was set, or if the HW doesn't support * IEEE80211_HW_QUEUE_CONTROL - flush all queues */ if (!queues || !ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) queues = ieee80211_get_vif_queues(local, sdata); ieee80211_stop_queues_by_reason(&local->hw, queues, IEEE80211_QUEUE_STOP_REASON_FLUSH, false); if (drop) { struct sta_info *sta; /* Purge the queues, so the frames on them won't be * sent during __ieee80211_wake_queue() */ list_for_each_entry(sta, &local->sta_list, list) { if (sdata != sta->sdata) continue; ieee80211_purge_sta_txqs(sta); } } if (local->ops->flush) drv_flush(local, sdata, queues, drop); ieee80211_wake_queues_by_reason(&local->hw, queues, IEEE80211_QUEUE_STOP_REASON_FLUSH, false); } void ieee80211_flush_queues(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, bool drop) { __ieee80211_flush_queues(local, sdata, 0, drop); } static void __iterate_interfaces(struct ieee80211_local *local, u32 iter_flags, void (*iterator)(void *data, u8 *mac, struct ieee80211_vif *vif), void *data) { struct ieee80211_sub_if_data *sdata; bool active_only = iter_flags & IEEE80211_IFACE_ITER_ACTIVE; list_for_each_entry_rcu(sdata, &local->interfaces, list, lockdep_is_held(&local->iflist_mtx) || lockdep_is_held(&local->hw.wiphy->mtx)) { switch (sdata->vif.type) { case NL80211_IFTYPE_MONITOR: if (!(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) && !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) continue; break; case NL80211_IFTYPE_AP_VLAN: continue; default: break; } if (!(iter_flags & IEEE80211_IFACE_ITER_RESUME_ALL) && active_only && !(sdata->flags & IEEE80211_SDATA_IN_DRIVER)) continue; if ((iter_flags & IEEE80211_IFACE_SKIP_SDATA_NOT_IN_DRIVER) && !(sdata->flags & IEEE80211_SDATA_IN_DRIVER)) continue; if (ieee80211_sdata_running(sdata) || !active_only) iterator(data, sdata->vif.addr, &sdata->vif); } sdata = rcu_dereference_check(local->monitor_sdata, lockdep_is_held(&local->iflist_mtx) || lockdep_is_held(&local->hw.wiphy->mtx)); if (sdata && ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) && (iter_flags & IEEE80211_IFACE_ITER_RESUME_ALL || !active_only || sdata->flags & IEEE80211_SDATA_IN_DRIVER)) iterator(data, sdata->vif.addr, &sdata->vif); } void ieee80211_iterate_interfaces( struct ieee80211_hw *hw, u32 iter_flags, void (*iterator)(void *data, u8 *mac, struct ieee80211_vif *vif), void *data) { struct ieee80211_local *local = hw_to_local(hw); mutex_lock(&local->iflist_mtx); __iterate_interfaces(local, iter_flags, iterator, data); mutex_unlock(&local->iflist_mtx); } EXPORT_SYMBOL_GPL(ieee80211_iterate_interfaces); void ieee80211_iterate_active_interfaces_atomic( struct ieee80211_hw *hw, u32 iter_flags, void (*iterator)(void *data, u8 *mac, struct ieee80211_vif *vif), void *data) { struct ieee80211_local *local = hw_to_local(hw); rcu_read_lock(); __iterate_interfaces(local, iter_flags | IEEE80211_IFACE_ITER_ACTIVE, iterator, data); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces_atomic); struct ieee80211_vif * __ieee80211_iterate_interfaces(struct ieee80211_hw *hw, struct ieee80211_vif *prev, u32 iter_flags) { bool active_only = iter_flags & IEEE80211_IFACE_ITER_ACTIVE; struct ieee80211_sub_if_data *sdata = NULL, *monitor; struct ieee80211_local *local = hw_to_local(hw); lockdep_assert_wiphy(hw->wiphy); if (prev) sdata = vif_to_sdata(prev); monitor = rcu_dereference_check(local->monitor_sdata, lockdep_is_held(&hw->wiphy->mtx)); if (monitor && monitor == sdata) return NULL; sdata = list_prepare_entry(sdata, &local->interfaces, list); list_for_each_entry_continue(sdata, &local->interfaces, list) { switch (sdata->vif.type) { case NL80211_IFTYPE_MONITOR: if (!(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) && !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) continue; break; case NL80211_IFTYPE_AP_VLAN: continue; default: break; } if (!(iter_flags & IEEE80211_IFACE_ITER_RESUME_ALL) && active_only && !(sdata->flags & IEEE80211_SDATA_IN_DRIVER)) continue; if ((iter_flags & IEEE80211_IFACE_SKIP_SDATA_NOT_IN_DRIVER) && !(sdata->flags & IEEE80211_SDATA_IN_DRIVER)) continue; if (ieee80211_sdata_running(sdata) || !active_only) return &sdata->vif; } if (monitor && ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) && (iter_flags & IEEE80211_IFACE_ITER_RESUME_ALL || !active_only || monitor->flags & IEEE80211_SDATA_IN_DRIVER)) return &monitor->vif; return NULL; } EXPORT_SYMBOL_GPL(__ieee80211_iterate_interfaces); static void __iterate_stations(struct ieee80211_local *local, void (*iterator)(void *data, struct ieee80211_sta *sta), void *data) { struct sta_info *sta; list_for_each_entry_rcu(sta, &local->sta_list, list, lockdep_is_held(&local->hw.wiphy->mtx)) { if (!sta->uploaded) continue; iterator(data, &sta->sta); } } void ieee80211_iterate_stations_atomic(struct ieee80211_hw *hw, void (*iterator)(void *data, struct ieee80211_sta *sta), void *data) { struct ieee80211_local *local = hw_to_local(hw); rcu_read_lock(); __iterate_stations(local, iterator, data); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ieee80211_iterate_stations_atomic); struct ieee80211_sta * __ieee80211_iterate_stations(struct ieee80211_hw *hw, struct ieee80211_sta *prev) { struct ieee80211_local *local = hw_to_local(hw); struct sta_info *sta = NULL; lockdep_assert_wiphy(local->hw.wiphy); if (prev) sta = container_of(prev, struct sta_info, sta); sta = list_prepare_entry(sta, &local->sta_list, list); list_for_each_entry_continue(sta, &local->sta_list, list) { if (!sta->uploaded) continue; return &sta->sta; } return NULL; } EXPORT_SYMBOL_GPL(__ieee80211_iterate_stations); struct ieee80211_vif *wdev_to_ieee80211_vif(struct wireless_dev *wdev) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); if (!ieee80211_sdata_running(sdata) || !(sdata->flags & IEEE80211_SDATA_IN_DRIVER)) return NULL; return &sdata->vif; } EXPORT_SYMBOL_GPL(wdev_to_ieee80211_vif); struct wireless_dev *ieee80211_vif_to_wdev(struct ieee80211_vif *vif) { if (!vif) return NULL; return &vif_to_sdata(vif)->wdev; } EXPORT_SYMBOL_GPL(ieee80211_vif_to_wdev); /* * Nothing should have been stuffed into the workqueue during * the suspend->resume cycle. Since we can't check each caller * of this function if we are already quiescing / suspended, * check here and don't WARN since this can actually happen when * the rx path (for example) is racing against __ieee80211_suspend * and suspending / quiescing was set after the rx path checked * them. */ static bool ieee80211_can_queue_work(struct ieee80211_local *local) { if (local->quiescing || (local->suspended && !local->resuming)) { pr_warn("queueing ieee80211 work while going to suspend\n"); return false; } return true; } void ieee80211_queue_work(struct ieee80211_hw *hw, struct work_struct *work) { struct ieee80211_local *local = hw_to_local(hw); if (!ieee80211_can_queue_work(local)) return; queue_work(local->workqueue, work); } EXPORT_SYMBOL(ieee80211_queue_work); void ieee80211_queue_delayed_work(struct ieee80211_hw *hw, struct delayed_work *dwork, unsigned long delay) { struct ieee80211_local *local = hw_to_local(hw); if (!ieee80211_can_queue_work(local)) return; queue_delayed_work(local->workqueue, dwork, delay); } EXPORT_SYMBOL(ieee80211_queue_delayed_work); void ieee80211_regulatory_limit_wmm_params(struct ieee80211_sub_if_data *sdata, struct ieee80211_tx_queue_params *qparam, int ac) { struct ieee80211_chanctx_conf *chanctx_conf; const struct ieee80211_reg_rule *rrule; const struct ieee80211_wmm_ac *wmm_ac; u16 center_freq = 0; if (sdata->vif.type != NL80211_IFTYPE_AP && sdata->vif.type != NL80211_IFTYPE_STATION) return; rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (chanctx_conf) center_freq = chanctx_conf->def.chan->center_freq; if (!center_freq) { rcu_read_unlock(); return; } rrule = freq_reg_info(sdata->wdev.wiphy, MHZ_TO_KHZ(center_freq)); if (IS_ERR_OR_NULL(rrule) || !rrule->has_wmm) { rcu_read_unlock(); return; } if (sdata->vif.type == NL80211_IFTYPE_AP) wmm_ac = &rrule->wmm_rule.ap[ac]; else wmm_ac = &rrule->wmm_rule.client[ac]; qparam->cw_min = max_t(u16, qparam->cw_min, wmm_ac->cw_min); qparam->cw_max = max_t(u16, qparam->cw_max, wmm_ac->cw_max); qparam->aifs = max_t(u8, qparam->aifs, wmm_ac->aifsn); qparam->txop = min_t(u16, qparam->txop, wmm_ac->cot / 32); rcu_read_unlock(); } void ieee80211_set_wmm_default(struct ieee80211_link_data *link, bool bss_notify, bool enable_qos) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_tx_queue_params qparam; struct ieee80211_chanctx_conf *chanctx_conf; int ac; bool use_11b; bool is_ocb; /* Use another EDCA parameters if dot11OCBActivated=true */ int aCWmin, aCWmax; if (!local->ops->conf_tx) return; if (local->hw.queues < IEEE80211_NUM_ACS) return; memset(&qparam, 0, sizeof(qparam)); rcu_read_lock(); chanctx_conf = rcu_dereference(link->conf->chanctx_conf); use_11b = (chanctx_conf && chanctx_conf->def.chan->band == NL80211_BAND_2GHZ) && !link->operating_11g_mode; rcu_read_unlock(); is_ocb = (sdata->vif.type == NL80211_IFTYPE_OCB); /* Set defaults according to 802.11-2007 Table 7-37 */ aCWmax = 1023; if (use_11b) aCWmin = 31; else aCWmin = 15; /* Configure old 802.11b/g medium access rules. */ qparam.cw_max = aCWmax; qparam.cw_min = aCWmin; qparam.txop = 0; qparam.aifs = 2; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { /* Update if QoS is enabled. */ if (enable_qos) { switch (ac) { case IEEE80211_AC_BK: qparam.cw_max = aCWmax; qparam.cw_min = aCWmin; qparam.txop = 0; if (is_ocb) qparam.aifs = 9; else qparam.aifs = 7; break; /* never happens but let's not leave undefined */ default: case IEEE80211_AC_BE: qparam.cw_max = aCWmax; qparam.cw_min = aCWmin; qparam.txop = 0; if (is_ocb) qparam.aifs = 6; else qparam.aifs = 3; break; case IEEE80211_AC_VI: qparam.cw_max = aCWmin; qparam.cw_min = (aCWmin + 1) / 2 - 1; if (is_ocb) qparam.txop = 0; else if (use_11b) qparam.txop = 6016/32; else qparam.txop = 3008/32; if (is_ocb) qparam.aifs = 3; else qparam.aifs = 2; break; case IEEE80211_AC_VO: qparam.cw_max = (aCWmin + 1) / 2 - 1; qparam.cw_min = (aCWmin + 1) / 4 - 1; if (is_ocb) qparam.txop = 0; else if (use_11b) qparam.txop = 3264/32; else qparam.txop = 1504/32; qparam.aifs = 2; break; } } ieee80211_regulatory_limit_wmm_params(sdata, &qparam, ac); qparam.uapsd = false; link->tx_conf[ac] = qparam; drv_conf_tx(local, link, ac, &qparam); } if (sdata->vif.type != NL80211_IFTYPE_MONITOR && sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE && sdata->vif.type != NL80211_IFTYPE_NAN) { link->conf->qos = enable_qos; if (bss_notify) ieee80211_link_info_change_notify(sdata, link, BSS_CHANGED_QOS); } } void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, u16 transaction, u16 auth_alg, u16 status, const u8 *extra, size_t extra_len, const u8 *da, const u8 *bssid, const u8 *key, u8 key_len, u8 key_idx, u32 tx_flags) { struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; bool multi_link = ieee80211_vif_is_mld(&sdata->vif); struct { u8 id; u8 len; u8 ext_id; struct ieee80211_multi_link_elem ml; struct ieee80211_mle_basic_common_info basic; } __packed mle = { .id = WLAN_EID_EXTENSION, .len = sizeof(mle) - 2, .ext_id = WLAN_EID_EXT_EHT_MULTI_LINK, .ml.control = cpu_to_le16(IEEE80211_ML_CONTROL_TYPE_BASIC), .basic.len = sizeof(mle.basic), }; bool add_mle; int err; add_mle = (multi_link && !cfg80211_find_ext_elem(WLAN_EID_EXT_EHT_MULTI_LINK, extra, extra_len)); /* 24 + 6 = header + auth_algo + auth_transaction + status_code */ skb = dev_alloc_skb(local->hw.extra_tx_headroom + IEEE80211_WEP_IV_LEN + 24 + 6 + extra_len + IEEE80211_WEP_ICV_LEN + add_mle * sizeof(mle)); if (!skb) return; skb_reserve(skb, local->hw.extra_tx_headroom + IEEE80211_WEP_IV_LEN); mgmt = skb_put_zero(skb, 24 + 6); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_AUTH); memcpy(mgmt->da, da, ETH_ALEN); memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); memcpy(mgmt->bssid, bssid, ETH_ALEN); mgmt->u.auth.auth_alg = cpu_to_le16(auth_alg); mgmt->u.auth.auth_transaction = cpu_to_le16(transaction); mgmt->u.auth.status_code = cpu_to_le16(status); if (extra) skb_put_data(skb, extra, extra_len); if (add_mle) { memcpy(mle.basic.mld_mac_addr, sdata->vif.addr, ETH_ALEN); skb_put_data(skb, &mle, sizeof(mle)); } if (auth_alg == WLAN_AUTH_SHARED_KEY && transaction == 3) { mgmt->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); err = ieee80211_wep_encrypt(local, skb, key, key_len, key_idx); if (WARN_ON(err)) { kfree_skb(skb); return; } } IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT | tx_flags; ieee80211_tx_skb(sdata, skb); } void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, const u8 *da, const u8 *bssid, u16 stype, u16 reason, bool send_frame, u8 *frame_buf) { struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ieee80211_mgmt *mgmt = (void *)frame_buf; /* build frame */ mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | stype); mgmt->duration = 0; /* initialize only */ mgmt->seq_ctrl = 0; /* initialize only */ memcpy(mgmt->da, da, ETH_ALEN); memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); memcpy(mgmt->bssid, bssid, ETH_ALEN); /* u.deauth.reason_code == u.disassoc.reason_code */ mgmt->u.deauth.reason_code = cpu_to_le16(reason); if (send_frame) { skb = dev_alloc_skb(local->hw.extra_tx_headroom + IEEE80211_DEAUTH_FRAME_LEN); if (!skb) return; skb_reserve(skb, local->hw.extra_tx_headroom); /* copy in frame */ skb_put_data(skb, mgmt, IEEE80211_DEAUTH_FRAME_LEN); if (sdata->vif.type != NL80211_IFTYPE_STATION || !(sdata->u.mgd.flags & IEEE80211_STA_MFP_ENABLED)) IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; ieee80211_tx_skb(sdata, skb); } } static int ieee80211_put_s1g_cap(struct sk_buff *skb, struct ieee80211_sta_s1g_cap *s1g_cap) { if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_s1g_cap)) return -ENOBUFS; skb_put_u8(skb, WLAN_EID_S1G_CAPABILITIES); skb_put_u8(skb, sizeof(struct ieee80211_s1g_cap)); skb_put_data(skb, &s1g_cap->cap, sizeof(s1g_cap->cap)); skb_put_data(skb, &s1g_cap->nss_mcs, sizeof(s1g_cap->nss_mcs)); return 0; } static int ieee80211_put_preq_ies_band(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata, const u8 *ie, size_t ie_len, size_t *offset, enum nl80211_band band, u32 rate_mask, struct cfg80211_chan_def *chandef, u32 flags) { struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; int i, err; size_t noffset; bool have_80mhz = false; *offset = 0; sband = local->hw.wiphy->bands[band]; if (WARN_ON_ONCE(!sband)) return 0; /* For direct scan add S1G IE and consider its override bits */ if (band == NL80211_BAND_S1GHZ) return ieee80211_put_s1g_cap(skb, &sband->s1g_cap); err = ieee80211_put_srates_elem(skb, sband, 0, ~rate_mask, WLAN_EID_SUPP_RATES); if (err) return err; /* insert "request information" if in custom IEs */ if (ie && ie_len) { static const u8 before_extrates[] = { WLAN_EID_SSID, WLAN_EID_SUPP_RATES, WLAN_EID_REQUEST, }; noffset = ieee80211_ie_split(ie, ie_len, before_extrates, ARRAY_SIZE(before_extrates), *offset); if (skb_tailroom(skb) < noffset - *offset) return -ENOBUFS; skb_put_data(skb, ie + *offset, noffset - *offset); *offset = noffset; } err = ieee80211_put_srates_elem(skb, sband, 0, ~rate_mask, WLAN_EID_EXT_SUPP_RATES); if (err) return err; if (chandef->chan && sband->band == NL80211_BAND_2GHZ) { if (skb_tailroom(skb) < 3) return -ENOBUFS; skb_put_u8(skb, WLAN_EID_DS_PARAMS); skb_put_u8(skb, 1); skb_put_u8(skb, ieee80211_frequency_to_channel(chandef->chan->center_freq)); } if (flags & IEEE80211_PROBE_FLAG_MIN_CONTENT) return 0; /* insert custom IEs that go before HT */ if (ie && ie_len) { static const u8 before_ht[] = { /* * no need to list the ones split off already * (or generated here) */ WLAN_EID_DS_PARAMS, WLAN_EID_SUPPORTED_REGULATORY_CLASSES, }; noffset = ieee80211_ie_split(ie, ie_len, before_ht, ARRAY_SIZE(before_ht), *offset); if (skb_tailroom(skb) < noffset - *offset) return -ENOBUFS; skb_put_data(skb, ie + *offset, noffset - *offset); *offset = noffset; } if (sband->ht_cap.ht_supported) { u8 *pos; if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_ht_cap)) return -ENOBUFS; pos = skb_put(skb, 2 + sizeof(struct ieee80211_ht_cap)); ieee80211_ie_build_ht_cap(pos, &sband->ht_cap, sband->ht_cap.cap); } /* insert custom IEs that go before VHT */ if (ie && ie_len) { static const u8 before_vht[] = { /* * no need to list the ones split off already * (or generated here) */ WLAN_EID_BSS_COEX_2040, WLAN_EID_EXT_CAPABILITY, WLAN_EID_SSID_LIST, WLAN_EID_CHANNEL_USAGE, WLAN_EID_INTERWORKING, WLAN_EID_MESH_ID, /* 60 GHz (Multi-band, DMG, MMS) can't happen */ }; noffset = ieee80211_ie_split(ie, ie_len, before_vht, ARRAY_SIZE(before_vht), *offset); if (skb_tailroom(skb) < noffset - *offset) return -ENOBUFS; skb_put_data(skb, ie + *offset, noffset - *offset); *offset = noffset; } /* Check if any channel in this sband supports at least 80 MHz */ for (i = 0; i < sband->n_channels; i++) { if (sband->channels[i].flags & (IEEE80211_CHAN_DISABLED | IEEE80211_CHAN_NO_80MHZ)) continue; have_80mhz = true; break; } if (sband->vht_cap.vht_supported && have_80mhz) { u8 *pos; if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_vht_cap)) return -ENOBUFS; pos = skb_put(skb, 2 + sizeof(struct ieee80211_vht_cap)); ieee80211_ie_build_vht_cap(pos, &sband->vht_cap, sband->vht_cap.cap); } /* insert custom IEs that go before HE */ if (ie && ie_len) { static const u8 before_he[] = { /* * no need to list the ones split off before VHT * or generated here */ WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_REQ_PARAMS, WLAN_EID_AP_CSN, /* TODO: add 11ah/11aj/11ak elements */ }; noffset = ieee80211_ie_split(ie, ie_len, before_he, ARRAY_SIZE(before_he), *offset); if (skb_tailroom(skb) < noffset - *offset) return -ENOBUFS; skb_put_data(skb, ie + *offset, noffset - *offset); *offset = noffset; } if (cfg80211_any_usable_channels(local->hw.wiphy, BIT(sband->band), IEEE80211_CHAN_NO_HE)) { err = ieee80211_put_he_cap(skb, sdata, sband, NULL); if (err) return err; } if (cfg80211_any_usable_channels(local->hw.wiphy, BIT(sband->band), IEEE80211_CHAN_NO_HE | IEEE80211_CHAN_NO_EHT)) { err = ieee80211_put_eht_cap(skb, sdata, sband, NULL); if (err) return err; } err = ieee80211_put_he_6ghz_cap(skb, sdata, IEEE80211_SMPS_OFF); if (err) return err; if (cfg80211_any_usable_channels(local->hw.wiphy, BIT(sband->band), IEEE80211_CHAN_NO_UHR)) { err = ieee80211_put_uhr_cap(skb, sdata, sband); if (err) return err; } /* * If adding more here, adjust code in main.c * that calculates local->scan_ies_len. */ return 0; } static int ieee80211_put_preq_ies(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata, struct ieee80211_scan_ies *ie_desc, const u8 *ie, size_t ie_len, u8 bands_used, u32 *rate_masks, struct cfg80211_chan_def *chandef, u32 flags) { size_t custom_ie_offset = 0; int i, err; memset(ie_desc, 0, sizeof(*ie_desc)); for (i = 0; i < NUM_NL80211_BANDS; i++) { if (bands_used & BIT(i)) { ie_desc->ies[i] = skb_tail_pointer(skb); err = ieee80211_put_preq_ies_band(skb, sdata, ie, ie_len, &custom_ie_offset, i, rate_masks[i], chandef, flags); if (err) return err; ie_desc->len[i] = skb_tail_pointer(skb) - ie_desc->ies[i]; } } /* add any remaining custom IEs */ if (ie && ie_len) { if (WARN_ONCE(skb_tailroom(skb) < ie_len - custom_ie_offset, "not enough space for preq custom IEs\n")) return -ENOBUFS; ie_desc->common_ies = skb_tail_pointer(skb); skb_put_data(skb, ie + custom_ie_offset, ie_len - custom_ie_offset); ie_desc->common_ie_len = skb_tail_pointer(skb) - ie_desc->common_ies; } return 0; }; int ieee80211_build_preq_ies(struct ieee80211_sub_if_data *sdata, u8 *buffer, size_t buffer_len, struct ieee80211_scan_ies *ie_desc, const u8 *ie, size_t ie_len, u8 bands_used, u32 *rate_masks, struct cfg80211_chan_def *chandef, u32 flags) { struct sk_buff *skb = alloc_skb(buffer_len, GFP_KERNEL); uintptr_t offs; int ret, i; u8 *start; if (!skb) return -ENOMEM; start = skb_tail_pointer(skb); memset(start, 0, skb_tailroom(skb)); ret = ieee80211_put_preq_ies(skb, sdata, ie_desc, ie, ie_len, bands_used, rate_masks, chandef, flags); if (ret < 0) { goto out; } if (skb->len > buffer_len) { ret = -ENOBUFS; goto out; } memcpy(buffer, start, skb->len); /* adjust ie_desc for copy */ for (i = 0; i < NUM_NL80211_BANDS; i++) { offs = ie_desc->ies[i] - start; ie_desc->ies[i] = buffer + offs; } offs = ie_desc->common_ies - start; ie_desc->common_ies = buffer + offs; ret = skb->len; out: consume_skb(skb); return ret; } struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, const u8 *src, const u8 *dst, u32 ratemask, struct ieee80211_channel *chan, const u8 *ssid, size_t ssid_len, const u8 *ie, size_t ie_len, u32 flags) { struct ieee80211_local *local = sdata->local; struct cfg80211_chan_def chandef; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; u32 rate_masks[NUM_NL80211_BANDS] = {}; struct ieee80211_scan_ies dummy_ie_desc; /* * Do not send DS Channel parameter for directed probe requests * in order to maximize the chance that we get a response. Some * badly-behaved APs don't respond when this parameter is included. */ chandef.width = sdata->vif.bss_conf.chanreq.oper.width; if (flags & IEEE80211_PROBE_FLAG_DIRECTED) chandef.chan = NULL; else chandef.chan = chan; skb = ieee80211_probereq_get(&local->hw, src, ssid, ssid_len, local->scan_ies_len + ie_len); if (!skb) return NULL; rate_masks[chan->band] = ratemask; ieee80211_put_preq_ies(skb, sdata, &dummy_ie_desc, ie, ie_len, BIT(chan->band), rate_masks, &chandef, flags); if (dst) { mgmt = (struct ieee80211_mgmt *) skb->data; memcpy(mgmt->da, dst, ETH_ALEN); memcpy(mgmt->bssid, dst, ETH_ALEN); } IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; return skb; } u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, enum nl80211_band band, u32 *basic_rates) { struct ieee80211_supported_band *sband; size_t num_rates; u32 supp_rates; int i, j; sband = sdata->local->hw.wiphy->bands[band]; if (WARN_ON(!sband)) return 1; num_rates = sband->n_bitrates; supp_rates = 0; for (i = 0; i < elems->supp_rates_len + elems->ext_supp_rates_len; i++) { u8 rate = 0; int own_rate; bool is_basic; if (i < elems->supp_rates_len) rate = elems->supp_rates[i]; else if (elems->ext_supp_rates) rate = elems->ext_supp_rates [i - elems->supp_rates_len]; own_rate = 5 * (rate & 0x7f); is_basic = !!(rate & 0x80); if (is_basic && (rate & 0x7f) == BSS_MEMBERSHIP_SELECTOR_HT_PHY) continue; for (j = 0; j < num_rates; j++) { int brate = sband->bitrates[j].bitrate; if (brate == own_rate) { supp_rates |= BIT(j); if (basic_rates && is_basic) *basic_rates |= BIT(j); } } } return supp_rates; } void ieee80211_stop_device(struct ieee80211_local *local, bool suspend) { local_bh_disable(); ieee80211_handle_queued_frames(local); local_bh_enable(); ieee80211_led_radio(local, false); ieee80211_mod_tpt_led_trig(local, 0, IEEE80211_TPT_LEDTRIG_FL_RADIO); wiphy_work_cancel(local->hw.wiphy, &local->reconfig_filter); flush_workqueue(local->workqueue); wiphy_work_flush(local->hw.wiphy, NULL); drv_stop(local, suspend); } static void ieee80211_flush_completed_scan(struct ieee80211_local *local, bool aborted) { /* It's possible that we don't handle the scan completion in * time during suspend, so if it's still marked as completed * here, queue the work and flush it to clean things up. * Instead of calling the worker function directly here, we * really queue it to avoid potential races with other flows * scheduling the same work. */ if (test_bit(SCAN_COMPLETED, &local->scanning)) { /* If coming from reconfiguration failure, abort the scan so * we don't attempt to continue a partial HW scan - which is * possible otherwise if (e.g.) the 2.4 GHz portion was the * completed scan, and a 5 GHz portion is still pending. */ if (aborted) set_bit(SCAN_ABORTED, &local->scanning); wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, 0); wiphy_delayed_work_flush(local->hw.wiphy, &local->scan_work); } } static void ieee80211_handle_reconfig_failure(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; struct ieee80211_chanctx *ctx; lockdep_assert_wiphy(local->hw.wiphy); /* * We get here if during resume the device can't be restarted properly. * We might also get here if this happens during HW reset, which is a * slightly different situation and we need to drop all connections in * the latter case. * * Ask cfg80211 to turn off all interfaces, this will result in more * warnings but at least we'll then get into a clean stopped state. */ local->resuming = false; local->suspended = false; local->in_reconfig = false; local->reconfig_failure = true; ieee80211_flush_completed_scan(local, true); /* scheduled scan clearly can't be running any more, but tell * cfg80211 and clear local state */ ieee80211_sched_scan_end(local); list_for_each_entry(sdata, &local->interfaces, list) sdata->flags &= ~IEEE80211_SDATA_IN_DRIVER; /* Mark channel contexts as not being in the driver any more to avoid * removing them from the driver during the shutdown process... */ list_for_each_entry(ctx, &local->chanctx_list, list) ctx->driver_present = false; } static void ieee80211_assign_chanctx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link) { struct ieee80211_chanctx_conf *conf; struct ieee80211_chanctx *ctx; lockdep_assert_wiphy(local->hw.wiphy); conf = rcu_dereference_protected(link->conf->chanctx_conf, lockdep_is_held(&local->hw.wiphy->mtx)); if (conf) { ctx = container_of(conf, struct ieee80211_chanctx, conf); drv_assign_vif_chanctx(local, sdata, link->conf, ctx); } } static void ieee80211_reconfig_stations(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct sta_info *sta; lockdep_assert_wiphy(local->hw.wiphy); /* add STAs back */ list_for_each_entry(sta, &local->sta_list, list) { enum ieee80211_sta_state state; if (!sta->uploaded || sta->sdata != sdata) continue; for (state = IEEE80211_STA_NOTEXIST; state < sta->sta_state; state++) WARN_ON(drv_sta_state(local, sta->sdata, sta, state, state + 1)); } } static int ieee80211_reconfig_nan_offload_de(struct ieee80211_sub_if_data *sdata) { struct cfg80211_nan_func *func, **funcs; int res, id, i = 0; funcs = kzalloc_objs(*funcs, sdata->local->hw.max_nan_de_entries + 1); if (!funcs) return -ENOMEM; /* Add all the functions: * This is a little bit ugly. We need to call a potentially sleeping * callback for each NAN function, so we can't hold the spinlock. */ spin_lock_bh(&sdata->u.nan.de.func_lock); idr_for_each_entry(&sdata->u.nan.de.function_inst_ids, func, id) funcs[i++] = func; spin_unlock_bh(&sdata->u.nan.de.func_lock); for (i = 0; funcs[i]; i++) { res = drv_add_nan_func(sdata->local, sdata, funcs[i]); if (WARN_ON(res)) ieee80211_nan_func_terminated(&sdata->vif, funcs[i]->instance_id, NL80211_NAN_FUNC_TERM_REASON_ERROR, GFP_KERNEL); } kfree(funcs); return res; } static int ieee80211_reconfig_nan(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_sub_if_data *ndi_sdata; struct sta_info *sta; int res; res = drv_start_nan(local, sdata, &sdata->u.nan.conf); if (WARN_ON(res)) return res; if (!(sdata->local->hw.wiphy->nan_capa.flags & WIPHY_NAN_FLAGS_USERSPACE_DE)) return ieee80211_reconfig_nan_offload_de(sdata); drv_vif_cfg_changed(sdata->local, sdata, BSS_CHANGED_NAN_LOCAL_SCHED); /* Now we can add all the NDIs to the driver */ list_for_each_entry(ndi_sdata, &local->interfaces, list) { if (ndi_sdata->vif.type == NL80211_IFTYPE_NAN_DATA) { res = drv_add_interface(local, ndi_sdata); if (WARN_ON(res)) return res; } } /* Add NMI stations (stations on the NAN interface) */ list_for_each_entry(sta, &local->sta_list, list) { enum ieee80211_sta_state state; if (!sta->uploaded || sta->sdata != sdata) continue; for (state = IEEE80211_STA_NOTEXIST; state < sta->sta_state; state++) { res = drv_sta_state(local, sdata, sta, state, state + 1); if (WARN_ON(res)) return res; } /* Add peer schedules for NMI stations that have them */ if (!sta->sta.nan_sched) continue; res = drv_nan_peer_sched_changed(local, sdata, sta); if (WARN_ON(res)) return res; } /* Add NDI stations (stations on NAN_DATA interfaces) */ list_for_each_entry(sta, &local->sta_list, list) { enum ieee80211_sta_state state; if (!sta->uploaded || sta->sdata->vif.type != NL80211_IFTYPE_NAN_DATA) continue; if (WARN_ON(!sta->sta.nmi)) continue; for (state = IEEE80211_STA_NOTEXIST; state < sta->sta_state; state++) { res = drv_sta_state(local, sta->sdata, sta, state, state + 1); if (WARN_ON(res)) return res; } } return 0; } static void ieee80211_reconfig_ap_links(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u64 changed) { int link_id; for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { struct ieee80211_link_data *link; if (!(sdata->vif.active_links & BIT(link_id))) continue; link = sdata_dereference(sdata->link[link_id], sdata); if (!link) continue; if (rcu_access_pointer(link->u.ap.beacon)) drv_start_ap(local, sdata, link->conf); if (!link->conf->enable_beacon) continue; changed |= BSS_CHANGED_BEACON | BSS_CHANGED_BEACON_ENABLED; ieee80211_link_info_change_notify(sdata, link, changed); } } int ieee80211_reconfig(struct ieee80211_local *local) { struct ieee80211_hw *hw = &local->hw; struct ieee80211_sub_if_data *sdata; struct ieee80211_chanctx *ctx; struct sta_info *sta; int res, i; bool reconfig_due_to_wowlan = false; struct ieee80211_sub_if_data *sched_scan_sdata; struct cfg80211_sched_scan_request *sched_scan_req; bool sched_scan_stopped = false; bool suspended = local->suspended; bool in_reconfig = false; lockdep_assert_wiphy(local->hw.wiphy); /* nothing to do if HW shouldn't run */ if (!local->open_count) goto wake_up; #ifdef CONFIG_PM if (suspended) local->resuming = true; if (local->wowlan) { /* * In the wowlan case, both mac80211 and the device * are functional when the resume op is called, so * clear local->suspended so the device could operate * normally (e.g. pass rx frames). */ local->suspended = false; res = drv_resume(local); local->wowlan = false; if (res < 0) { local->resuming = false; return res; } if (res == 0) goto wake_up; WARN_ON(res > 1); /* * res is 1, which means the driver requested * to go through a regular reset on wakeup. * restore local->suspended in this case. */ reconfig_due_to_wowlan = true; local->suspended = true; } #endif /* * In case of hw_restart during suspend (without wowlan), * cancel restart work, as we are reconfiguring the device * anyway. * Note that restart_work is scheduled on a frozen workqueue, * so we can't deadlock in this case. */ if (suspended && local->in_reconfig && !reconfig_due_to_wowlan) cancel_work_sync(&local->restart_work); local->started = false; /* * Upon resume hardware can sometimes be goofy due to * various platform / driver / bus issues, so restarting * the device may at times not work immediately. Propagate * the error. */ res = drv_start(local); if (res) { if (suspended) WARN(1, "Hardware became unavailable upon resume. This could be a software issue prior to suspend or a hardware issue.\n"); else WARN(1, "Hardware became unavailable during restart.\n"); ieee80211_wake_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_SUSPEND, false); ieee80211_handle_reconfig_failure(local); return res; } /* setup fragmentation threshold */ drv_set_frag_threshold(local, -1, hw->wiphy->frag_threshold); /* setup RTS threshold */ if (hw->wiphy->n_radio > 0) { for (i = 0; i < hw->wiphy->n_radio; i++) { u32 rts_threshold = hw->wiphy->radio_cfg[i].rts_threshold; drv_set_rts_threshold(local, i, rts_threshold); } } else { drv_set_rts_threshold(local, -1, hw->wiphy->rts_threshold); } /* reset coverage class */ drv_set_coverage_class(local, -1, hw->wiphy->coverage_class); ieee80211_led_radio(local, true); ieee80211_mod_tpt_led_trig(local, IEEE80211_TPT_LEDTRIG_FL_RADIO, 0); /* add interfaces */ sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata); if (sdata && ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) { /* in HW restart it exists already */ WARN_ON(local->resuming); res = drv_add_interface(local, sdata); if (WARN_ON(res)) { RCU_INIT_POINTER(local->monitor_sdata, NULL); synchronize_net(); kfree(sdata); } } list_for_each_entry(sdata, &local->interfaces, list) { if (sdata->vif.type == NL80211_IFTYPE_MONITOR && !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) continue; /* These vifs can't be added before NAN was started */ if (sdata->vif.type == NL80211_IFTYPE_NAN_DATA) continue; if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN && ieee80211_sdata_running(sdata)) { res = drv_add_interface(local, sdata); if (WARN_ON(res)) break; } } /* If adding any of the interfaces failed above, roll back and * report failure. */ if (res) { list_for_each_entry_continue_reverse(sdata, &local->interfaces, list) { if (sdata->vif.type == NL80211_IFTYPE_MONITOR && !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) continue; if (sdata->vif.type == NL80211_IFTYPE_NAN_DATA) continue; if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN && ieee80211_sdata_running(sdata)) drv_remove_interface(local, sdata); } ieee80211_handle_reconfig_failure(local); return res; } /* add channel contexts */ list_for_each_entry(ctx, &local->chanctx_list, list) if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) WARN_ON(drv_add_chanctx(local, ctx)); sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata); if (sdata && ieee80211_sdata_running(sdata)) ieee80211_assign_chanctx(local, sdata, &sdata->deflink); /* reconfigure hardware */ ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_LISTEN_INTERVAL | IEEE80211_CONF_CHANGE_MONITOR | IEEE80211_CONF_CHANGE_PS | IEEE80211_CONF_CHANGE_RETRY_LIMITS | IEEE80211_CONF_CHANGE_IDLE); ieee80211_configure_filter(local); /* Finally also reconfigure all the BSS information */ list_for_each_entry(sdata, &local->interfaces, list) { /* common change flags for all interface types - link only */ u64 changed = BSS_CHANGED_ERP_CTS_PROT | BSS_CHANGED_ERP_PREAMBLE | BSS_CHANGED_ERP_SLOT | BSS_CHANGED_HT | BSS_CHANGED_BASIC_RATES | BSS_CHANGED_BEACON_INT | BSS_CHANGED_BSSID | BSS_CHANGED_CQM | BSS_CHANGED_QOS | BSS_CHANGED_TXPOWER | BSS_CHANGED_MCAST_RATE; struct ieee80211_link_data *link = NULL; unsigned int link_id; u32 active_links = 0; if (!ieee80211_sdata_running(sdata)) continue; if (ieee80211_vif_is_mld(&sdata->vif)) { struct ieee80211_bss_conf *old[IEEE80211_MLD_MAX_NUM_LINKS] = { [0] = &sdata->vif.bss_conf, }; if (sdata->vif.type == NL80211_IFTYPE_STATION) { /* start with a single active link */ active_links = sdata->vif.active_links; link_id = ffs(active_links) - 1; sdata->vif.active_links = BIT(link_id); } drv_change_vif_links(local, sdata, 0, sdata->vif.active_links, old); } sdata->restart_active_links = active_links; for (link_id = 0; link_id < ARRAY_SIZE(sdata->vif.link_conf); link_id++) { if (!ieee80211_vif_link_active(&sdata->vif, link_id)) continue; link = sdata_dereference(sdata->link[link_id], sdata); if (!link) continue; ieee80211_assign_chanctx(local, sdata, link); } switch (sdata->vif.type) { case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_MONITOR: break; case NL80211_IFTYPE_NAN: case NL80211_IFTYPE_NAN_DATA: /* NAN stations are handled later */ break; case NL80211_IFTYPE_ADHOC: if (sdata->vif.cfg.ibss_joined) WARN_ON(drv_join_ibss(local, sdata)); fallthrough; default: ieee80211_reconfig_stations(sdata); fallthrough; case NL80211_IFTYPE_AP: /* AP stations are handled later */ for (i = 0; i < IEEE80211_NUM_ACS; i++) drv_conf_tx(local, &sdata->deflink, i, &sdata->deflink.tx_conf[i]); break; } if (sdata->vif.bss_conf.mu_mimo_owner) changed |= BSS_CHANGED_MU_GROUPS; if (!ieee80211_vif_is_mld(&sdata->vif)) changed |= BSS_CHANGED_IDLE; switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: if (!ieee80211_vif_is_mld(&sdata->vif)) { changed |= BSS_CHANGED_ASSOC | BSS_CHANGED_ARP_FILTER | BSS_CHANGED_PS; /* Re-send beacon info report to the driver */ if (sdata->deflink.u.mgd.have_beacon) changed |= BSS_CHANGED_BEACON_INFO; if (sdata->vif.bss_conf.max_idle_period || sdata->vif.bss_conf.protected_keep_alive) changed |= BSS_CHANGED_KEEP_ALIVE; ieee80211_bss_info_change_notify(sdata, changed); } else if (!WARN_ON(!link)) { ieee80211_link_info_change_notify(sdata, link, changed); changed = BSS_CHANGED_ASSOC | BSS_CHANGED_IDLE | BSS_CHANGED_PS | BSS_CHANGED_ARP_FILTER; ieee80211_vif_cfg_change_notify(sdata, changed); } break; case NL80211_IFTYPE_OCB: changed |= BSS_CHANGED_OCB; ieee80211_bss_info_change_notify(sdata, changed); break; case NL80211_IFTYPE_ADHOC: changed |= BSS_CHANGED_IBSS; fallthrough; case NL80211_IFTYPE_AP: changed |= BSS_CHANGED_P2P_PS; if (ieee80211_vif_is_mld(&sdata->vif)) ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_SSID); else changed |= BSS_CHANGED_SSID; if (sdata->vif.bss_conf.ftm_responder == 1 && wiphy_ext_feature_isset(sdata->local->hw.wiphy, NL80211_EXT_FEATURE_ENABLE_FTM_RESPONDER)) changed |= BSS_CHANGED_FTM_RESPONDER; if (sdata->vif.type == NL80211_IFTYPE_AP) { changed |= BSS_CHANGED_AP_PROBE_RESP; if (ieee80211_vif_is_mld(&sdata->vif)) { ieee80211_reconfig_ap_links(local, sdata, changed); break; } if (rcu_access_pointer(sdata->deflink.u.ap.beacon)) drv_start_ap(local, sdata, sdata->deflink.conf); } fallthrough; case NL80211_IFTYPE_MESH_POINT: if (sdata->vif.bss_conf.enable_beacon) { changed |= BSS_CHANGED_BEACON | BSS_CHANGED_BEACON_ENABLED; ieee80211_bss_info_change_notify(sdata, changed); } break; case NL80211_IFTYPE_NAN: res = ieee80211_reconfig_nan(sdata); if (res < 0) { ieee80211_handle_reconfig_failure(local); return res; } break; case NL80211_IFTYPE_NAN_DATA: case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_P2P_DEVICE: /* nothing to do */ break; case NL80211_IFTYPE_UNSPECIFIED: case NUM_NL80211_IFTYPES: case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_WDS: WARN_ON(1); break; } } ieee80211_recalc_ps(local); /* * The sta might be in psm against the ap (e.g. because * this was the state before a hw restart), so we * explicitly send a null packet in order to make sure * it'll sync against the ap (and get out of psm). */ if (!(local->hw.conf.flags & IEEE80211_CONF_PS)) { list_for_each_entry(sdata, &local->interfaces, list) { if (sdata->vif.type != NL80211_IFTYPE_STATION) continue; if (!sdata->u.mgd.associated) continue; ieee80211_send_nullfunc(local, sdata, false); } } /* APs are now beaconing, add back stations */ list_for_each_entry(sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(sdata)) continue; switch (sdata->vif.type) { case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_AP: ieee80211_reconfig_stations(sdata); break; default: break; } } /* add back keys */ list_for_each_entry(sdata, &local->interfaces, list) ieee80211_reenable_keys(sdata); /* re-enable multi-link for client interfaces */ list_for_each_entry(sdata, &local->interfaces, list) { if (sdata->restart_active_links) ieee80211_set_active_links(&sdata->vif, sdata->restart_active_links); /* * If a link switch was scheduled before the restart, and ran * before reconfig, it will do nothing, so re-schedule. */ if (sdata->desired_active_links) wiphy_work_queue(sdata->local->hw.wiphy, &sdata->activate_links_work); } /* Reconfigure sched scan if it was interrupted by FW restart */ sched_scan_sdata = rcu_dereference_protected(local->sched_scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)); sched_scan_req = rcu_dereference_protected(local->sched_scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); if (sched_scan_sdata && sched_scan_req) /* * Sched scan stopped, but we don't want to report it. Instead, * we're trying to reschedule. However, if more than one scan * plan was set, we cannot reschedule since we don't know which * scan plan was currently running (and some scan plans may have * already finished). */ if (sched_scan_req->n_scan_plans > 1 || __ieee80211_request_sched_scan_start(sched_scan_sdata, sched_scan_req)) { RCU_INIT_POINTER(local->sched_scan_sdata, NULL); RCU_INIT_POINTER(local->sched_scan_req, NULL); sched_scan_stopped = true; } if (sched_scan_stopped) cfg80211_sched_scan_stopped_locked(local->hw.wiphy, 0); wake_up: /* * Clear the WLAN_STA_BLOCK_BA flag so new aggregation * sessions can be established after a resume. * * Also tear down aggregation sessions since reconfiguring * them in a hardware restart scenario is not easily done * right now, and the hardware will have lost information * about the sessions, but we and the AP still think they * are active. This is really a workaround though. */ if (ieee80211_hw_check(hw, AMPDU_AGGREGATION)) { list_for_each_entry(sta, &local->sta_list, list) { if (!local->resuming) ieee80211_sta_tear_down_BA_sessions( sta, AGG_STOP_LOCAL_REQUEST); clear_sta_flag(sta, WLAN_STA_BLOCK_BA); } } /* * If this is for hw restart things are still running. * We may want to change that later, however. */ if (local->open_count && (!suspended || reconfig_due_to_wowlan)) drv_reconfig_complete(local, IEEE80211_RECONFIG_TYPE_RESTART); if (local->in_reconfig) { in_reconfig = local->in_reconfig; local->in_reconfig = false; barrier(); ieee80211_reconfig_roc(local); /* Requeue all works */ list_for_each_entry(sdata, &local->interfaces, list) { if (ieee80211_sdata_running(sdata)) wiphy_work_queue(local->hw.wiphy, &sdata->work); } } ieee80211_wake_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_SUSPEND, false); if (in_reconfig) { list_for_each_entry(sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(sdata)) continue; if (sdata->vif.type == NL80211_IFTYPE_STATION) ieee80211_sta_restart(sdata); } } /* Passing NULL means an interface is picked for configuration */ if (local->virt_monitors > 0 && local->virt_monitors == local->open_count) ieee80211_add_virtual_monitor(local, NULL); if (!suspended) return 0; #ifdef CONFIG_PM /* first set suspended false, then resuming */ local->suspended = false; mb(); local->resuming = false; ieee80211_flush_completed_scan(local, false); if (local->open_count && !reconfig_due_to_wowlan) drv_reconfig_complete(local, IEEE80211_RECONFIG_TYPE_SUSPEND); list_for_each_entry(sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(sdata)) continue; if (sdata->vif.type == NL80211_IFTYPE_STATION) ieee80211_sta_restart(sdata); } mod_timer(&local->sta_cleanup, jiffies + 1); #else WARN_ON(1); #endif return 0; } static void ieee80211_reconfig_disconnect(struct ieee80211_vif *vif, u8 flag) { struct ieee80211_sub_if_data *sdata; struct ieee80211_local *local; struct ieee80211_key *key; if (WARN_ON(!vif)) return; sdata = vif_to_sdata(vif); local = sdata->local; lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(flag & IEEE80211_SDATA_DISCONNECT_RESUME && !local->resuming)) return; if (WARN_ON(flag & IEEE80211_SDATA_DISCONNECT_HW_RESTART && !local->in_reconfig)) return; if (WARN_ON(vif->type != NL80211_IFTYPE_STATION)) return; sdata->flags |= flag; list_for_each_entry(key, &sdata->key_list, list) key->flags |= KEY_FLAG_TAINTED; } void ieee80211_hw_restart_disconnect(struct ieee80211_vif *vif) { ieee80211_reconfig_disconnect(vif, IEEE80211_SDATA_DISCONNECT_HW_RESTART); } EXPORT_SYMBOL_GPL(ieee80211_hw_restart_disconnect); void ieee80211_resume_disconnect(struct ieee80211_vif *vif) { ieee80211_reconfig_disconnect(vif, IEEE80211_SDATA_DISCONNECT_RESUME); } EXPORT_SYMBOL_GPL(ieee80211_resume_disconnect); void ieee80211_recalc_smps(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link) { struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_chanctx *chanctx; lockdep_assert_wiphy(local->hw.wiphy); chanctx_conf = rcu_dereference_protected(link->conf->chanctx_conf, lockdep_is_held(&local->hw.wiphy->mtx)); /* * This function can be called from a work, thus it may be possible * that the chanctx_conf is removed (due to a disconnection, for * example). * So nothing should be done in such case. */ if (!chanctx_conf) return; chanctx = container_of(chanctx_conf, struct ieee80211_chanctx, conf); ieee80211_recalc_smps_chanctx(local, chanctx); } void ieee80211_recalc_min_chandef(struct ieee80211_sub_if_data *sdata, int link_id) { struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_chanctx *chanctx; int i; lockdep_assert_wiphy(local->hw.wiphy); for (i = 0; i < ARRAY_SIZE(sdata->vif.link_conf); i++) { struct ieee80211_bss_conf *bss_conf; if (link_id >= 0 && link_id != i) continue; rcu_read_lock(); bss_conf = rcu_dereference(sdata->vif.link_conf[i]); if (!bss_conf) { rcu_read_unlock(); continue; } chanctx_conf = rcu_dereference_protected(bss_conf->chanctx_conf, lockdep_is_held(&local->hw.wiphy->mtx)); /* * Since we hold the wiphy mutex (checked above) * we can take the chanctx_conf pointer out of the * RCU critical section, it cannot go away without * the mutex. Just the way we reached it could - in * theory - go away, but we don't really care and * it really shouldn't happen anyway. */ rcu_read_unlock(); if (!chanctx_conf) return; chanctx = container_of(chanctx_conf, struct ieee80211_chanctx, conf); ieee80211_recalc_chanctx_min_def(local, chanctx); } } size_t ieee80211_ie_split_vendor(const u8 *ies, size_t ielen, size_t offset) { size_t pos = offset; while (pos < ielen && ies[pos] != WLAN_EID_VENDOR_SPECIFIC) pos += 2 + ies[pos + 1]; return pos; } u8 *ieee80211_ie_build_ht_cap(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, u16 cap) { __le16 tmp; *pos++ = WLAN_EID_HT_CAPABILITY; *pos++ = sizeof(struct ieee80211_ht_cap); memset(pos, 0, sizeof(struct ieee80211_ht_cap)); /* capability flags */ tmp = cpu_to_le16(cap); memcpy(pos, &tmp, sizeof(u16)); pos += sizeof(u16); /* AMPDU parameters */ *pos++ = ht_cap->ampdu_factor | (ht_cap->ampdu_density << IEEE80211_HT_AMPDU_PARM_DENSITY_SHIFT); /* MCS set */ memcpy(pos, &ht_cap->mcs, sizeof(ht_cap->mcs)); pos += sizeof(ht_cap->mcs); /* extended capabilities */ pos += sizeof(__le16); /* BF capabilities */ pos += sizeof(__le32); /* antenna selection */ pos += sizeof(u8); return pos; } u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, u32 cap) { __le32 tmp; *pos++ = WLAN_EID_VHT_CAPABILITY; *pos++ = sizeof(struct ieee80211_vht_cap); memset(pos, 0, sizeof(struct ieee80211_vht_cap)); /* capability flags */ tmp = cpu_to_le32(cap); memcpy(pos, &tmp, sizeof(u32)); pos += sizeof(u32); /* VHT MCS set */ memcpy(pos, &vht_cap->vht_mcs, sizeof(vht_cap->vht_mcs)); pos += sizeof(vht_cap->vht_mcs); return pos; } /* this may return more than ieee80211_put_he_6ghz_cap() will need */ u8 ieee80211_ie_len_he_cap(struct ieee80211_sub_if_data *sdata) { const struct ieee80211_sta_he_cap *he_cap; struct ieee80211_supported_band *sband; u8 n; sband = ieee80211_get_sband(sdata); if (!sband) return 0; he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); if (!he_cap) return 0; n = ieee80211_he_mcs_nss_size(&he_cap->he_cap_elem); return 2 + 1 + sizeof(he_cap->he_cap_elem) + n + ieee80211_he_ppe_size(he_cap->ppe_thres[0], he_cap->he_cap_elem.phy_cap_info); } static void ieee80211_get_adjusted_he_cap(const struct ieee80211_conn_settings *conn, const struct ieee80211_sta_he_cap *he_cap, struct ieee80211_he_cap_elem *elem) { u8 ru_limit, max_ru; *elem = he_cap->he_cap_elem; switch (conn->bw_limit) { case IEEE80211_CONN_BW_LIMIT_20: ru_limit = IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_242; break; case IEEE80211_CONN_BW_LIMIT_40: ru_limit = IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_484; break; case IEEE80211_CONN_BW_LIMIT_80: ru_limit = IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_996; break; default: ru_limit = IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_2x996; break; } max_ru = elem->phy_cap_info[8] & IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_MASK; max_ru = min(max_ru, ru_limit); elem->phy_cap_info[8] &= ~IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_MASK; elem->phy_cap_info[8] |= max_ru; if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_40) { elem->phy_cap_info[0] &= ~(IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G); elem->phy_cap_info[9] &= ~IEEE80211_HE_PHY_CAP9_LONGER_THAN_16_SIGB_OFDM_SYM; } if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_160) { elem->phy_cap_info[0] &= ~(IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G); elem->phy_cap_info[5] &= ~IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_MASK; elem->phy_cap_info[7] &= ~(IEEE80211_HE_PHY_CAP7_STBC_TX_ABOVE_80MHZ | IEEE80211_HE_PHY_CAP7_STBC_RX_ABOVE_80MHZ); } } int ieee80211_put_he_cap(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata, const struct ieee80211_supported_band *sband, const struct ieee80211_conn_settings *conn) { const struct ieee80211_sta_he_cap *he_cap; struct ieee80211_he_cap_elem elem; u8 *len; u8 n; u8 ie_len; if (!conn) conn = &ieee80211_conn_settings_unlimited; he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); if (!he_cap) return 0; /* modify on stack first to calculate 'n' and 'ie_len' correctly */ ieee80211_get_adjusted_he_cap(conn, he_cap, &elem); n = ieee80211_he_mcs_nss_size(&elem); ie_len = 2 + 1 + sizeof(he_cap->he_cap_elem) + n + ieee80211_he_ppe_size(he_cap->ppe_thres[0], he_cap->he_cap_elem.phy_cap_info); if (skb_tailroom(skb) < ie_len) return -ENOBUFS; skb_put_u8(skb, WLAN_EID_EXTENSION); len = skb_put(skb, 1); /* We'll set the size later below */ skb_put_u8(skb, WLAN_EID_EXT_HE_CAPABILITY); /* Fixed data */ skb_put_data(skb, &elem, sizeof(elem)); skb_put_data(skb, &he_cap->he_mcs_nss_supp, n); /* Check if PPE Threshold should be present */ if ((he_cap->he_cap_elem.phy_cap_info[6] & IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) == 0) goto end; /* * Calculate how many PPET16/PPET8 pairs are to come. Algorithm: * (NSS_M1 + 1) x (num of 1 bits in RU_INDEX_BITMASK) */ n = hweight8(he_cap->ppe_thres[0] & IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK); n *= (1 + ((he_cap->ppe_thres[0] & IEEE80211_PPE_THRES_NSS_MASK) >> IEEE80211_PPE_THRES_NSS_POS)); /* * Each pair is 6 bits, and we need to add the 7 "header" bits to the * total size. */ n = (n * IEEE80211_PPE_THRES_INFO_PPET_SIZE * 2) + 7; n = DIV_ROUND_UP(n, 8); /* Copy PPE Thresholds */ skb_put_data(skb, &he_cap->ppe_thres, n); end: *len = skb_tail_pointer(skb) - len - 1; return 0; } int ieee80211_put_reg_conn(struct sk_buff *skb, enum ieee80211_channel_flags flags) { u8 reg_conn = IEEE80211_REG_CONN_LPI_VALID | IEEE80211_REG_CONN_LPI_VALUE | IEEE80211_REG_CONN_SP_VALID; if (!(flags & IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT)) reg_conn |= IEEE80211_REG_CONN_SP_VALUE; skb_put_u8(skb, WLAN_EID_EXTENSION); skb_put_u8(skb, 1 + sizeof(reg_conn)); skb_put_u8(skb, WLAN_EID_EXT_NON_AP_STA_REG_CON); skb_put_u8(skb, reg_conn); return 0; } int ieee80211_put_he_6ghz_cap(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata, enum ieee80211_smps_mode smps_mode) { struct ieee80211_supported_band *sband; const struct ieee80211_sband_iftype_data *iftd; enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif); __le16 cap; if (!cfg80211_any_usable_channels(sdata->local->hw.wiphy, BIT(NL80211_BAND_6GHZ), IEEE80211_CHAN_NO_HE)) return 0; sband = sdata->local->hw.wiphy->bands[NL80211_BAND_6GHZ]; iftd = ieee80211_get_sband_iftype_data(sband, iftype); if (!iftd) return 0; /* Check for device HE 6 GHz capability before adding element */ if (!iftd->he_6ghz_capa.capa) return 0; cap = iftd->he_6ghz_capa.capa; cap &= cpu_to_le16(~IEEE80211_HE_6GHZ_CAP_SM_PS); switch (smps_mode) { case IEEE80211_SMPS_AUTOMATIC: case IEEE80211_SMPS_NUM_MODES: WARN_ON(1); fallthrough; case IEEE80211_SMPS_OFF: cap |= le16_encode_bits(WLAN_HT_CAP_SM_PS_DISABLED, IEEE80211_HE_6GHZ_CAP_SM_PS); break; case IEEE80211_SMPS_STATIC: cap |= le16_encode_bits(WLAN_HT_CAP_SM_PS_STATIC, IEEE80211_HE_6GHZ_CAP_SM_PS); break; case IEEE80211_SMPS_DYNAMIC: cap |= le16_encode_bits(WLAN_HT_CAP_SM_PS_DYNAMIC, IEEE80211_HE_6GHZ_CAP_SM_PS); break; } if (skb_tailroom(skb) < 2 + 1 + sizeof(cap)) return -ENOBUFS; skb_put_u8(skb, WLAN_EID_EXTENSION); skb_put_u8(skb, 1 + sizeof(cap)); skb_put_u8(skb, WLAN_EID_EXT_HE_6GHZ_CAPA); skb_put_data(skb, &cap, sizeof(cap)); return 0; } u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, const struct cfg80211_chan_def *chandef, u16 prot_mode, bool rifs_mode) { struct ieee80211_ht_operation *ht_oper; /* Build HT Information */ *pos++ = WLAN_EID_HT_OPERATION; *pos++ = sizeof(struct ieee80211_ht_operation); ht_oper = (struct ieee80211_ht_operation *)pos; ht_oper->primary_chan = ieee80211_frequency_to_channel( chandef->chan->center_freq); switch (chandef->width) { case NL80211_CHAN_WIDTH_160: case NL80211_CHAN_WIDTH_80P80: case NL80211_CHAN_WIDTH_80: case NL80211_CHAN_WIDTH_40: if (chandef->center_freq1 > chandef->chan->center_freq) ht_oper->ht_param = IEEE80211_HT_PARAM_CHA_SEC_ABOVE; else ht_oper->ht_param = IEEE80211_HT_PARAM_CHA_SEC_BELOW; break; case NL80211_CHAN_WIDTH_320: /* HT information element should not be included on 6GHz */ WARN_ON(1); return pos; default: ht_oper->ht_param = IEEE80211_HT_PARAM_CHA_SEC_NONE; break; } if (ht_cap->cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 && chandef->width != NL80211_CHAN_WIDTH_20_NOHT && chandef->width != NL80211_CHAN_WIDTH_20) ht_oper->ht_param |= IEEE80211_HT_PARAM_CHAN_WIDTH_ANY; if (rifs_mode) ht_oper->ht_param |= IEEE80211_HT_PARAM_RIFS_MODE; ht_oper->operation_mode = cpu_to_le16(prot_mode); ht_oper->stbc_param = 0x0000; /* It seems that Basic MCS set and Supported MCS set are identical for the first 10 bytes */ memset(&ht_oper->basic_set, 0, 16); memcpy(&ht_oper->basic_set, &ht_cap->mcs, 10); return pos + sizeof(struct ieee80211_ht_operation); } void ieee80211_ie_build_wide_bw_cs(u8 *pos, const struct cfg80211_chan_def *chandef) { *pos++ = WLAN_EID_WIDE_BW_CHANNEL_SWITCH; /* EID */ *pos++ = 3; /* IE length */ /* New channel width */ switch (chandef->width) { case NL80211_CHAN_WIDTH_80: *pos++ = IEEE80211_VHT_CHANWIDTH_80MHZ; break; case NL80211_CHAN_WIDTH_160: *pos++ = IEEE80211_VHT_CHANWIDTH_160MHZ; break; case NL80211_CHAN_WIDTH_80P80: *pos++ = IEEE80211_VHT_CHANWIDTH_80P80MHZ; break; case NL80211_CHAN_WIDTH_320: /* The behavior is not defined for 320 MHz channels */ WARN_ON(1); fallthrough; default: *pos++ = IEEE80211_VHT_CHANWIDTH_USE_HT; } /* new center frequency segment 0 */ *pos++ = ieee80211_frequency_to_channel(chandef->center_freq1); /* new center frequency segment 1 */ if (chandef->center_freq2) *pos++ = ieee80211_frequency_to_channel(chandef->center_freq2); else *pos++ = 0; } u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, const struct cfg80211_chan_def *chandef) { struct ieee80211_vht_operation *vht_oper; *pos++ = WLAN_EID_VHT_OPERATION; *pos++ = sizeof(struct ieee80211_vht_operation); vht_oper = (struct ieee80211_vht_operation *)pos; vht_oper->center_freq_seg0_idx = ieee80211_frequency_to_channel( chandef->center_freq1); if (chandef->center_freq2) vht_oper->center_freq_seg1_idx = ieee80211_frequency_to_channel(chandef->center_freq2); else vht_oper->center_freq_seg1_idx = 0x00; switch (chandef->width) { case NL80211_CHAN_WIDTH_160: /* * Convert 160 MHz channel width to new style as interop * workaround. */ vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ; vht_oper->center_freq_seg1_idx = vht_oper->center_freq_seg0_idx; if (chandef->chan->center_freq < chandef->center_freq1) vht_oper->center_freq_seg0_idx -= 8; else vht_oper->center_freq_seg0_idx += 8; break; case NL80211_CHAN_WIDTH_80P80: /* * Convert 80+80 MHz channel width to new style as interop * workaround. */ vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ; break; case NL80211_CHAN_WIDTH_80: vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ; break; case NL80211_CHAN_WIDTH_320: /* VHT information element should not be included on 6GHz */ WARN_ON(1); return pos; default: vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_USE_HT; break; } /* don't require special VHT peer rates */ vht_oper->basic_mcs_set = cpu_to_le16(0xffff); return pos + sizeof(struct ieee80211_vht_operation); } u8 *ieee80211_ie_build_he_oper(u8 *pos, const struct cfg80211_chan_def *chandef) { struct ieee80211_he_operation *he_oper; struct ieee80211_he_6ghz_oper *he_6ghz_op; struct cfg80211_chan_def he_chandef; u32 he_oper_params; u8 ie_len = 1 + sizeof(struct ieee80211_he_operation); if (chandef->chan->band == NL80211_BAND_6GHZ) ie_len += sizeof(struct ieee80211_he_6ghz_oper); *pos++ = WLAN_EID_EXTENSION; *pos++ = ie_len; *pos++ = WLAN_EID_EXT_HE_OPERATION; he_oper_params = 0; he_oper_params |= u32_encode_bits(1023, /* disabled */ IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK); he_oper_params |= u32_encode_bits(1, IEEE80211_HE_OPERATION_ER_SU_DISABLE); he_oper_params |= u32_encode_bits(1, IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED); if (chandef->chan->band == NL80211_BAND_6GHZ) he_oper_params |= u32_encode_bits(1, IEEE80211_HE_OPERATION_6GHZ_OP_INFO); he_oper = (struct ieee80211_he_operation *)pos; he_oper->he_oper_params = cpu_to_le32(he_oper_params); /* don't require special HE peer rates */ he_oper->he_mcs_nss_set = cpu_to_le16(0xffff); pos += sizeof(struct ieee80211_he_operation); if (chandef->chan->band != NL80211_BAND_6GHZ) goto out; cfg80211_chandef_create(&he_chandef, chandef->chan, NL80211_CHAN_NO_HT); he_chandef.center_freq1 = chandef->center_freq1; he_chandef.center_freq2 = chandef->center_freq2; he_chandef.width = chandef->width; /* TODO add VHT operational */ he_6ghz_op = (struct ieee80211_he_6ghz_oper *)pos; he_6ghz_op->minrate = 6; /* 6 Mbps */ he_6ghz_op->primary = ieee80211_frequency_to_channel(he_chandef.chan->center_freq); he_6ghz_op->ccfs0 = ieee80211_frequency_to_channel(he_chandef.center_freq1); if (he_chandef.center_freq2) he_6ghz_op->ccfs1 = ieee80211_frequency_to_channel(he_chandef.center_freq2); else he_6ghz_op->ccfs1 = 0; switch (he_chandef.width) { case NL80211_CHAN_WIDTH_320: /* Downgrade EHT 320 MHz BW to 160 MHz for HE and set new * center_freq1 */ ieee80211_chandef_downgrade(&he_chandef, NULL); he_6ghz_op->ccfs0 = ieee80211_frequency_to_channel(he_chandef.center_freq1); fallthrough; case NL80211_CHAN_WIDTH_160: /* Convert 160 MHz channel width to new style as interop * workaround. */ he_6ghz_op->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ; he_6ghz_op->ccfs1 = he_6ghz_op->ccfs0; if (he_chandef.chan->center_freq < he_chandef.center_freq1) he_6ghz_op->ccfs0 -= 8; else he_6ghz_op->ccfs0 += 8; fallthrough; case NL80211_CHAN_WIDTH_80P80: he_6ghz_op->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ; break; case NL80211_CHAN_WIDTH_80: he_6ghz_op->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ; break; case NL80211_CHAN_WIDTH_40: he_6ghz_op->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_40MHZ; break; default: he_6ghz_op->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ; break; } pos += sizeof(struct ieee80211_he_6ghz_oper); out: return pos; } u8 *ieee80211_ie_build_eht_oper(u8 *pos, const struct cfg80211_chan_def *chandef, const struct ieee80211_sta_eht_cap *eht_cap) { const struct ieee80211_eht_mcs_nss_supp_20mhz_only *eht_mcs_nss = &eht_cap->eht_mcs_nss_supp.only_20mhz; struct ieee80211_eht_operation *eht_oper; struct ieee80211_eht_operation_info *eht_oper_info; u8 eht_oper_len = offsetof(struct ieee80211_eht_operation, optional); u8 eht_oper_info_len = offsetof(struct ieee80211_eht_operation_info, optional); u8 chan_width = 0; *pos++ = WLAN_EID_EXTENSION; *pos++ = 1 + eht_oper_len + eht_oper_info_len; *pos++ = WLAN_EID_EXT_EHT_OPERATION; eht_oper = (struct ieee80211_eht_operation *)pos; memcpy(&eht_oper->basic_mcs_nss, eht_mcs_nss, sizeof(*eht_mcs_nss)); eht_oper->params |= IEEE80211_EHT_OPER_INFO_PRESENT; pos += eht_oper_len; eht_oper_info = (struct ieee80211_eht_operation_info *)eht_oper->optional; eht_oper_info->ccfs0 = ieee80211_frequency_to_channel(chandef->center_freq1); if (chandef->center_freq2) eht_oper_info->ccfs1 = ieee80211_frequency_to_channel(chandef->center_freq2); else eht_oper_info->ccfs1 = 0; switch (chandef->width) { case NL80211_CHAN_WIDTH_320: chan_width = IEEE80211_EHT_OPER_CHAN_WIDTH_320MHZ; eht_oper_info->ccfs1 = eht_oper_info->ccfs0; if (chandef->chan->center_freq < chandef->center_freq1) eht_oper_info->ccfs0 -= 16; else eht_oper_info->ccfs0 += 16; break; case NL80211_CHAN_WIDTH_160: eht_oper_info->ccfs1 = eht_oper_info->ccfs0; if (chandef->chan->center_freq < chandef->center_freq1) eht_oper_info->ccfs0 -= 8; else eht_oper_info->ccfs0 += 8; fallthrough; case NL80211_CHAN_WIDTH_80P80: chan_width = IEEE80211_EHT_OPER_CHAN_WIDTH_160MHZ; break; case NL80211_CHAN_WIDTH_80: chan_width = IEEE80211_EHT_OPER_CHAN_WIDTH_80MHZ; break; case NL80211_CHAN_WIDTH_40: chan_width = IEEE80211_EHT_OPER_CHAN_WIDTH_40MHZ; break; default: chan_width = IEEE80211_EHT_OPER_CHAN_WIDTH_20MHZ; break; } eht_oper_info->control = chan_width; pos += eht_oper_info_len; /* TODO: eht_oper_info->optional */ return pos; } bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper, struct cfg80211_chan_def *chandef) { enum nl80211_channel_type channel_type; if (!ht_oper) return false; switch (ht_oper->ht_param & IEEE80211_HT_PARAM_CHA_SEC_OFFSET) { case IEEE80211_HT_PARAM_CHA_SEC_NONE: channel_type = NL80211_CHAN_HT20; break; case IEEE80211_HT_PARAM_CHA_SEC_ABOVE: channel_type = NL80211_CHAN_HT40PLUS; break; case IEEE80211_HT_PARAM_CHA_SEC_BELOW: channel_type = NL80211_CHAN_HT40MINUS; break; default: return false; } cfg80211_chandef_create(chandef, chandef->chan, channel_type); return true; } bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap_info, const struct ieee80211_vht_operation *oper, const struct ieee80211_ht_operation *htop, struct cfg80211_chan_def *chandef) { struct cfg80211_chan_def new = *chandef; int cf0, cf1; int ccfs0, ccfs1, ccfs2; int ccf0, ccf1; u32 vht_cap; bool support_80_80 = false; bool support_160 = false; u8 ext_nss_bw_supp = u32_get_bits(vht_cap_info, IEEE80211_VHT_CAP_EXT_NSS_BW_MASK); u8 supp_chwidth = u32_get_bits(vht_cap_info, IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK); if (!oper || !htop) return false; vht_cap = hw->wiphy->bands[chandef->chan->band]->vht_cap.cap; support_160 = (vht_cap & (IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK | IEEE80211_VHT_CAP_EXT_NSS_BW_MASK)); support_80_80 = ((vht_cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) || (vht_cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ && vht_cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) || ((vht_cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) >> IEEE80211_VHT_CAP_EXT_NSS_BW_SHIFT > 1)); ccfs0 = oper->center_freq_seg0_idx; ccfs1 = oper->center_freq_seg1_idx; ccfs2 = (le16_to_cpu(htop->operation_mode) & IEEE80211_HT_OP_MODE_CCFS2_MASK) >> IEEE80211_HT_OP_MODE_CCFS2_SHIFT; ccf0 = ccfs0; /* if not supported, parse as though we didn't understand it */ if (!ieee80211_hw_check(hw, SUPPORTS_VHT_EXT_NSS_BW)) ext_nss_bw_supp = 0; /* * Cf. IEEE 802.11 Table 9-250 * * We really just consider that because it's inefficient to connect * at a higher bandwidth than we'll actually be able to use. */ switch ((supp_chwidth << 4) | ext_nss_bw_supp) { default: case 0x00: ccf1 = 0; support_160 = false; support_80_80 = false; break; case 0x01: support_80_80 = false; fallthrough; case 0x02: case 0x03: ccf1 = ccfs2; break; case 0x10: ccf1 = ccfs1; break; case 0x11: case 0x12: if (!ccfs1) ccf1 = ccfs2; else ccf1 = ccfs1; break; case 0x13: case 0x20: case 0x23: ccf1 = ccfs1; break; } cf0 = ieee80211_channel_to_frequency(ccf0, chandef->chan->band); cf1 = ieee80211_channel_to_frequency(ccf1, chandef->chan->band); switch (oper->chan_width) { case IEEE80211_VHT_CHANWIDTH_USE_HT: /* just use HT information directly */ break; case IEEE80211_VHT_CHANWIDTH_80MHZ: new.width = NL80211_CHAN_WIDTH_80; new.center_freq1 = cf0; /* If needed, adjust based on the newer interop workaround. */ if (ccf1) { unsigned int diff; diff = abs(ccf1 - ccf0); if ((diff == 8) && support_160) { new.width = NL80211_CHAN_WIDTH_160; new.center_freq1 = cf1; } else if ((diff > 8) && support_80_80) { new.width = NL80211_CHAN_WIDTH_80P80; new.center_freq2 = cf1; } } break; case IEEE80211_VHT_CHANWIDTH_160MHZ: /* deprecated encoding */ new.width = NL80211_CHAN_WIDTH_160; new.center_freq1 = cf0; break; case IEEE80211_VHT_CHANWIDTH_80P80MHZ: /* deprecated encoding */ new.width = NL80211_CHAN_WIDTH_80P80; new.center_freq1 = cf0; new.center_freq2 = cf1; break; default: return false; } if (!cfg80211_chandef_valid(&new)) return false; *chandef = new; return true; } void ieee80211_chandef_eht_oper(const struct ieee80211_eht_operation_info *info, struct cfg80211_chan_def *chandef) { chandef->center_freq1 = ieee80211_channel_to_frequency(info->ccfs0, chandef->chan->band); switch (u8_get_bits(info->control, IEEE80211_EHT_OPER_CHAN_WIDTH)) { case IEEE80211_EHT_OPER_CHAN_WIDTH_20MHZ: chandef->width = NL80211_CHAN_WIDTH_20; break; case IEEE80211_EHT_OPER_CHAN_WIDTH_40MHZ: chandef->width = NL80211_CHAN_WIDTH_40; break; case IEEE80211_EHT_OPER_CHAN_WIDTH_80MHZ: chandef->width = NL80211_CHAN_WIDTH_80; break; case IEEE80211_EHT_OPER_CHAN_WIDTH_160MHZ: chandef->width = NL80211_CHAN_WIDTH_160; chandef->center_freq1 = ieee80211_channel_to_frequency(info->ccfs1, chandef->chan->band); break; case IEEE80211_EHT_OPER_CHAN_WIDTH_320MHZ: chandef->width = NL80211_CHAN_WIDTH_320; chandef->center_freq1 = ieee80211_channel_to_frequency(info->ccfs1, chandef->chan->band); break; } } bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_local *local, const struct ieee80211_he_operation *he_oper, const struct ieee80211_eht_operation *eht_oper, struct cfg80211_chan_def *chandef) { struct cfg80211_chan_def he_chandef = *chandef; const struct ieee80211_he_6ghz_oper *he_6ghz_oper; u32 freq; if (chandef->chan->band != NL80211_BAND_6GHZ) return true; if (!he_oper) return false; he_6ghz_oper = ieee80211_he_6ghz_oper(he_oper); if (!he_6ghz_oper) return false; /* * The EHT operation IE does not contain the primary channel so the * primary channel frequency should be taken from the 6 GHz operation * information. */ freq = ieee80211_channel_to_frequency(he_6ghz_oper->primary, NL80211_BAND_6GHZ); he_chandef.chan = ieee80211_get_channel(local->hw.wiphy, freq); if (!he_chandef.chan) return false; if (!eht_oper || !(eht_oper->params & IEEE80211_EHT_OPER_INFO_PRESENT)) { switch (u8_get_bits(he_6ghz_oper->control, IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH)) { case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ: he_chandef.width = NL80211_CHAN_WIDTH_20; break; case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_40MHZ: he_chandef.width = NL80211_CHAN_WIDTH_40; break; case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ: he_chandef.width = NL80211_CHAN_WIDTH_80; break; case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ: he_chandef.width = NL80211_CHAN_WIDTH_80; if (!he_6ghz_oper->ccfs1) break; if (abs(he_6ghz_oper->ccfs1 - he_6ghz_oper->ccfs0) == 8) he_chandef.width = NL80211_CHAN_WIDTH_160; else he_chandef.width = NL80211_CHAN_WIDTH_80P80; break; } if (he_chandef.width == NL80211_CHAN_WIDTH_160) { he_chandef.center_freq1 = ieee80211_channel_to_frequency(he_6ghz_oper->ccfs1, NL80211_BAND_6GHZ); } else { he_chandef.center_freq1 = ieee80211_channel_to_frequency(he_6ghz_oper->ccfs0, NL80211_BAND_6GHZ); he_chandef.center_freq2 = ieee80211_channel_to_frequency(he_6ghz_oper->ccfs1, NL80211_BAND_6GHZ); } } else { ieee80211_chandef_eht_oper((const void *)eht_oper->optional, &he_chandef); he_chandef.punctured = ieee80211_eht_oper_dis_subchan_bitmap(eht_oper); } if (!cfg80211_chandef_valid(&he_chandef)) return false; *chandef = he_chandef; return true; } bool ieee80211_chandef_s1g_oper(struct ieee80211_local *local, const struct ieee80211_s1g_oper_ie *oper, struct cfg80211_chan_def *chandef) { u32 oper_khz, pri_1mhz_khz, pri_2mhz_khz; if (!oper) return false; switch (FIELD_GET(S1G_OPER_CH_WIDTH_OPER, oper->ch_width)) { case IEEE80211_S1G_CHANWIDTH_1MHZ: chandef->width = NL80211_CHAN_WIDTH_1; break; case IEEE80211_S1G_CHANWIDTH_2MHZ: chandef->width = NL80211_CHAN_WIDTH_2; break; case IEEE80211_S1G_CHANWIDTH_4MHZ: chandef->width = NL80211_CHAN_WIDTH_4; break; case IEEE80211_S1G_CHANWIDTH_8MHZ: chandef->width = NL80211_CHAN_WIDTH_8; break; case IEEE80211_S1G_CHANWIDTH_16MHZ: chandef->width = NL80211_CHAN_WIDTH_16; break; default: return false; } chandef->s1g_primary_2mhz = false; switch (u8_get_bits(oper->ch_width, S1G_OPER_CH_WIDTH_PRIMARY)) { case IEEE80211_S1G_PRI_CHANWIDTH_1MHZ: pri_1mhz_khz = ieee80211_channel_to_freq_khz( oper->primary_ch, NL80211_BAND_S1GHZ); break; case IEEE80211_S1G_PRI_CHANWIDTH_2MHZ: chandef->s1g_primary_2mhz = true; pri_2mhz_khz = ieee80211_channel_to_freq_khz( oper->primary_ch, NL80211_BAND_S1GHZ); if (u8_get_bits(oper->ch_width, S1G_OPER_CH_PRIMARY_LOCATION) == S1G_2M_PRIMARY_LOCATION_LOWER) pri_1mhz_khz = pri_2mhz_khz - 500; else pri_1mhz_khz = pri_2mhz_khz + 500; break; default: return false; } oper_khz = ieee80211_channel_to_freq_khz(oper->oper_ch, NL80211_BAND_S1GHZ); chandef->center_freq1 = KHZ_TO_MHZ(oper_khz); chandef->freq1_offset = oper_khz % 1000; chandef->chan = ieee80211_get_channel_khz(local->hw.wiphy, pri_1mhz_khz); return chandef->chan; } int ieee80211_put_srates_elem(struct sk_buff *skb, const struct ieee80211_supported_band *sband, u32 basic_rates, u32 masked_rates, u8 element_id) { u8 i, rates, skip; rates = 0; for (i = 0; i < sband->n_bitrates; i++) { if (masked_rates & BIT(i)) continue; rates++; } if (element_id == WLAN_EID_SUPP_RATES) { rates = min_t(u8, rates, 8); skip = 0; } else { skip = 8; if (rates <= skip) return 0; rates -= skip; } if (skb_tailroom(skb) < rates + 2) return -ENOBUFS; skb_put_u8(skb, element_id); skb_put_u8(skb, rates); for (i = 0; i < sband->n_bitrates && rates; i++) { int rate; u8 basic; if (masked_rates & BIT(i)) continue; if (skip > 0) { skip--; continue; } basic = basic_rates & BIT(i) ? 0x80 : 0; rate = DIV_ROUND_UP(sband->bitrates[i].bitrate, 5); skb_put_u8(skb, basic | (u8)rate); rates--; } WARN(rates > 0, "rates confused: rates:%d, element:%d\n", rates, element_id); return 0; } int ieee80211_ave_rssi(struct ieee80211_vif *vif, int link_id) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_link_data *link_data; if (WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION)) return 0; if (link_id < 0) link_data = &sdata->deflink; else link_data = wiphy_dereference(sdata->local->hw.wiphy, sdata->link[link_id]); if (WARN_ON_ONCE(!link_data)) return -99; return -ewma_beacon_signal_read(&link_data->u.mgd.ave_beacon_signal); } EXPORT_SYMBOL_GPL(ieee80211_ave_rssi); u8 ieee80211_mcs_to_chains(const struct ieee80211_mcs_info *mcs) { if (!mcs) return 1; /* TODO: consider rx_highest */ if (mcs->rx_mask[3]) return 4; if (mcs->rx_mask[2]) return 3; if (mcs->rx_mask[1]) return 2; return 1; } u64 ieee80211_calculate_rx_timestamp(struct ieee80211_hw *hw, struct ieee80211_rx_status *status, unsigned int mpdu_len, unsigned int mpdu_offset) { u64 ts = status->mactime; bool mactime_plcp_start; struct rate_info ri; u16 rate; u8 n_ltf; if (WARN_ON(!ieee80211_have_rx_timestamp(status))) return 0; mactime_plcp_start = (status->flag & RX_FLAG_MACTIME) == RX_FLAG_MACTIME_PLCP_START; memset(&ri, 0, sizeof(ri)); ri.bw = status->bw; /* Fill cfg80211 rate info */ switch (status->encoding) { case RX_ENC_EHT: ri.flags |= RATE_INFO_FLAGS_EHT_MCS; ri.mcs = status->rate_idx; ri.nss = status->nss; ri.eht_ru_alloc = status->eht.ru; if (status->enc_flags & RX_ENC_FLAG_SHORT_GI) ri.flags |= RATE_INFO_FLAGS_SHORT_GI; /* TODO/FIXME: is this right? handle other PPDUs */ if (mactime_plcp_start) { mpdu_offset += 2; ts += 36; } break; case RX_ENC_HE: ri.flags |= RATE_INFO_FLAGS_HE_MCS; ri.mcs = status->rate_idx; ri.nss = status->nss; ri.he_ru_alloc = status->he_ru; if (status->enc_flags & RX_ENC_FLAG_SHORT_GI) ri.flags |= RATE_INFO_FLAGS_SHORT_GI; /* * See P802.11ax_D6.0, section 27.3.4 for * VHT PPDU format. */ if (mactime_plcp_start) { mpdu_offset += 2; ts += 36; /* * TODO: * For HE MU PPDU, add the HE-SIG-B. * For HE ER PPDU, add 8us for the HE-SIG-A. * For HE TB PPDU, add 4us for the HE-STF. * Add the HE-LTF durations - variable. */ } break; case RX_ENC_HT: ri.mcs = status->rate_idx; ri.flags |= RATE_INFO_FLAGS_MCS; if (status->enc_flags & RX_ENC_FLAG_SHORT_GI) ri.flags |= RATE_INFO_FLAGS_SHORT_GI; /* * See P802.11REVmd_D3.0, section 19.3.2 for * HT PPDU format. */ if (mactime_plcp_start) { mpdu_offset += 2; if (status->enc_flags & RX_ENC_FLAG_HT_GF) ts += 24; else ts += 32; /* * Add Data HT-LTFs per streams * TODO: add Extension HT-LTFs, 4us per LTF */ n_ltf = ((ri.mcs >> 3) & 3) + 1; n_ltf = n_ltf == 3 ? 4 : n_ltf; ts += n_ltf * 4; } break; case RX_ENC_VHT: ri.flags |= RATE_INFO_FLAGS_VHT_MCS; ri.mcs = status->rate_idx; ri.nss = status->nss; if (status->enc_flags & RX_ENC_FLAG_SHORT_GI) ri.flags |= RATE_INFO_FLAGS_SHORT_GI; /* * See P802.11REVmd_D3.0, section 21.3.2 for * VHT PPDU format. */ if (mactime_plcp_start) { mpdu_offset += 2; ts += 36; /* * Add VHT-LTFs per streams */ n_ltf = (ri.nss != 1) && (ri.nss % 2) ? ri.nss + 1 : ri.nss; ts += 4 * n_ltf; } break; default: WARN_ON(1); fallthrough; case RX_ENC_LEGACY: { struct ieee80211_supported_band *sband; sband = hw->wiphy->bands[status->band]; ri.legacy = sband->bitrates[status->rate_idx].bitrate; if (mactime_plcp_start) { if (status->band == NL80211_BAND_5GHZ) { ts += 20; mpdu_offset += 2; } else if (status->enc_flags & RX_ENC_FLAG_SHORTPRE) { ts += 96; } else { ts += 192; } } break; } } rate = cfg80211_calculate_bitrate(&ri); if (WARN_ONCE(!rate, "Invalid bitrate: flags=0x%llx, idx=%d, vht_nss=%d\n", (unsigned long long)status->flag, status->rate_idx, status->nss)) return 0; /* rewind from end of MPDU */ if ((status->flag & RX_FLAG_MACTIME) == RX_FLAG_MACTIME_END) ts -= mpdu_len * 8 * 10 / rate; ts += mpdu_offset * 8 * 10 / rate; return ts; } EXPORT_SYMBOL_GPL(ieee80211_calculate_rx_timestamp); /* Cancel CAC for the interfaces under the specified @local. If @ctx is * also provided, only the interfaces using that ctx will be canceled. */ void ieee80211_dfs_cac_cancel(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { struct ieee80211_sub_if_data *sdata; struct cfg80211_chan_def chandef; struct ieee80211_link_data *link; struct ieee80211_chanctx_conf *chanctx_conf; unsigned int link_id; lockdep_assert_wiphy(local->hw.wiphy); list_for_each_entry(sdata, &local->interfaces, list) { for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { link = sdata_dereference(sdata->link[link_id], sdata); if (!link) continue; chanctx_conf = sdata_dereference(link->conf->chanctx_conf, sdata); if (ctx && &ctx->conf != chanctx_conf) continue; wiphy_hrtimer_work_cancel(local->hw.wiphy, &link->dfs_cac_timer_work); if (!sdata->wdev.links[link_id].cac_started) continue; chandef = link->conf->chanreq.oper; ieee80211_link_release_channel(link); cfg80211_cac_event(sdata->dev, &chandef, NL80211_RADAR_CAC_ABORTED, GFP_KERNEL, link_id); } } } void ieee80211_dfs_radar_detected_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, radar_detected_work); struct cfg80211_chan_def chandef; struct ieee80211_chanctx *ctx, *tmp; lockdep_assert_wiphy(local->hw.wiphy); list_for_each_entry_safe(ctx, tmp, &local->chanctx_list, list) { if (ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER) continue; if (!ctx->radar_detected) continue; ctx->radar_detected = false; chandef = ctx->conf.def; ieee80211_dfs_cac_cancel(local, ctx); cfg80211_radar_event(local->hw.wiphy, &chandef, GFP_KERNEL); } } static void ieee80211_radar_mark_chan_ctx_iterator(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *chanctx_conf, void *data) { struct ieee80211_chanctx *ctx = container_of(chanctx_conf, struct ieee80211_chanctx, conf); if (ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER) return; if (data && data != chanctx_conf) return; ctx->radar_detected = true; } void ieee80211_radar_detected(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *chanctx_conf) { struct ieee80211_local *local = hw_to_local(hw); trace_api_radar_detected(local); ieee80211_iter_chan_contexts_atomic(hw, ieee80211_radar_mark_chan_ctx_iterator, chanctx_conf); wiphy_work_queue(hw->wiphy, &local->radar_detected_work); } EXPORT_SYMBOL(ieee80211_radar_detected); void ieee80211_chandef_downgrade(struct cfg80211_chan_def *c, struct ieee80211_conn_settings *conn) { enum nl80211_chan_width new_primary_width; struct ieee80211_conn_settings _ignored = {}; /* allow passing NULL if caller doesn't care */ if (!conn) conn = &_ignored; again: /* no-HT indicates nothing to do */ new_primary_width = NL80211_CHAN_WIDTH_20_NOHT; switch (c->width) { default: case NL80211_CHAN_WIDTH_20_NOHT: WARN_ON_ONCE(1); fallthrough; case NL80211_CHAN_WIDTH_20: c->width = NL80211_CHAN_WIDTH_20_NOHT; conn->mode = IEEE80211_CONN_MODE_LEGACY; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; c->punctured = 0; break; case NL80211_CHAN_WIDTH_40: c->width = NL80211_CHAN_WIDTH_20; c->center_freq1 = c->chan->center_freq; if (conn->mode == IEEE80211_CONN_MODE_VHT) conn->mode = IEEE80211_CONN_MODE_HT; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; c->punctured = 0; break; case NL80211_CHAN_WIDTH_80: new_primary_width = NL80211_CHAN_WIDTH_40; if (conn->mode == IEEE80211_CONN_MODE_VHT) conn->mode = IEEE80211_CONN_MODE_HT; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_40; break; case NL80211_CHAN_WIDTH_80P80: c->center_freq2 = 0; c->width = NL80211_CHAN_WIDTH_80; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_80; break; case NL80211_CHAN_WIDTH_160: new_primary_width = NL80211_CHAN_WIDTH_80; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_80; break; case NL80211_CHAN_WIDTH_320: new_primary_width = NL80211_CHAN_WIDTH_160; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_160; break; case NL80211_CHAN_WIDTH_1: case NL80211_CHAN_WIDTH_2: case NL80211_CHAN_WIDTH_4: case NL80211_CHAN_WIDTH_8: case NL80211_CHAN_WIDTH_16: WARN_ON_ONCE(1); /* keep c->width */ conn->mode = IEEE80211_CONN_MODE_S1G; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; break; case NL80211_CHAN_WIDTH_5: case NL80211_CHAN_WIDTH_10: WARN_ON_ONCE(1); /* keep c->width */ conn->mode = IEEE80211_CONN_MODE_LEGACY; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; break; } if (new_primary_width != NL80211_CHAN_WIDTH_20_NOHT) { c->center_freq1 = cfg80211_chandef_primary(c, new_primary_width, &c->punctured); c->width = new_primary_width; } /* * With an 80 MHz channel, we might have the puncturing in the primary * 40 Mhz channel, but that's not valid when downgraded to 40 MHz width. * In that case, downgrade again. */ if (!cfg80211_chandef_valid(c) && c->punctured) goto again; WARN_ON_ONCE(!cfg80211_chandef_valid(c)); } int ieee80211_send_action_csa(struct ieee80211_sub_if_data *sdata, struct cfg80211_csa_settings *csa_settings) { int hdr_len = IEEE80211_MIN_ACTION_SIZE(chan_switch); struct sk_buff *skb; struct ieee80211_mgmt *mgmt; struct ieee80211_local *local = sdata->local; int freq; u8 *pos; if (sdata->vif.type != NL80211_IFTYPE_ADHOC && sdata->vif.type != NL80211_IFTYPE_MESH_POINT) return -EOPNOTSUPP; skb = dev_alloc_skb(local->tx_headroom + hdr_len + 5 + /* channel switch announcement element */ 3 + /* secondary channel offset element */ 5 + /* wide bandwidth channel switch announcement */ 8); /* mesh channel switch parameters element */ if (!skb) return -ENOMEM; skb_reserve(skb, local->tx_headroom); mgmt = skb_put_zero(skb, hdr_len); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); eth_broadcast_addr(mgmt->da); memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); if (ieee80211_vif_is_mesh(&sdata->vif)) { memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN); } else { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; memcpy(mgmt->bssid, ifibss->bssid, ETH_ALEN); } mgmt->u.action.category = WLAN_CATEGORY_SPECTRUM_MGMT; mgmt->u.action.action_code = WLAN_ACTION_SPCT_CHL_SWITCH; pos = skb_put(skb, 5); *pos++ = WLAN_EID_CHANNEL_SWITCH; /* EID */ *pos++ = 3; /* IE length */ *pos++ = csa_settings->block_tx ? 1 : 0; /* CSA mode */ freq = csa_settings->chandef.chan->center_freq; *pos++ = ieee80211_frequency_to_channel(freq); /* channel */ *pos++ = csa_settings->count; /* count */ if (csa_settings->chandef.width == NL80211_CHAN_WIDTH_40) { enum nl80211_channel_type ch_type; skb_put(skb, 3); *pos++ = WLAN_EID_SECONDARY_CHANNEL_OFFSET; /* EID */ *pos++ = 1; /* IE length */ ch_type = cfg80211_get_chandef_type(&csa_settings->chandef); if (ch_type == NL80211_CHAN_HT40PLUS) *pos++ = IEEE80211_HT_PARAM_CHA_SEC_ABOVE; else *pos++ = IEEE80211_HT_PARAM_CHA_SEC_BELOW; } if (ieee80211_vif_is_mesh(&sdata->vif)) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; skb_put(skb, 8); *pos++ = WLAN_EID_CHAN_SWITCH_PARAM; /* EID */ *pos++ = 6; /* IE length */ *pos++ = sdata->u.mesh.mshcfg.dot11MeshTTL; /* Mesh TTL */ *pos = 0x00; /* Mesh Flag: Tx Restrict, Initiator, Reason */ *pos |= WLAN_EID_CHAN_SWITCH_PARAM_INITIATOR; *pos++ |= csa_settings->block_tx ? WLAN_EID_CHAN_SWITCH_PARAM_TX_RESTRICT : 0x00; put_unaligned_le16(WLAN_REASON_MESH_CHAN, pos); /* Reason Cd */ pos += 2; put_unaligned_le16(ifmsh->pre_value, pos);/* Precedence Value */ pos += 2; } if (csa_settings->chandef.width == NL80211_CHAN_WIDTH_80 || csa_settings->chandef.width == NL80211_CHAN_WIDTH_80P80 || csa_settings->chandef.width == NL80211_CHAN_WIDTH_160) { skb_put(skb, 5); ieee80211_ie_build_wide_bw_cs(pos, &csa_settings->chandef); } ieee80211_tx_skb(sdata, skb); return 0; } static bool ieee80211_extend_noa_desc(struct ieee80211_noa_data *data, u32 tsf, int i) { s32 end = data->desc[i].start + data->desc[i].duration - (tsf + 1); int skip; if (end > 0) return false; /* One shot NOA */ if (data->count[i] == 1) return false; if (data->desc[i].interval == 0) return false; /* End time is in the past, check for repetitions */ skip = DIV_ROUND_UP(-end, data->desc[i].interval); if (data->count[i] < 255) { if (data->count[i] <= skip) { data->count[i] = 0; return false; } data->count[i] -= skip; } data->desc[i].start += skip * data->desc[i].interval; return true; } static bool ieee80211_extend_absent_time(struct ieee80211_noa_data *data, u32 tsf, s32 *offset) { bool ret = false; int i; for (i = 0; i < IEEE80211_P2P_NOA_DESC_MAX; i++) { s32 cur; if (!data->count[i]) continue; if (ieee80211_extend_noa_desc(data, tsf + *offset, i)) ret = true; cur = data->desc[i].start - tsf; if (cur > *offset) continue; cur = data->desc[i].start + data->desc[i].duration - tsf; if (cur > *offset) *offset = cur; } return ret; } static u32 ieee80211_get_noa_absent_time(struct ieee80211_noa_data *data, u32 tsf) { s32 offset = 0; int tries = 0; /* * arbitrary limit, used to avoid infinite loops when combined NoA * descriptors cover the full time period. */ int max_tries = 5; ieee80211_extend_absent_time(data, tsf, &offset); do { if (!ieee80211_extend_absent_time(data, tsf, &offset)) break; tries++; } while (tries < max_tries); return offset; } void ieee80211_update_p2p_noa(struct ieee80211_noa_data *data, u32 tsf) { u32 next_offset = BIT(31) - 1; int i; data->absent = 0; data->has_next_tsf = false; for (i = 0; i < IEEE80211_P2P_NOA_DESC_MAX; i++) { s32 start; if (!data->count[i]) continue; ieee80211_extend_noa_desc(data, tsf, i); start = data->desc[i].start - tsf; if (start <= 0) data->absent |= BIT(i); if (next_offset > start) next_offset = start; data->has_next_tsf = true; } if (data->absent) next_offset = ieee80211_get_noa_absent_time(data, tsf); data->next_tsf = tsf + next_offset; } EXPORT_SYMBOL(ieee80211_update_p2p_noa); int ieee80211_parse_p2p_noa(const struct ieee80211_p2p_noa_attr *attr, struct ieee80211_noa_data *data, u32 tsf) { int ret = 0; int i; memset(data, 0, sizeof(*data)); for (i = 0; i < IEEE80211_P2P_NOA_DESC_MAX; i++) { const struct ieee80211_p2p_noa_desc *desc = &attr->desc[i]; if (!desc->count || !desc->duration) continue; data->count[i] = desc->count; data->desc[i].start = le32_to_cpu(desc->start_time); data->desc[i].duration = le32_to_cpu(desc->duration); data->desc[i].interval = le32_to_cpu(desc->interval); if (data->count[i] > 1 && data->desc[i].interval < data->desc[i].duration) continue; ieee80211_extend_noa_desc(data, tsf, i); ret++; } if (ret) ieee80211_update_p2p_noa(data, tsf); return ret; } EXPORT_SYMBOL(ieee80211_parse_p2p_noa); void ieee80211_recalc_dtim(struct ieee80211_sub_if_data *sdata, u64 tsf) { u64 dtim_count = 0; u32 beacon_int = sdata->vif.bss_conf.beacon_int * 1024; u8 dtim_period = sdata->vif.bss_conf.dtim_period; struct ps_data *ps; u8 bcns_from_dtim; if (tsf == -1ULL || !beacon_int || !dtim_period) return; if (sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { if (!sdata->bss) return; ps = &sdata->bss->ps; } else if (ieee80211_vif_is_mesh(&sdata->vif)) { ps = &sdata->u.mesh.ps; } else { return; } /* * actually finds last dtim_count, mac80211 will update in * __beacon_add_tim(). * dtim_count = dtim_period - (tsf / bcn_int) % dtim_period */ do_div(tsf, beacon_int); bcns_from_dtim = do_div(tsf, dtim_period); /* just had a DTIM */ if (!bcns_from_dtim) dtim_count = 0; else dtim_count = dtim_period - bcns_from_dtim; ps->dtim_count = dtim_count; } /* * Given a long beacon period, calculate the current index into * that period to determine the number of TSBTTs until the next TBTT. * It is completely valid to have a short beacon period that differs * from the dtim period (i.e a TBTT thats not a DTIM). */ void ieee80211_recalc_sb_count(struct ieee80211_sub_if_data *sdata, u64 tsf) { u32 sb_idx; struct ps_data *ps = &sdata->bss->ps; u8 lb_period = sdata->vif.bss_conf.s1g_long_beacon_period; u32 beacon_int = sdata->vif.bss_conf.beacon_int * 1024; /* No mesh / IBSS support for short beaconing */ if (tsf == -1ULL || !lb_period || (sdata->vif.type != NL80211_IFTYPE_AP && sdata->vif.type != NL80211_IFTYPE_AP_VLAN)) return; /* find the current TSBTT index in our lb_period */ do_div(tsf, beacon_int); sb_idx = do_div(tsf, lb_period); /* num TSBTTs until the next TBTT */ ps->sb_count = sb_idx ? lb_period - sb_idx : 0; } static u8 ieee80211_chanctx_radar_detect(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { struct ieee80211_link_data *link; u8 radar_detect = 0; lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED)) return 0; for_each_sdata_link(local, link) { if (rcu_access_pointer(link->conf->chanctx_conf) == &ctx->conf) { /* * An in-place reservation context should not have any * assigned links until it replaces the other context. */ WARN_ON(ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER); if (link->radar_required) radar_detect |= BIT(link->conf->chanreq.oper.width); } if (link->reserved_chanctx == ctx && link->reserved_radar_required) radar_detect |= BIT(link->reserved.oper.width); } return radar_detect; } bool ieee80211_is_radio_idx_in_scan_req(struct wiphy *wiphy, struct cfg80211_scan_request *scan_req, int radio_idx) { struct ieee80211_channel *chan; int i, chan_radio_idx; for (i = 0; i < scan_req->n_channels; i++) { chan = scan_req->channels[i]; chan_radio_idx = cfg80211_get_radio_idx_by_chan(wiphy, chan); /* The radio index either matched successfully, or an error * occurred. For example, if radio-level information is * missing, the same error value is returned. This * typically implies a single-radio setup, in which case * the operation should not be allowed. */ if (chan_radio_idx == radio_idx) return true; } return false; } static u32 __ieee80211_get_radio_mask(struct ieee80211_sub_if_data *sdata) { struct ieee80211_bss_conf *link_conf; struct ieee80211_chanctx_conf *conf; unsigned int link_id; u32 mask = 0; for_each_vif_active_link(&sdata->vif, link_conf, link_id) { conf = sdata_dereference(link_conf->chanctx_conf, sdata); if (!conf || conf->radio_idx < 0) continue; mask |= BIT(conf->radio_idx); } return mask; } u32 ieee80211_get_radio_mask(struct wiphy *wiphy, struct net_device *dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); return __ieee80211_get_radio_mask(sdata); } static bool ieee80211_sdata_uses_radio(struct ieee80211_sub_if_data *sdata, int radio_idx) { if (radio_idx < 0) return true; return __ieee80211_get_radio_mask(sdata) & BIT(radio_idx); } static int ieee80211_fill_ifcomb_params(struct ieee80211_local *local, struct iface_combination_params *params, const struct cfg80211_chan_def *chandef, struct ieee80211_sub_if_data *sdata) { struct ieee80211_sub_if_data *sdata_iter; struct ieee80211_chanctx *ctx; int total = !!sdata; list_for_each_entry(ctx, &local->chanctx_list, list) { if (ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED) continue; if (params->radio_idx >= 0 && ctx->conf.radio_idx != params->radio_idx) continue; params->radar_detect |= ieee80211_chanctx_radar_detect(local, ctx); if (chandef && ctx->mode != IEEE80211_CHANCTX_EXCLUSIVE && cfg80211_chandef_compatible(chandef, &ctx->conf.def)) continue; params->num_different_channels++; } list_for_each_entry(sdata_iter, &local->interfaces, list) { struct wireless_dev *wdev_iter; wdev_iter = &sdata_iter->wdev; if (sdata_iter == sdata || !ieee80211_sdata_running(sdata_iter) || cfg80211_iftype_allowed(local->hw.wiphy, wdev_iter->iftype, 0, 1)) continue; if (!ieee80211_sdata_uses_radio(sdata_iter, params->radio_idx)) continue; params->iftype_num[wdev_iter->iftype]++; total++; } return total; } int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata, const struct cfg80211_chan_def *chandef, enum ieee80211_chanctx_mode chanmode, u8 radar_detect, int radio_idx) { bool shared = chanmode == IEEE80211_CHANCTX_SHARED; struct ieee80211_local *local = sdata->local; enum nl80211_iftype iftype = sdata->wdev.iftype; struct iface_combination_params params = { .radar_detect = radar_detect, .radio_idx = radio_idx, }; int total; lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(hweight32(radar_detect) > 1)) return -EINVAL; if (WARN_ON(chandef && chanmode == IEEE80211_CHANCTX_SHARED && !chandef->chan)) return -EINVAL; if (WARN_ON(iftype >= NUM_NL80211_IFTYPES)) return -EINVAL; if (sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_MESH_POINT) { /* * always passing this is harmless, since it'll be the * same value that cfg80211 finds if it finds the same * interface ... and that's always allowed */ params.new_beacon_int = sdata->vif.bss_conf.beacon_int; } /* Always allow software iftypes */ if (cfg80211_iftype_allowed(local->hw.wiphy, iftype, 0, 1)) { if (radar_detect) return -EINVAL; return 0; } if (chandef) params.num_different_channels = 1; if (iftype != NL80211_IFTYPE_UNSPECIFIED) params.iftype_num[iftype] = 1; total = ieee80211_fill_ifcomb_params(local, ¶ms, shared ? chandef : NULL, sdata); if (total == 1 && !params.radar_detect) return 0; return cfg80211_check_combinations(local->hw.wiphy, ¶ms); } static void ieee80211_iter_max_chans(const struct ieee80211_iface_combination *c, void *data) { u32 *max_num_different_channels = data; *max_num_different_channels = max(*max_num_different_channels, c->num_different_channels); } int ieee80211_max_num_channels(struct ieee80211_local *local, int radio_idx) { u32 max_num_different_channels = 1; int err; struct iface_combination_params params = { .radio_idx = radio_idx, }; lockdep_assert_wiphy(local->hw.wiphy); ieee80211_fill_ifcomb_params(local, ¶ms, NULL, NULL); err = cfg80211_iter_combinations(local->hw.wiphy, ¶ms, ieee80211_iter_max_chans, &max_num_different_channels); if (err < 0) return err; return max_num_different_channels; } void ieee80211_add_s1g_capab_ie(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_s1g_cap *caps, struct sk_buff *skb) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_s1g_cap s1g_capab; u8 *pos; int i; if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION)) return; if (!caps->s1g) return; memcpy(s1g_capab.capab_info, caps->cap, sizeof(caps->cap)); memcpy(s1g_capab.supp_mcs_nss, caps->nss_mcs, sizeof(caps->nss_mcs)); /* override the capability info */ for (i = 0; i < sizeof(ifmgd->s1g_capa.capab_info); i++) { u8 mask = ifmgd->s1g_capa_mask.capab_info[i]; s1g_capab.capab_info[i] &= ~mask; s1g_capab.capab_info[i] |= ifmgd->s1g_capa.capab_info[i] & mask; } /* then MCS and NSS set */ for (i = 0; i < sizeof(ifmgd->s1g_capa.supp_mcs_nss); i++) { u8 mask = ifmgd->s1g_capa_mask.supp_mcs_nss[i]; s1g_capab.supp_mcs_nss[i] &= ~mask; s1g_capab.supp_mcs_nss[i] |= ifmgd->s1g_capa.supp_mcs_nss[i] & mask; } pos = skb_put(skb, 2 + sizeof(s1g_capab)); *pos++ = WLAN_EID_S1G_CAPABILITIES; *pos++ = sizeof(s1g_capab); memcpy(pos, &s1g_capab, sizeof(s1g_capab)); } void ieee80211_add_aid_request_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { u8 *pos = skb_put(skb, 3); *pos++ = WLAN_EID_AID_REQUEST; *pos++ = 1; *pos++ = 0; } u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo) { *buf++ = WLAN_EID_VENDOR_SPECIFIC; *buf++ = 7; /* len */ *buf++ = 0x00; /* Microsoft OUI 00:50:F2 */ *buf++ = 0x50; *buf++ = 0xf2; *buf++ = 2; /* WME */ *buf++ = 0; /* WME info */ *buf++ = 1; /* WME ver */ *buf++ = qosinfo; /* U-APSD no in use */ return buf; } void ieee80211_txq_get_depth(struct ieee80211_txq *txq, unsigned long *frame_cnt, unsigned long *byte_cnt) { struct txq_info *txqi = to_txq_info(txq); u32 frag_cnt = 0, frag_bytes = 0; struct sk_buff *skb; skb_queue_walk(&txqi->frags, skb) { frag_cnt++; frag_bytes += skb->len; } if (frame_cnt) *frame_cnt = txqi->tin.backlog_packets + frag_cnt; if (byte_cnt) *byte_cnt = txqi->tin.backlog_bytes + frag_bytes; } EXPORT_SYMBOL(ieee80211_txq_get_depth); const u8 ieee80211_ac_to_qos_mask[IEEE80211_NUM_ACS] = { IEEE80211_WMM_IE_STA_QOSINFO_AC_VO, IEEE80211_WMM_IE_STA_QOSINFO_AC_VI, IEEE80211_WMM_IE_STA_QOSINFO_AC_BE, IEEE80211_WMM_IE_STA_QOSINFO_AC_BK }; u16 ieee80211_encode_usf(int listen_interval) { static const int listen_int_usf[] = { 1, 10, 1000, 10000 }; u16 ui, usf = 0; /* find greatest USF */ while (usf < IEEE80211_MAX_USF) { if (listen_interval % listen_int_usf[usf + 1]) break; usf += 1; } ui = listen_interval / listen_int_usf[usf]; /* error if there is a remainder. Should've been checked by user */ WARN_ON_ONCE(ui > IEEE80211_MAX_UI); listen_interval = FIELD_PREP(LISTEN_INT_USF, usf) | FIELD_PREP(LISTEN_INT_UI, ui); return (u16) listen_interval; } /* this may return more than ieee80211_put_eht_cap() will need */ u8 ieee80211_ie_len_eht_cap(struct ieee80211_sub_if_data *sdata) { const struct ieee80211_sta_he_cap *he_cap; const struct ieee80211_sta_eht_cap *eht_cap; struct ieee80211_supported_band *sband; bool is_ap; u8 n; sband = ieee80211_get_sband(sdata); if (!sband) return 0; he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); eht_cap = ieee80211_get_eht_iftype_cap_vif(sband, &sdata->vif); if (!he_cap || !eht_cap) return 0; is_ap = sdata->vif.type == NL80211_IFTYPE_AP; n = ieee80211_eht_mcs_nss_size(&he_cap->he_cap_elem, &eht_cap->eht_cap_elem, is_ap); return 2 + 1 + sizeof(eht_cap->eht_cap_elem) + n + ieee80211_eht_ppe_size(eht_cap->eht_ppe_thres[0], eht_cap->eht_cap_elem.phy_cap_info); return 0; } int ieee80211_put_eht_cap(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata, const struct ieee80211_supported_band *sband, const struct ieee80211_conn_settings *conn) { const struct ieee80211_sta_he_cap *he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); const struct ieee80211_sta_eht_cap *eht_cap = ieee80211_get_eht_iftype_cap_vif(sband, &sdata->vif); bool for_ap = sdata->vif.type == NL80211_IFTYPE_AP; struct ieee80211_eht_cap_elem_fixed fixed; struct ieee80211_he_cap_elem he; u8 mcs_nss_len, ppet_len; u8 orig_mcs_nss_len; u8 ie_len; if (!conn) conn = &ieee80211_conn_settings_unlimited; /* Make sure we have place for the IE */ if (!he_cap || !eht_cap) return 0; orig_mcs_nss_len = ieee80211_eht_mcs_nss_size(&he_cap->he_cap_elem, &eht_cap->eht_cap_elem, for_ap); ieee80211_get_adjusted_he_cap(conn, he_cap, &he); fixed = eht_cap->eht_cap_elem; if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_80) fixed.phy_cap_info[6] &= ~IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_80MHZ; if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_160) { fixed.phy_cap_info[1] &= ~IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_160MHZ_MASK; fixed.phy_cap_info[2] &= ~IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_160MHZ_MASK; fixed.phy_cap_info[6] &= ~IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_160MHZ; } if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_320) { fixed.phy_cap_info[0] &= ~IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ; fixed.phy_cap_info[1] &= ~IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_320MHZ_MASK; fixed.phy_cap_info[2] &= ~IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_320MHZ_MASK; fixed.phy_cap_info[6] &= ~IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_320MHZ; } if (conn->bw_limit == IEEE80211_CONN_BW_LIMIT_20) fixed.phy_cap_info[0] &= ~IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ; mcs_nss_len = ieee80211_eht_mcs_nss_size(&he, &fixed, for_ap); ppet_len = ieee80211_eht_ppe_size(eht_cap->eht_ppe_thres[0], fixed.phy_cap_info); ie_len = 2 + 1 + sizeof(eht_cap->eht_cap_elem) + mcs_nss_len + ppet_len; if (skb_tailroom(skb) < ie_len) return -ENOBUFS; skb_put_u8(skb, WLAN_EID_EXTENSION); skb_put_u8(skb, ie_len - 2); skb_put_u8(skb, WLAN_EID_EXT_EHT_CAPABILITY); skb_put_data(skb, &fixed, sizeof(fixed)); if (mcs_nss_len == 4 && orig_mcs_nss_len != 4) { /* * If the (non-AP) STA became 20 MHz only, then convert from * <=80 to 20-MHz-only format, where MCSes are indicated in * the groups 0-7, 8-9, 10-11, 12-13 rather than just 0-9, * 10-11, 12-13. Thus, use 0-9 for 0-7 and 8-9. */ skb_put_u8(skb, eht_cap->eht_mcs_nss_supp.bw._80.rx_tx_mcs9_max_nss); skb_put_u8(skb, eht_cap->eht_mcs_nss_supp.bw._80.rx_tx_mcs9_max_nss); skb_put_u8(skb, eht_cap->eht_mcs_nss_supp.bw._80.rx_tx_mcs11_max_nss); skb_put_u8(skb, eht_cap->eht_mcs_nss_supp.bw._80.rx_tx_mcs13_max_nss); } else { skb_put_data(skb, &eht_cap->eht_mcs_nss_supp, mcs_nss_len); } if (ppet_len) skb_put_data(skb, &eht_cap->eht_ppe_thres, ppet_len); return 0; } int ieee80211_put_uhr_cap(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata, const struct ieee80211_supported_band *sband) { const struct ieee80211_sta_uhr_cap *uhr_cap = ieee80211_get_uhr_iftype_cap_vif(sband, &sdata->vif); int len; if (!uhr_cap) return 0; len = 2 + 1 + sizeof(struct ieee80211_uhr_cap) + sizeof(struct ieee80211_uhr_cap_phy); if (skb_tailroom(skb) < len) return -ENOBUFS; skb_put_u8(skb, WLAN_EID_EXTENSION); skb_put_u8(skb, len - 2); skb_put_u8(skb, WLAN_EID_EXT_UHR_CAPA); skb_put_data(skb, &uhr_cap->mac, sizeof(uhr_cap->mac)); skb_put_data(skb, &uhr_cap->phy, sizeof(uhr_cap->phy)); return 0; } const char *ieee80211_conn_mode_str(enum ieee80211_conn_mode mode) { static const char * const modes[] = { [IEEE80211_CONN_MODE_S1G] = "S1G", [IEEE80211_CONN_MODE_LEGACY] = "legacy", [IEEE80211_CONN_MODE_HT] = "HT", [IEEE80211_CONN_MODE_VHT] = "VHT", [IEEE80211_CONN_MODE_HE] = "HE", [IEEE80211_CONN_MODE_EHT] = "EHT", [IEEE80211_CONN_MODE_UHR] = "UHR", }; if (WARN_ON(mode >= ARRAY_SIZE(modes))) return "<out of range>"; return modes[mode] ?: "<missing string>"; } enum ieee80211_conn_bw_limit ieee80211_min_bw_limit_from_chandef(struct cfg80211_chan_def *chandef) { switch (chandef->width) { case NL80211_CHAN_WIDTH_20_NOHT: case NL80211_CHAN_WIDTH_20: return IEEE80211_CONN_BW_LIMIT_20; case NL80211_CHAN_WIDTH_40: return IEEE80211_CONN_BW_LIMIT_40; case NL80211_CHAN_WIDTH_80: return IEEE80211_CONN_BW_LIMIT_80; case NL80211_CHAN_WIDTH_80P80: case NL80211_CHAN_WIDTH_160: return IEEE80211_CONN_BW_LIMIT_160; case NL80211_CHAN_WIDTH_320: return IEEE80211_CONN_BW_LIMIT_320; default: WARN(1, "unhandled chandef width %d\n", chandef->width); return IEEE80211_CONN_BW_LIMIT_20; } } void ieee80211_clear_tpe(struct ieee80211_parsed_tpe *tpe) { for (int i = 0; i < 2; i++) { tpe->max_local[i].valid = false; memset(tpe->max_local[i].power, IEEE80211_TPE_MAX_TX_PWR_NO_CONSTRAINT, sizeof(tpe->max_local[i].power)); tpe->max_reg_client[i].valid = false; memset(tpe->max_reg_client[i].power, IEEE80211_TPE_MAX_TX_PWR_NO_CONSTRAINT, sizeof(tpe->max_reg_client[i].power)); tpe->psd_local[i].valid = false; memset(tpe->psd_local[i].power, IEEE80211_TPE_PSD_NO_LIMIT, sizeof(tpe->psd_local[i].power)); tpe->psd_reg_client[i].valid = false; memset(tpe->psd_reg_client[i].power, IEEE80211_TPE_PSD_NO_LIMIT, sizeof(tpe->psd_reg_client[i].power)); } } bool ieee80211_vif_nan_started(struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); return vif->type == NL80211_IFTYPE_NAN && sdata->u.nan.started; } EXPORT_SYMBOL_GPL(ieee80211_vif_nan_started); |
| 14512 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_CPUFEATURE_H #define _ASM_X86_CPUFEATURE_H #include <asm/processor.h> #if defined(__KERNEL__) && !defined(__ASSEMBLER__) #include <asm/asm.h> #include <linux/bitops.h> #include <asm/alternative.h> #include <asm/cpufeaturemasks.h> enum cpuid_leafs { CPUID_1_EDX = 0, CPUID_8000_0001_EDX, CPUID_8086_0001_EDX, CPUID_LNX_1, CPUID_1_ECX, CPUID_C000_0001_EDX, CPUID_8000_0001_ECX, CPUID_LNX_2, CPUID_LNX_3, CPUID_7_0_EBX, CPUID_D_1_EAX, CPUID_LNX_4, CPUID_7_1_EAX, CPUID_8000_0008_EBX, CPUID_6_EAX, CPUID_8000_000A_EDX, CPUID_7_ECX, CPUID_LNX_6, CPUID_7_EDX, CPUID_8000_001F_EAX, CPUID_8000_0021_EAX, CPUID_LNX_5, NR_CPUID_WORDS, }; extern const char * const x86_cap_flags[NCAPINTS*32]; extern const char * const x86_power_flags[32]; /* * In order to save room, we index into this array by doing * X86_BUG_<name> - NCAPINTS*32. */ extern const char * const x86_bug_flags[NBUGINTS*32]; #define x86_bug_flag(flag) x86_bug_flags[flag] #define test_cpu_cap(c, bit) \ arch_test_bit(bit, (unsigned long *)((c)->x86_capability)) #define cpu_has(c, bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ test_cpu_cap(c, bit)) #define this_cpu_has(bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ x86_this_cpu_test_bit(bit, cpu_info.x86_capability)) /* * This is the default CPU features testing macro to use in code. * * It is for detection of features which need kernel infrastructure to be * used. It may *not* directly test the CPU itself. Use the cpu_has() family * if you want true runtime testing of CPU features, like in hypervisor code * where you are supporting a possible guest feature where host support for it * is not relevant. */ #define cpu_feature_enabled(bit) \ (__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : static_cpu_has(bit)) #define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) #define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability)) extern void setup_clear_cpu_cap(unsigned int bit); extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); void check_cpufeature_deps(struct cpuinfo_x86 *c); #define setup_force_cpu_cap(bit) do { \ \ if (!boot_cpu_has(bit)) \ WARN_ON(alternatives_patched); \ \ set_cpu_cap(&boot_cpu_data, bit); \ set_bit(bit, (unsigned long *)cpu_caps_set); \ } while (0) #define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit) /* * Do not use an "m" constraint for [cap_byte] here: gcc doesn't know * that this is only used on a fallback path and will sometimes cause * it to manifest the address of boot_cpu_data in a register, fouling * the mainline (post-initialization) code. */ static __always_inline bool _static_cpu_has(u16 bit) { asm goto(ALTERNATIVE_TERNARY("jmp 6f", %c[feature], "", "jmp %l[t_no]") ".pushsection .altinstr_aux,\"ax\"\n" "6:\n" ANNOTATE_DATA_SPECIAL "\n" " testb %[bitnum], %a[cap_byte]\n" " jnz %l[t_yes]\n" " jmp %l[t_no]\n" ".popsection\n" : : [feature] "i" (bit), [bitnum] "i" (1 << (bit & 7)), [cap_byte] "i" (&((const char *)boot_cpu_data.x86_capability)[bit >> 3]) : : t_yes, t_no); t_yes: return true; t_no: return false; } #define static_cpu_has(bit) \ ( \ __builtin_constant_p(boot_cpu_has(bit)) ? \ boot_cpu_has(bit) : \ _static_cpu_has(bit) \ ) #define cpu_has_bug(c, bit) cpu_has(c, (bit)) #define set_cpu_bug(c, bit) set_cpu_cap(c, (bit)) #define clear_cpu_bug(c, bit) clear_cpu_cap(c, (bit)) #define static_cpu_has_bug(bit) static_cpu_has((bit)) #define boot_cpu_has_bug(bit) cpu_has_bug(&boot_cpu_data, (bit)) #define boot_cpu_set_bug(bit) set_cpu_cap(&boot_cpu_data, (bit)) #define MAX_CPU_FEATURES (NCAPINTS * 32) #define cpu_have_feature boot_cpu_has #define CPU_FEATURE_TYPEFMT "x86,ven%04Xfam%04Xmod%04X" #define CPU_FEATURE_TYPEVAL boot_cpu_data.x86_vendor, boot_cpu_data.x86, \ boot_cpu_data.x86_model #endif /* defined(__KERNEL__) && !defined(__ASSEMBLER__) */ #endif /* _ASM_X86_CPUFEATURE_H */ |
| 649 5852 822 352 468 4558 3763 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_ATOMIC_H #define _ASM_X86_ATOMIC_H #include <linux/compiler.h> #include <linux/types.h> #include <asm/alternative.h> #include <asm/cmpxchg.h> #include <asm/rmwcc.h> #include <asm/barrier.h> /* * Atomic operations that C can't guarantee us. Useful for * resource counting etc.. */ static __always_inline int arch_atomic_read(const atomic_t *v) { /* * Note for KASAN: we deliberately don't use READ_ONCE_NOCHECK() here, * it's non-inlined function that increases binary size and stack usage. */ return __READ_ONCE((v)->counter); } static __always_inline void arch_atomic_set(atomic_t *v, int i) { __WRITE_ONCE(v->counter, i); } static __always_inline void arch_atomic_add(int i, atomic_t *v) { asm_inline volatile(LOCK_PREFIX "addl %1, %0" : "+m" (v->counter) : "ir" (i) : "memory"); } static __always_inline void arch_atomic_sub(int i, atomic_t *v) { asm_inline volatile(LOCK_PREFIX "subl %1, %0" : "+m" (v->counter) : "ir" (i) : "memory"); } static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v) { return GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, e, "er", i); } #define arch_atomic_sub_and_test arch_atomic_sub_and_test static __always_inline void arch_atomic_inc(atomic_t *v) { asm_inline volatile(LOCK_PREFIX "incl %0" : "+m" (v->counter) :: "memory"); } #define arch_atomic_inc arch_atomic_inc static __always_inline void arch_atomic_dec(atomic_t *v) { asm_inline volatile(LOCK_PREFIX "decl %0" : "+m" (v->counter) :: "memory"); } #define arch_atomic_dec arch_atomic_dec static __always_inline bool arch_atomic_dec_and_test(atomic_t *v) { return GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, e); } #define arch_atomic_dec_and_test arch_atomic_dec_and_test static __always_inline bool arch_atomic_inc_and_test(atomic_t *v) { return GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, e); } #define arch_atomic_inc_and_test arch_atomic_inc_and_test static __always_inline bool arch_atomic_add_negative(int i, atomic_t *v) { return GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, s, "er", i); } #define arch_atomic_add_negative arch_atomic_add_negative static __always_inline int arch_atomic_add_return(int i, atomic_t *v) { return i + xadd(&v->counter, i); } #define arch_atomic_add_return arch_atomic_add_return #define arch_atomic_sub_return(i, v) arch_atomic_add_return(-(i), v) static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v) { return xadd(&v->counter, i); } #define arch_atomic_fetch_add arch_atomic_fetch_add #define arch_atomic_fetch_sub(i, v) arch_atomic_fetch_add(-(i), v) static __always_inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new) { return arch_cmpxchg(&v->counter, old, new); } #define arch_atomic_cmpxchg arch_atomic_cmpxchg static __always_inline bool arch_atomic_try_cmpxchg(atomic_t *v, int *old, int new) { return arch_try_cmpxchg(&v->counter, old, new); } #define arch_atomic_try_cmpxchg arch_atomic_try_cmpxchg static __always_inline int arch_atomic_xchg(atomic_t *v, int new) { return arch_xchg(&v->counter, new); } #define arch_atomic_xchg arch_atomic_xchg static __always_inline void arch_atomic_and(int i, atomic_t *v) { asm_inline volatile(LOCK_PREFIX "andl %1, %0" : "+m" (v->counter) : "ir" (i) : "memory"); } static __always_inline int arch_atomic_fetch_and(int i, atomic_t *v) { int val = arch_atomic_read(v); do { } while (!arch_atomic_try_cmpxchg(v, &val, val & i)); return val; } #define arch_atomic_fetch_and arch_atomic_fetch_and static __always_inline void arch_atomic_or(int i, atomic_t *v) { asm_inline volatile(LOCK_PREFIX "orl %1, %0" : "+m" (v->counter) : "ir" (i) : "memory"); } static __always_inline int arch_atomic_fetch_or(int i, atomic_t *v) { int val = arch_atomic_read(v); do { } while (!arch_atomic_try_cmpxchg(v, &val, val | i)); return val; } #define arch_atomic_fetch_or arch_atomic_fetch_or static __always_inline void arch_atomic_xor(int i, atomic_t *v) { asm_inline volatile(LOCK_PREFIX "xorl %1, %0" : "+m" (v->counter) : "ir" (i) : "memory"); } static __always_inline int arch_atomic_fetch_xor(int i, atomic_t *v) { int val = arch_atomic_read(v); do { } while (!arch_atomic_try_cmpxchg(v, &val, val ^ i)); return val; } #define arch_atomic_fetch_xor arch_atomic_fetch_xor #ifdef CONFIG_X86_32 # include <asm/atomic64_32.h> #else # include <asm/atomic64_64.h> #endif #endif /* _ASM_X86_ATOMIC_H */ |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Queue of folios definitions * * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * See: * * Documentation/core-api/folio_queue.rst * * for a description of the API. */ #ifndef _LINUX_FOLIO_QUEUE_H #define _LINUX_FOLIO_QUEUE_H #include <linux/folio_batch.h> #include <linux/mm.h> /* * Segment in a queue of running buffers. Each segment can hold a number of * folios and a portion of the queue can be referenced with the ITER_FOLIOQ * iterator. The possibility exists of inserting non-folio elements into the * queue (such as gaps). * * Explicit prev and next pointers are used instead of a list_head to make it * easier to add segments to tail and remove them from the head without the * need for a lock. */ struct folio_queue { struct folio_batch vec; /* Folios in the queue segment */ u8 orders[FOLIO_BATCH_SIZE]; /* Order of each folio */ struct folio_queue *next; /* Next queue segment or NULL */ struct folio_queue *prev; /* Previous queue segment of NULL */ unsigned long marks; /* 1-bit mark per folio */ unsigned long marks2; /* Second 1-bit mark per folio */ #if FOLIO_BATCH_SIZE > BITS_PER_LONG #error marks is not big enough #endif unsigned int rreq_id; unsigned int debug_id; }; /** * folioq_init - Initialise a folio queue segment * @folioq: The segment to initialise * @rreq_id: The request identifier to use in tracelines. * * Initialise a folio queue segment and set an identifier to be used in traces. * * Note that the folio pointers are left uninitialised. */ static inline void folioq_init(struct folio_queue *folioq, unsigned int rreq_id) { folio_batch_init(&folioq->vec); folioq->next = NULL; folioq->prev = NULL; folioq->marks = 0; folioq->marks2 = 0; folioq->rreq_id = rreq_id; folioq->debug_id = 0; } /** * folioq_nr_slots: Query the capacity of a folio queue segment * @folioq: The segment to query * * Query the number of folios that a particular folio queue segment might hold. * [!] NOTE: This must not be assumed to be the same for every segment! */ static inline unsigned int folioq_nr_slots(const struct folio_queue *folioq) { return FOLIO_BATCH_SIZE; } /** * folioq_count: Query the occupancy of a folio queue segment * @folioq: The segment to query * * Query the number of folios that have been added to a folio queue segment. * Note that this is not decreased as folios are removed from a segment. */ static inline unsigned int folioq_count(struct folio_queue *folioq) { return folio_batch_count(&folioq->vec); } /** * folioq_full: Query if a folio queue segment is full * @folioq: The segment to query * * Query if a folio queue segment is fully occupied. Note that this does not * change if folios are removed from a segment. */ static inline bool folioq_full(struct folio_queue *folioq) { //return !folio_batch_space(&folioq->vec); return folioq_count(folioq) >= folioq_nr_slots(folioq); } /** * folioq_is_marked: Check first folio mark in a folio queue segment * @folioq: The segment to query * @slot: The slot number of the folio to query * * Determine if the first mark is set for the folio in the specified slot in a * folio queue segment. */ static inline bool folioq_is_marked(const struct folio_queue *folioq, unsigned int slot) { return test_bit(slot, &folioq->marks); } /** * folioq_mark: Set the first mark on a folio in a folio queue segment * @folioq: The segment to modify * @slot: The slot number of the folio to modify * * Set the first mark for the folio in the specified slot in a folio queue * segment. */ static inline void folioq_mark(struct folio_queue *folioq, unsigned int slot) { set_bit(slot, &folioq->marks); } /** * folioq_unmark: Clear the first mark on a folio in a folio queue segment * @folioq: The segment to modify * @slot: The slot number of the folio to modify * * Clear the first mark for the folio in the specified slot in a folio queue * segment. */ static inline void folioq_unmark(struct folio_queue *folioq, unsigned int slot) { clear_bit(slot, &folioq->marks); } /** * folioq_is_marked2: Check second folio mark in a folio queue segment * @folioq: The segment to query * @slot: The slot number of the folio to query * * Determine if the second mark is set for the folio in the specified slot in a * folio queue segment. */ static inline bool folioq_is_marked2(const struct folio_queue *folioq, unsigned int slot) { return test_bit(slot, &folioq->marks2); } /** * folioq_mark2: Set the second mark on a folio in a folio queue segment * @folioq: The segment to modify * @slot: The slot number of the folio to modify * * Set the second mark for the folio in the specified slot in a folio queue * segment. */ static inline void folioq_mark2(struct folio_queue *folioq, unsigned int slot) { set_bit(slot, &folioq->marks2); } /** * folioq_unmark2: Clear the second mark on a folio in a folio queue segment * @folioq: The segment to modify * @slot: The slot number of the folio to modify * * Clear the second mark for the folio in the specified slot in a folio queue * segment. */ static inline void folioq_unmark2(struct folio_queue *folioq, unsigned int slot) { clear_bit(slot, &folioq->marks2); } /** * folioq_append: Add a folio to a folio queue segment * @folioq: The segment to add to * @folio: The folio to add * * Add a folio to the tail of the sequence in a folio queue segment, increasing * the occupancy count and returning the slot number for the folio just added. * The folio size is extracted and stored in the queue and the marks are left * unmodified. * * Note that it's left up to the caller to check that the segment capacity will * not be exceeded and to extend the queue. */ static inline unsigned int folioq_append(struct folio_queue *folioq, struct folio *folio) { unsigned int slot = folioq->vec.nr++; folioq->vec.folios[slot] = folio; folioq->orders[slot] = folio_order(folio); return slot; } /** * folioq_append_mark: Add a folio to a folio queue segment * @folioq: The segment to add to * @folio: The folio to add * * Add a folio to the tail of the sequence in a folio queue segment, increasing * the occupancy count and returning the slot number for the folio just added. * The folio size is extracted and stored in the queue, the first mark is set * and and the second and third marks are left unmodified. * * Note that it's left up to the caller to check that the segment capacity will * not be exceeded and to extend the queue. */ static inline unsigned int folioq_append_mark(struct folio_queue *folioq, struct folio *folio) { unsigned int slot = folioq->vec.nr++; folioq->vec.folios[slot] = folio; folioq->orders[slot] = folio_order(folio); folioq_mark(folioq, slot); return slot; } /** * folioq_folio: Get a folio from a folio queue segment * @folioq: The segment to access * @slot: The folio slot to access * * Retrieve the folio in the specified slot from a folio queue segment. Note * that no bounds check is made and if the slot hasn't been added into yet, the * pointer will be undefined. If the slot has been cleared, NULL will be * returned. */ static inline struct folio *folioq_folio(const struct folio_queue *folioq, unsigned int slot) { return folioq->vec.folios[slot]; } /** * folioq_folio_order: Get the order of a folio from a folio queue segment * @folioq: The segment to access * @slot: The folio slot to access * * Retrieve the order of the folio in the specified slot from a folio queue * segment. Note that no bounds check is made and if the slot hasn't been * added into yet, the order returned will be 0. */ static inline unsigned int folioq_folio_order(const struct folio_queue *folioq, unsigned int slot) { return folioq->orders[slot]; } /** * folioq_folio_size: Get the size of a folio from a folio queue segment * @folioq: The segment to access * @slot: The folio slot to access * * Retrieve the size of the folio in the specified slot from a folio queue * segment. Note that no bounds check is made and if the slot hasn't been * added into yet, the size returned will be PAGE_SIZE. */ static inline size_t folioq_folio_size(const struct folio_queue *folioq, unsigned int slot) { return PAGE_SIZE << folioq_folio_order(folioq, slot); } /** * folioq_clear: Clear a folio from a folio queue segment * @folioq: The segment to clear * @slot: The folio slot to clear * * Clear a folio from a sequence in a folio queue segment and clear its marks. * The occupancy count is left unchanged. */ static inline void folioq_clear(struct folio_queue *folioq, unsigned int slot) { folioq->vec.folios[slot] = NULL; folioq_unmark(folioq, slot); folioq_unmark2(folioq, slot); } #endif /* _LINUX_FOLIO_QUEUE_H */ |
| 23 3 2 3 26 19 26 2 26 26 26 14 26 26 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 21 22 22 22 22 22 22 22 22 22 22 22 22 22 22 2 2 2 2 2 2 22 22 22 1 21 22 21 2 2 22 22 22 16 22 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 | // SPDX-License-Identifier: GPL-2.0-only /* * Landlock - Cross-thread ruleset enforcement * * Copyright © 2025 Google LLC */ #include <linux/atomic.h> #include <linux/cleanup.h> #include <linux/completion.h> #include <linux/cred.h> #include <linux/errno.h> #include <linux/overflow.h> #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/slab.h> #include <linux/task_work.h> #include "cred.h" #include "tsync.h" /* * Shared state between multiple threads which are enforcing Landlock rulesets * in lockstep with each other. */ struct tsync_shared_context { /* The old and tentative new creds of the calling thread. */ const struct cred *old_cred; const struct cred *new_cred; /* True if sibling tasks need to set the no_new_privs flag. */ bool set_no_new_privs; /* An error encountered in preparation step, or 0. */ atomic_t preparation_error; /* * Barrier after preparation step in restrict_one_thread. * The calling thread waits for completion. * * Re-initialized on every round of looking for newly spawned threads. */ atomic_t num_preparing; struct completion all_prepared; /* Sibling threads wait for completion. */ struct completion ready_to_commit; /* * Barrier after commit step (used by syscall impl to wait for * completion). */ atomic_t num_unfinished; struct completion all_finished; }; struct tsync_work { struct callback_head work; struct task_struct *task; struct tsync_shared_context *shared_ctx; }; /* * restrict_one_thread - update a thread's Landlock domain in lockstep with the * other threads in the same process * * When this is run, the same function gets run in all other threads in the same * process (except for the calling thread which called landlock_restrict_self). * The concurrently running invocations of restrict_one_thread coordinate * through the shared ctx object to do their work in lockstep to implement * all-or-nothing semantics for enforcing the new Landlock domain. * * Afterwards, depending on the presence of an error, all threads either commit * or abort the prepared credentials. The commit operation can not fail any * more. */ static void restrict_one_thread(struct tsync_shared_context *ctx) { int err; struct cred *cred = NULL; if (current_cred() == ctx->old_cred) { /* * Switch out old_cred with new_cred, if possible. * * In the common case, where all threads initially point to the * same struct cred, this optimization avoids creating separate * redundant credentials objects for each, which would all have * the same contents. * * Note: We are intentionally dropping the const qualifier * here, because it is required by commit_creds() and * abort_creds(). */ cred = (struct cred *)get_cred(ctx->new_cred); } else { /* Else, prepare new creds and populate them. */ cred = prepare_creds(); if (!cred) { atomic_set(&ctx->preparation_error, -ENOMEM); /* * Even on error, we need to adhere to the protocol and * coordinate with concurrently running invocations. */ if (atomic_dec_return(&ctx->num_preparing) == 0) complete_all(&ctx->all_prepared); goto out; } landlock_cred_copy(landlock_cred(cred), landlock_cred(ctx->new_cred)); } /* * Barrier: Wait until all threads are done preparing. * After this point, we can have no more failures. */ if (atomic_dec_return(&ctx->num_preparing) == 0) complete_all(&ctx->all_prepared); /* * Wait for signal from calling thread that it's safe to read the * preparation error now and we are ready to commit (or abort). */ wait_for_completion(&ctx->ready_to_commit); /* Abort the commit if any of the other threads had an error. */ err = atomic_read(&ctx->preparation_error); if (err) { abort_creds(cred); goto out; } /* * Make sure that all sibling tasks fulfill the no_new_privs * prerequisite. (This is in line with Seccomp's * SECCOMP_FILTER_FLAG_TSYNC logic in kernel/seccomp.c) */ if (ctx->set_no_new_privs) task_set_no_new_privs(current); commit_creds(cred); out: /* Notify the calling thread once all threads are done */ if (atomic_dec_return(&ctx->num_unfinished) == 0) complete_all(&ctx->all_finished); } /* * restrict_one_thread_callback - task_work callback for restricting a thread * * Calls restrict_one_thread with the struct landlock_shared_tsync_context. */ static void restrict_one_thread_callback(struct callback_head *work) { struct tsync_work *ctx = container_of(work, struct tsync_work, work); restrict_one_thread(ctx->shared_ctx); } /* * struct tsync_works - a growable array of per-task contexts * * The zero-initialized struct represents the empty array. */ struct tsync_works { struct tsync_work **works; size_t size; size_t capacity; }; /* * tsync_works_provide - provides a preallocated tsync_work for the given task * * This also stores a task pointer in the context and increments the reference * count of the task. * * This function may fail in the case where we did not preallocate sufficient * capacity. This can legitimately happen if new threads get started after we * grew the capacity. * * Return: A pointer to the preallocated context struct with task filled in, or * NULL if preallocated context structs ran out. */ static struct tsync_work *tsync_works_provide(struct tsync_works *s, struct task_struct *task) { struct tsync_work *ctx; if (s->size >= s->capacity) return NULL; ctx = s->works[s->size]; s->size++; ctx->task = get_task_struct(task); return ctx; } /** * tsync_works_trim - Put the last tsync_work element * * @s: TSYNC works to trim. * * Put the last task and decrement the size of @s. * * This helper does not cancel a running task, but just reset the last element * to zero. */ static void tsync_works_trim(struct tsync_works *s) { struct tsync_work *ctx; if (WARN_ON_ONCE(s->size <= 0)) return; ctx = s->works[s->size - 1]; /* * For consistency, remove the task from ctx so that it does not look * like we handed it a task_work. */ put_task_struct(ctx->task); *ctx = (typeof(*ctx)){}; /* * Cancel the tsync_works_provide() change to recycle the reserved * memory for the next thread, if any. This also ensures that * cancel_tsync_works() and tsync_works_release() do not see any NULL * task pointers. */ s->size--; } /* * tsync_works_grow_by - preallocates space for n more contexts in s * * On a successful return, the subsequent n calls to tsync_works_provide() are * guaranteed to succeed. (size + n <= capacity) * * Return: 0 if sufficient space for n more elements could be provided, -ENOMEM * on allocation errors, -EOVERFLOW in case of integer overflow. */ static int tsync_works_grow_by(struct tsync_works *s, size_t n, gfp_t flags) { size_t i; size_t new_capacity; struct tsync_work **works; struct tsync_work *work; if (check_add_overflow(s->size, n, &new_capacity)) return -EOVERFLOW; /* No need to reallocate if s already has sufficient capacity. */ if (new_capacity <= s->capacity) return 0; works = krealloc_array(s->works, new_capacity, sizeof(s->works[0]), flags); if (!works) return -ENOMEM; s->works = works; for (i = s->capacity; i < new_capacity; i++) { work = kzalloc_obj(*work, flags); if (!work) { /* * Leave the object in a consistent state, * but return an error. */ s->capacity = i; return -ENOMEM; } s->works[i] = work; } s->capacity = new_capacity; return 0; } /* * tsync_works_contains - checks for presence of task in s */ static bool tsync_works_contains_task(const struct tsync_works *s, const struct task_struct *task) { size_t i; for (i = 0; i < s->size; i++) if (s->works[i]->task == task) return true; return false; } /* * tsync_works_release - frees memory held by s and drops all task references * * This does not free s itself, only the data structures held by it. */ static void tsync_works_release(struct tsync_works *s) { size_t i; for (i = 0; i < s->size; i++) { if (WARN_ON_ONCE(!s->works[i]->task)) continue; put_task_struct(s->works[i]->task); } for (i = 0; i < s->capacity; i++) kfree(s->works[i]); kfree(s->works); s->works = NULL; s->size = 0; s->capacity = 0; } /* * count_additional_threads - counts the sibling threads that are not in works */ static size_t count_additional_threads(const struct tsync_works *works) { const struct task_struct *caller, *thread; size_t n = 0; caller = current; guard(rcu)(); for_each_thread(caller, thread) { /* Skip current, since it is initiating the sync. */ if (thread == caller) continue; /* Skip exited threads. */ if (thread->flags & PF_EXITING) continue; /* Skip threads that we have already seen. */ if (tsync_works_contains_task(works, thread)) continue; n++; } return n; } /* * schedule_task_work - adds task_work for all eligible sibling threads * which have not been scheduled yet * * For each added task_work, atomically increments shared_ctx->num_preparing and * shared_ctx->num_unfinished. * * Return: True if at least one eligible sibling thread was found, false * otherwise. */ static bool schedule_task_work(struct tsync_works *works, struct tsync_shared_context *shared_ctx) { int err; const struct task_struct *caller; struct task_struct *thread; struct tsync_work *ctx; bool found_more_threads = false; caller = current; guard(rcu)(); for_each_thread(caller, thread) { /* Skip current, since it is initiating the sync. */ if (thread == caller) continue; /* Skip exited threads. */ if (thread->flags & PF_EXITING) continue; /* Skip threads that we already looked at. */ if (tsync_works_contains_task(works, thread)) continue; /* * We found a sibling thread that is not doing its task_work * yet, and which might spawn new threads before our task work * runs, so we need at least one more round in the outer loop. */ found_more_threads = true; ctx = tsync_works_provide(works, thread); if (!ctx) { /* * We ran out of preallocated contexts -- we need to * try again with this thread at a later time! * found_more_threads is already true at this point. */ break; } ctx->shared_ctx = shared_ctx; atomic_inc(&shared_ctx->num_preparing); atomic_inc(&shared_ctx->num_unfinished); init_task_work(&ctx->work, restrict_one_thread_callback); err = task_work_add(thread, &ctx->work, TWA_SIGNAL); if (unlikely(err)) { /* * task_work_add() only fails if the task is about to * exit. We checked that earlier, but it can happen as * a race. Resume without setting an error, as the * task is probably gone in the next loop iteration. */ tsync_works_trim(works); atomic_dec(&shared_ctx->num_preparing); atomic_dec(&shared_ctx->num_unfinished); } } return found_more_threads; } /* * cancel_tsync_works - cancel all task works where it is possible * * Task works can be canceled as long as they are still queued and have not * started running. If they get canceled, we decrement * shared_ctx->num_preparing and shared_ctx->num_unfished and mark the two * completions if needed, as if the task was never scheduled. */ static void cancel_tsync_works(const struct tsync_works *works, struct tsync_shared_context *shared_ctx) { size_t i; for (i = 0; i < works->size; i++) { if (WARN_ON_ONCE(!works->works[i]->task)) continue; if (!task_work_cancel(works->works[i]->task, &works->works[i]->work)) continue; /* After dequeueing, act as if the task work had executed. */ if (atomic_dec_return(&shared_ctx->num_preparing) == 0) complete_all(&shared_ctx->all_prepared); if (atomic_dec_return(&shared_ctx->num_unfinished) == 0) complete_all(&shared_ctx->all_finished); } } /* * restrict_sibling_threads - enables a Landlock policy for all sibling threads */ int landlock_restrict_sibling_threads(const struct cred *old_cred, const struct cred *new_cred) { int err; struct tsync_shared_context shared_ctx; struct tsync_works works = {}; size_t newly_discovered_threads; bool found_more_threads; atomic_set(&shared_ctx.preparation_error, 0); init_completion(&shared_ctx.all_prepared); init_completion(&shared_ctx.ready_to_commit); atomic_set(&shared_ctx.num_unfinished, 1); init_completion(&shared_ctx.all_finished); shared_ctx.old_cred = old_cred; shared_ctx.new_cred = new_cred; shared_ctx.set_no_new_privs = task_no_new_privs(current); /* * Serialize concurrent TSYNC operations to prevent deadlocks when * multiple threads call landlock_restrict_self() simultaneously. * If the lock is already held, we gracefully yield by restarting the * syscall. This allows the current thread to process pending * task_works before retrying. */ if (!down_write_trylock(¤t->signal->exec_update_lock)) return restart_syscall(); /* * We schedule a pseudo-signal task_work for each of the calling task's * sibling threads. In the task work, each thread: * * 1) runs prepare_creds() and writes back the error to * shared_ctx.preparation_error, if needed. * * 2) signals that it's done with prepare_creds() to the calling task. * (completion "all_prepared"). * * 3) waits for the completion "ready_to_commit". This is sent by the * calling task after ensuring that all sibling threads have done * with the "preparation" stage. * * After this barrier is reached, it's safe to read * shared_ctx.preparation_error. * * 4) reads shared_ctx.preparation_error and then either does * commit_creds() or abort_creds(). * * 5) signals that it's done altogether (barrier synchronization * "all_finished") * * Unlike seccomp, which modifies sibling tasks directly, we do not * need to acquire the cred_guard_mutex and sighand->siglock: * * - As in our case, all threads are themselves exchanging their own * struct cred through the credentials API, no locks are needed for * that. * - Our for_each_thread() loops are protected by RCU. * - We do not acquire a lock to keep the list of sibling threads * stable between our for_each_thread loops. If the list of * available sibling threads changes between these for_each_thread * loops, we make up for that by continuing to look for threads until * they are all discovered and have entered their task_work, where * they are unable to spawn new threads. */ do { /* In RCU read-lock, count the threads we need. */ newly_discovered_threads = count_additional_threads(&works); if (newly_discovered_threads == 0) break; /* done */ err = tsync_works_grow_by(&works, newly_discovered_threads, GFP_KERNEL_ACCOUNT); if (err) { atomic_set(&shared_ctx.preparation_error, err); break; } /* * The "all_prepared" barrier is used locally to the loop body, * this use of for_each_thread(). We can reset it on each loop * iteration because all previous loop iterations are done with * it already. * * num_preparing is initialized to 1 so that the counter can * not go to 0 and mark the completion as done before all task * works are registered. We decrement it at the end of the * loop body. */ atomic_set(&shared_ctx.num_preparing, 1); reinit_completion(&shared_ctx.all_prepared); /* * In RCU read-lock, schedule task work on newly discovered * sibling tasks. */ found_more_threads = schedule_task_work(&works, &shared_ctx); /* * Decrement num_preparing for current, to undo that we * initialized it to 1 a few lines above. */ if (atomic_dec_return(&shared_ctx.num_preparing) > 0) { if (wait_for_completion_interruptible( &shared_ctx.all_prepared)) { /* * In case of interruption, we need to retry * the system call. */ atomic_set(&shared_ctx.preparation_error, -ERESTARTNOINTR); /* * Opportunistic improvement: try to cancel task * works for tasks that did not start running * yet. We do not have a guarantee that it * cancels any of the enqueued task works * because task_work_run() might already have * dequeued them. */ cancel_tsync_works(&works, &shared_ctx); /* * Break the loop with error. The cleanup code * after the loop unblocks the remaining * task_works. */ break; } } } while (found_more_threads && !atomic_read(&shared_ctx.preparation_error)); /* * We now have either (a) all sibling threads blocking and in "prepared" * state in the task work, or (b) the preparation error is set. Ask all * threads to commit (or abort). */ complete_all(&shared_ctx.ready_to_commit); /* * Decrement num_unfinished for current, to undo that we initialized it * to 1 at the beginning. */ if (atomic_dec_return(&shared_ctx.num_unfinished) > 0) wait_for_completion(&shared_ctx.all_finished); tsync_works_release(&works); up_write(¤t->signal->exec_update_lock); return atomic_read(&shared_ctx.preparation_error); } |
| 35 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. */ /* * Quota change tags are associated with each transaction that allocates or * deallocates space. Those changes are accumulated locally to each node (in a * per-node file) and then are periodically synced to the quota file. This * avoids the bottleneck of constantly touching the quota file, but introduces * fuzziness in the current usage value of IDs that are being used on different * nodes in the cluster simultaneously. So, it is possible for a user on * multiple nodes to overrun their quota, but that overrun is controlable. * Since quota tags are part of transactions, there is no need for a quota check * program to be run on node crashes or anything like that. * * There are couple of knobs that let the administrator manage the quota * fuzziness. "quota_quantum" sets the maximum time a quota change can be * sitting on one node before being synced to the quota file. (The default is * 60 seconds.) Another knob, "quota_scale" controls how quickly the frequency * of quota file syncs increases as the user moves closer to their limit. The * more frequent the syncs, the more accurate the quota enforcement, but that * means that there is more contention between the nodes for the quota file. * The default value is one. This sets the maximum theoretical quota overrun * (with infinite node with infinite bandwidth) to twice the user's limit. (In * practice, the maximum overrun you see should be much less.) A "quota_scale" * number greater than one makes quota syncs more frequent and reduces the * maximum overrun. Numbers less than one (but greater than zero) make quota * syncs less frequent. * * GFS quotas also use per-ID Lock Value Blocks (LVBs) to cache the contents of * the quota file, so it is not being constantly read. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/sched.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> #include <linux/sort.h> #include <linux/fs.h> #include <linux/bio.h> #include <linux/gfs2_ondisk.h> #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/quota.h> #include <linux/dqblk_xfs.h> #include <linux/lockref.h> #include <linux/list_lru.h> #include <linux/rcupdate.h> #include <linux/rculist_bl.h> #include <linux/bit_spinlock.h> #include <linux/jhash.h> #include <linux/vmalloc.h> #include "gfs2.h" #include "incore.h" #include "bmap.h" #include "glock.h" #include "glops.h" #include "log.h" #include "meta_io.h" #include "quota.h" #include "rgrp.h" #include "super.h" #include "trans.h" #include "inode.h" #include "util.h" #define GFS2_QD_HASH_SHIFT 12 #define GFS2_QD_HASH_SIZE BIT(GFS2_QD_HASH_SHIFT) #define GFS2_QD_HASH_MASK (GFS2_QD_HASH_SIZE - 1) /* Lock order: qd_lock -> bucket lock -> qd->lockref.lock -> lru lock */ /* -> sd_bitmap_lock */ static DEFINE_SPINLOCK(qd_lock); struct list_lru gfs2_qd_lru; static struct hlist_bl_head qd_hash_table[GFS2_QD_HASH_SIZE]; static unsigned int gfs2_qd_hash(const struct gfs2_sbd *sdp, const struct kqid qid) { unsigned int h; h = jhash(&sdp, sizeof(struct gfs2_sbd *), 0); h = jhash(&qid, sizeof(struct kqid), h); return h & GFS2_QD_HASH_MASK; } static inline void spin_lock_bucket(unsigned int hash) { hlist_bl_lock(&qd_hash_table[hash]); } static inline void spin_unlock_bucket(unsigned int hash) { hlist_bl_unlock(&qd_hash_table[hash]); } static void gfs2_qd_dealloc(struct rcu_head *rcu) { struct gfs2_quota_data *qd = container_of(rcu, struct gfs2_quota_data, qd_rcu); struct gfs2_sbd *sdp = qd->qd_sbd; kmem_cache_free(gfs2_quotad_cachep, qd); if (atomic_dec_and_test(&sdp->sd_quota_count)) wake_up(&sdp->sd_kill_wait); } static void gfs2_qd_dispose(struct gfs2_quota_data *qd) { struct gfs2_sbd *sdp = qd->qd_sbd; spin_lock(&qd_lock); list_del(&qd->qd_list); spin_unlock(&qd_lock); spin_lock_bucket(qd->qd_hash); hlist_bl_del_rcu(&qd->qd_hlist); spin_unlock_bucket(qd->qd_hash); if (!gfs2_withdrawn(sdp)) { gfs2_assert_warn(sdp, !qd->qd_change); gfs2_assert_warn(sdp, !qd->qd_slot_ref); gfs2_assert_warn(sdp, !qd->qd_bh_count); } gfs2_glock_put(qd->qd_gl); call_rcu(&qd->qd_rcu, gfs2_qd_dealloc); } static void gfs2_qd_list_dispose(struct list_head *list) { struct gfs2_quota_data *qd; while (!list_empty(list)) { qd = list_first_entry(list, struct gfs2_quota_data, qd_lru); list_del(&qd->qd_lru); gfs2_qd_dispose(qd); } } static enum lru_status gfs2_qd_isolate(struct list_head *item, struct list_lru_one *lru, void *arg) { struct list_head *dispose = arg; struct gfs2_quota_data *qd = list_entry(item, struct gfs2_quota_data, qd_lru); enum lru_status status; if (!spin_trylock(&qd->qd_lockref.lock)) return LRU_SKIP; status = LRU_SKIP; if (qd->qd_lockref.count == 0) { lockref_mark_dead(&qd->qd_lockref); list_lru_isolate_move(lru, &qd->qd_lru, dispose); status = LRU_REMOVED; } spin_unlock(&qd->qd_lockref.lock); return status; } static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { LIST_HEAD(dispose); unsigned long freed; if (!(sc->gfp_mask & __GFP_FS)) return SHRINK_STOP; freed = list_lru_shrink_walk(&gfs2_qd_lru, sc, gfs2_qd_isolate, &dispose); gfs2_qd_list_dispose(&dispose); return freed; } static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { return vfs_pressure_ratio(list_lru_shrink_count(&gfs2_qd_lru, sc)); } static struct shrinker *gfs2_qd_shrinker; int __init gfs2_qd_shrinker_init(void) { gfs2_qd_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "gfs2-qd"); if (!gfs2_qd_shrinker) return -ENOMEM; gfs2_qd_shrinker->count_objects = gfs2_qd_shrink_count; gfs2_qd_shrinker->scan_objects = gfs2_qd_shrink_scan; shrinker_register(gfs2_qd_shrinker); return 0; } void gfs2_qd_shrinker_exit(void) { shrinker_free(gfs2_qd_shrinker); } static u64 qd2index(struct gfs2_quota_data *qd) { struct kqid qid = qd->qd_id; return (2 * (u64)from_kqid(&init_user_ns, qid)) + ((qid.type == USRQUOTA) ? 0 : 1); } static u64 qd2offset(struct gfs2_quota_data *qd) { return qd2index(qd) * sizeof(struct gfs2_quota); } static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, struct kqid qid) { struct gfs2_quota_data *qd; int error; qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS); if (!qd) return NULL; qd->qd_sbd = sdp; lockref_init(&qd->qd_lockref); qd->qd_id = qid; qd->qd_slot = -1; INIT_LIST_HEAD(&qd->qd_lru); qd->qd_hash = hash; error = gfs2_glock_get(sdp, qd2index(qd), &gfs2_quota_glops, CREATE, &qd->qd_gl); if (error) goto fail; return qd; fail: kmem_cache_free(gfs2_quotad_cachep, qd); return NULL; } static struct gfs2_quota_data *gfs2_qd_search_bucket(unsigned int hash, const struct gfs2_sbd *sdp, struct kqid qid) { struct gfs2_quota_data *qd; struct hlist_bl_node *h; hlist_bl_for_each_entry_rcu(qd, h, &qd_hash_table[hash], qd_hlist) { if (!qid_eq(qd->qd_id, qid)) continue; if (qd->qd_sbd != sdp) continue; if (lockref_get_not_dead(&qd->qd_lockref)) { list_lru_del_obj(&gfs2_qd_lru, &qd->qd_lru); return qd; } } return NULL; } static int qd_get(struct gfs2_sbd *sdp, struct kqid qid, struct gfs2_quota_data **qdp) { struct gfs2_quota_data *qd, *new_qd; unsigned int hash = gfs2_qd_hash(sdp, qid); rcu_read_lock(); *qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid); rcu_read_unlock(); if (qd) return 0; new_qd = qd_alloc(hash, sdp, qid); if (!new_qd) return -ENOMEM; spin_lock(&qd_lock); spin_lock_bucket(hash); *qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid); if (qd == NULL) { *qdp = new_qd; list_add(&new_qd->qd_list, &sdp->sd_quota_list); hlist_bl_add_head_rcu(&new_qd->qd_hlist, &qd_hash_table[hash]); atomic_inc(&sdp->sd_quota_count); } spin_unlock_bucket(hash); spin_unlock(&qd_lock); if (qd) { gfs2_glock_put(new_qd->qd_gl); kmem_cache_free(gfs2_quotad_cachep, new_qd); } return 0; } static void __qd_hold(struct gfs2_quota_data *qd) { struct gfs2_sbd *sdp = qd->qd_sbd; gfs2_assert(sdp, qd->qd_lockref.count > 0); qd->qd_lockref.count++; } static void qd_put(struct gfs2_quota_data *qd) { struct gfs2_sbd *sdp; if (lockref_put_or_lock(&qd->qd_lockref)) return; BUG_ON(__lockref_is_dead(&qd->qd_lockref)); sdp = qd->qd_sbd; if (unlikely(!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))) { lockref_mark_dead(&qd->qd_lockref); spin_unlock(&qd->qd_lockref.lock); list_lru_del_obj(&gfs2_qd_lru, &qd->qd_lru); gfs2_qd_dispose(qd); return; } qd->qd_lockref.count = 0; list_lru_add_obj(&gfs2_qd_lru, &qd->qd_lru); spin_unlock(&qd->qd_lockref.lock); } static int slot_get(struct gfs2_quota_data *qd) { struct gfs2_sbd *sdp = qd->qd_sbd; unsigned int bit; int error = 0; spin_lock(&sdp->sd_bitmap_lock); if (qd->qd_slot_ref == 0) { bit = find_first_zero_bit(sdp->sd_quota_bitmap, sdp->sd_quota_slots); if (bit >= sdp->sd_quota_slots) { error = -ENOSPC; goto out; } set_bit(bit, sdp->sd_quota_bitmap); qd->qd_slot = bit; } qd->qd_slot_ref++; out: spin_unlock(&sdp->sd_bitmap_lock); return error; } static void slot_hold(struct gfs2_quota_data *qd) { struct gfs2_sbd *sdp = qd->qd_sbd; spin_lock(&sdp->sd_bitmap_lock); gfs2_assert(sdp, qd->qd_slot_ref); qd->qd_slot_ref++; spin_unlock(&sdp->sd_bitmap_lock); } static void slot_put(struct gfs2_quota_data *qd) { struct gfs2_sbd *sdp = qd->qd_sbd; spin_lock(&sdp->sd_bitmap_lock); gfs2_assert(sdp, qd->qd_slot_ref); if (!--qd->qd_slot_ref) { BUG_ON(!test_and_clear_bit(qd->qd_slot, sdp->sd_quota_bitmap)); qd->qd_slot = -1; } spin_unlock(&sdp->sd_bitmap_lock); } static int bh_get(struct gfs2_quota_data *qd) { struct gfs2_sbd *sdp = qd->qd_sbd; struct inode *inode = sdp->sd_qc_inode; struct gfs2_inode *ip = GFS2_I(inode); unsigned int block, offset; struct buffer_head *bh = NULL; struct iomap iomap = { }; int error; spin_lock(&qd->qd_lockref.lock); if (qd->qd_bh_count) { qd->qd_bh_count++; spin_unlock(&qd->qd_lockref.lock); return 0; } spin_unlock(&qd->qd_lockref.lock); block = qd->qd_slot / sdp->sd_qc_per_block; offset = qd->qd_slot % sdp->sd_qc_per_block; error = gfs2_iomap_get(inode, (loff_t)block << inode->i_blkbits, i_blocksize(inode), &iomap); if (error) return error; error = -ENOENT; if (iomap.type != IOMAP_MAPPED) return error; error = gfs2_meta_read(ip->i_gl, iomap.addr >> inode->i_blkbits, DIO_WAIT, 0, &bh); if (error) return error; error = -EIO; if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC)) goto out; spin_lock(&qd->qd_lockref.lock); if (qd->qd_bh == NULL) { qd->qd_bh = bh; qd->qd_bh_qc = (struct gfs2_quota_change *) (bh->b_data + sizeof(struct gfs2_meta_header) + offset * sizeof(struct gfs2_quota_change)); bh = NULL; } qd->qd_bh_count++; spin_unlock(&qd->qd_lockref.lock); error = 0; out: brelse(bh); return error; } static void bh_put(struct gfs2_quota_data *qd) { struct gfs2_sbd *sdp = qd->qd_sbd; struct buffer_head *bh = NULL; spin_lock(&qd->qd_lockref.lock); gfs2_assert(sdp, qd->qd_bh_count); if (!--qd->qd_bh_count) { bh = qd->qd_bh; qd->qd_bh = NULL; qd->qd_bh_qc = NULL; } spin_unlock(&qd->qd_lockref.lock); brelse(bh); } static bool qd_grab_sync(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd, u64 sync_gen) { bool ret = false; spin_lock(&qd->qd_lockref.lock); if (test_bit(QDF_LOCKED, &qd->qd_flags) || !test_bit(QDF_CHANGE, &qd->qd_flags) || qd->qd_sync_gen >= sync_gen) goto out; if (__lockref_is_dead(&qd->qd_lockref)) goto out; qd->qd_lockref.count++; list_move_tail(&qd->qd_list, &sdp->sd_quota_list); set_bit(QDF_LOCKED, &qd->qd_flags); qd->qd_change_sync = qd->qd_change; slot_hold(qd); ret = true; out: spin_unlock(&qd->qd_lockref.lock); return ret; } static void qd_ungrab_sync(struct gfs2_quota_data *qd) { clear_bit(QDF_LOCKED, &qd->qd_flags); slot_put(qd); qd_put(qd); } static void qdsb_put(struct gfs2_quota_data *qd) { bh_put(qd); slot_put(qd); qd_put(qd); } static void qd_unlock(struct gfs2_quota_data *qd) { spin_lock(&qd->qd_lockref.lock); gfs2_assert_warn(qd->qd_sbd, test_bit(QDF_LOCKED, &qd->qd_flags)); clear_bit(QDF_LOCKED, &qd->qd_flags); spin_unlock(&qd->qd_lockref.lock); qdsb_put(qd); } static int qdsb_get(struct gfs2_sbd *sdp, struct kqid qid, struct gfs2_quota_data **qdp) { int error; error = qd_get(sdp, qid, qdp); if (error) return error; error = slot_get(*qdp); if (error) goto fail; error = bh_get(*qdp); if (error) goto fail_slot; return 0; fail_slot: slot_put(*qdp); fail: qd_put(*qdp); return error; } /** * gfs2_qa_get - make sure we have a quota allocations data structure, * if necessary * @ip: the inode for this reservation */ int gfs2_qa_get(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct inode *inode = &ip->i_inode; if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) return 0; spin_lock(&inode->i_lock); if (ip->i_qadata == NULL) { struct gfs2_qadata *tmp; spin_unlock(&inode->i_lock); tmp = kmem_cache_zalloc(gfs2_qadata_cachep, GFP_NOFS); if (!tmp) return -ENOMEM; spin_lock(&inode->i_lock); if (ip->i_qadata == NULL) ip->i_qadata = tmp; else kmem_cache_free(gfs2_qadata_cachep, tmp); } ip->i_qadata->qa_ref++; spin_unlock(&inode->i_lock); return 0; } void gfs2_qa_put(struct gfs2_inode *ip) { struct inode *inode = &ip->i_inode; spin_lock(&inode->i_lock); if (ip->i_qadata && --ip->i_qadata->qa_ref == 0) { kmem_cache_free(gfs2_qadata_cachep, ip->i_qadata); ip->i_qadata = NULL; } spin_unlock(&inode->i_lock); } int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_quota_data **qd; int error; if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) return 0; error = gfs2_qa_get(ip); if (error) return error; qd = ip->i_qadata->qa_qd; if (gfs2_assert_warn(sdp, !ip->i_qadata->qa_qd_num) || gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags))) { error = -EIO; gfs2_qa_put(ip); goto out; } error = qdsb_get(sdp, make_kqid_uid(ip->i_inode.i_uid), qd); if (error) goto out_unhold; ip->i_qadata->qa_qd_num++; qd++; error = qdsb_get(sdp, make_kqid_gid(ip->i_inode.i_gid), qd); if (error) goto out_unhold; ip->i_qadata->qa_qd_num++; qd++; if (!uid_eq(uid, NO_UID_QUOTA_CHANGE) && !uid_eq(uid, ip->i_inode.i_uid)) { error = qdsb_get(sdp, make_kqid_uid(uid), qd); if (error) goto out_unhold; ip->i_qadata->qa_qd_num++; qd++; } if (!gid_eq(gid, NO_GID_QUOTA_CHANGE) && !gid_eq(gid, ip->i_inode.i_gid)) { error = qdsb_get(sdp, make_kqid_gid(gid), qd); if (error) goto out_unhold; ip->i_qadata->qa_qd_num++; qd++; } out_unhold: if (error) gfs2_quota_unhold(ip); out: return error; } void gfs2_quota_unhold(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); u32 x; if (ip->i_qadata == NULL) return; gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)); for (x = 0; x < ip->i_qadata->qa_qd_num; x++) { qdsb_put(ip->i_qadata->qa_qd[x]); ip->i_qadata->qa_qd[x] = NULL; } ip->i_qadata->qa_qd_num = 0; gfs2_qa_put(ip); } static int sort_qd(const void *a, const void *b) { const struct gfs2_quota_data *qd_a = *(const struct gfs2_quota_data **)a; const struct gfs2_quota_data *qd_b = *(const struct gfs2_quota_data **)b; if (qid_lt(qd_a->qd_id, qd_b->qd_id)) return -1; if (qid_lt(qd_b->qd_id, qd_a->qd_id)) return 1; return 0; } static void do_qc(struct gfs2_quota_data *qd, s64 change) { struct gfs2_sbd *sdp = qd->qd_sbd; struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); struct gfs2_quota_change *qc = qd->qd_bh_qc; bool needs_put = false; s64 x; gfs2_trans_add_meta(ip->i_gl, qd->qd_bh); /* * The QDF_CHANGE flag indicates that the slot in the quota change file * is used. Here, we use the value of qc->qc_change when the slot is * used, and we assume a value of 0 otherwise. */ spin_lock(&qd->qd_lockref.lock); x = 0; if (test_bit(QDF_CHANGE, &qd->qd_flags)) x = be64_to_cpu(qc->qc_change); x += change; qd->qd_change += change; if (!x && test_bit(QDF_CHANGE, &qd->qd_flags)) { /* The slot in the quota change file becomes unused. */ clear_bit(QDF_CHANGE, &qd->qd_flags); qc->qc_flags = 0; qc->qc_id = 0; needs_put = true; } else if (x && !test_bit(QDF_CHANGE, &qd->qd_flags)) { /* The slot in the quota change file becomes used. */ set_bit(QDF_CHANGE, &qd->qd_flags); __qd_hold(qd); slot_hold(qd); qc->qc_flags = 0; if (qd->qd_id.type == USRQUOTA) qc->qc_flags = cpu_to_be32(GFS2_QCF_USER); qc->qc_id = cpu_to_be32(from_kqid(&init_user_ns, qd->qd_id)); } qc->qc_change = cpu_to_be64(x); spin_unlock(&qd->qd_lockref.lock); if (needs_put) { slot_put(qd); qd_put(qd); } if (change < 0) /* Reset quiet flag if we freed some blocks */ clear_bit(QDF_QMSG_QUIET, &qd->qd_flags); } static int gfs2_write_buf_to_page(struct gfs2_sbd *sdp, unsigned long index, unsigned off, void *buf, unsigned bytes) { struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); struct inode *inode = &ip->i_inode; struct address_space *mapping = inode->i_mapping; struct folio *folio; struct buffer_head *bh; u64 blk; unsigned bsize = sdp->sd_sb.sb_bsize, bnum = 0, boff = 0; unsigned to_write = bytes, pg_off = off; blk = index << (PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift); boff = off % bsize; folio = filemap_grab_folio(mapping, index); if (IS_ERR(folio)) return PTR_ERR(folio); bh = folio_buffers(folio); if (!bh) bh = create_empty_buffers(folio, bsize, 0); for (;;) { /* Find the beginning block within the folio */ if (pg_off >= ((bnum * bsize) + bsize)) { bh = bh->b_this_page; bnum++; blk++; continue; } if (!buffer_mapped(bh)) { gfs2_block_map(inode, blk, bh, 1); if (!buffer_mapped(bh)) goto unlock_out; /* If it's a newly allocated disk block, zero it */ if (buffer_new(bh)) folio_zero_range(folio, bnum * bsize, bh->b_size); } if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); if (bh_read(bh, REQ_META | REQ_PRIO) < 0) goto unlock_out; gfs2_trans_add_data(ip->i_gl, bh); /* If we need to write to the next block as well */ if (to_write > (bsize - boff)) { pg_off += (bsize - boff); to_write -= (bsize - boff); boff = pg_off % bsize; continue; } break; } /* Write to the folio, now that we have setup the buffer(s) */ memcpy_to_folio(folio, off, buf, bytes); flush_dcache_folio(folio); folio_unlock(folio); folio_put(folio); return 0; unlock_out: folio_unlock(folio); folio_put(folio); return -EIO; } static int gfs2_write_disk_quota(struct gfs2_sbd *sdp, struct gfs2_quota *qp, loff_t loc) { unsigned long pg_beg; unsigned pg_off, nbytes, overflow = 0; int error; void *ptr; nbytes = sizeof(struct gfs2_quota); pg_beg = loc >> PAGE_SHIFT; pg_off = offset_in_page(loc); /* If the quota straddles a page boundary, split the write in two */ if ((pg_off + nbytes) > PAGE_SIZE) overflow = (pg_off + nbytes) - PAGE_SIZE; ptr = qp; error = gfs2_write_buf_to_page(sdp, pg_beg, pg_off, ptr, nbytes - overflow); /* If there's an overflow, write the remaining bytes to the next page */ if (!error && overflow) error = gfs2_write_buf_to_page(sdp, pg_beg + 1, 0, ptr + nbytes - overflow, overflow); return error; } /** * gfs2_adjust_quota - adjust record of current block usage * @sdp: The superblock * @loc: Offset of the entry in the quota file * @change: The amount of usage change to record * @qd: The quota data * @fdq: The updated limits to record * * This function was mostly borrowed from gfs2_block_truncate_page which was * in turn mostly borrowed from ext3 * * Returns: 0 or -ve on error */ static int gfs2_adjust_quota(struct gfs2_sbd *sdp, loff_t loc, s64 change, struct gfs2_quota_data *qd, struct qc_dqblk *fdq) { struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); struct inode *inode = &ip->i_inode; struct gfs2_quota q; int err; u64 size; if (gfs2_is_stuffed(ip)) { err = gfs2_unstuff_dinode(ip); if (err) return err; } memset(&q, 0, sizeof(struct gfs2_quota)); err = gfs2_internal_read(ip, (char *)&q, &loc, sizeof(q)); if (err < 0) return err; loc -= sizeof(q); /* gfs2_internal_read would've advanced the loc ptr */ be64_add_cpu(&q.qu_value, change); if (((s64)be64_to_cpu(q.qu_value)) < 0) q.qu_value = 0; /* Never go negative on quota usage */ spin_lock(&qd->qd_lockref.lock); qd->qd_qb.qb_value = q.qu_value; if (fdq) { if (fdq->d_fieldmask & QC_SPC_SOFT) { q.qu_warn = cpu_to_be64(fdq->d_spc_softlimit >> sdp->sd_sb.sb_bsize_shift); qd->qd_qb.qb_warn = q.qu_warn; } if (fdq->d_fieldmask & QC_SPC_HARD) { q.qu_limit = cpu_to_be64(fdq->d_spc_hardlimit >> sdp->sd_sb.sb_bsize_shift); qd->qd_qb.qb_limit = q.qu_limit; } if (fdq->d_fieldmask & QC_SPACE) { q.qu_value = cpu_to_be64(fdq->d_space >> sdp->sd_sb.sb_bsize_shift); qd->qd_qb.qb_value = q.qu_value; } } spin_unlock(&qd->qd_lockref.lock); err = gfs2_write_disk_quota(sdp, &q, loc); if (!err) { size = loc + sizeof(struct gfs2_quota); if (size > inode->i_size) i_size_write(inode, size); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); set_bit(QDF_REFRESH, &qd->qd_flags); } return err; } static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda, u64 sync_gen) { struct gfs2_sbd *sdp = (*qda)->qd_sbd; struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); struct gfs2_alloc_parms ap = {}; unsigned int data_blocks, ind_blocks; struct gfs2_holder *ghs, i_gh; unsigned int qx, x; struct gfs2_quota_data *qd; unsigned reserved; loff_t offset; unsigned int nalloc = 0, blocks; int error; gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), &data_blocks, &ind_blocks); ghs = kmalloc_objs(struct gfs2_holder, num_qd, GFP_NOFS); if (!ghs) return -ENOMEM; sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL); inode_lock(&ip->i_inode); for (qx = 0; qx < num_qd; qx++) { error = gfs2_glock_nq_init(qda[qx]->qd_gl, LM_ST_EXCLUSIVE, GL_NOCACHE, &ghs[qx]); if (error) goto out_dq; } error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); if (error) goto out_dq; for (x = 0; x < num_qd; x++) { offset = qd2offset(qda[x]); if (gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota))) nalloc++; } /* * 1 blk for unstuffing inode if stuffed. We add this extra * block to the reservation unconditionally. If the inode * doesn't need unstuffing, the block will be released to the * rgrp since it won't be allocated during the transaction */ /* +3 in the end for unstuffing block, inode size update block * and another block in case quota straddles page boundary and * two blocks need to be updated instead of 1 */ blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3; reserved = 1 + (nalloc * (data_blocks + ind_blocks)); ap.target = reserved; error = gfs2_inplace_reserve(ip, &ap); if (error) goto out_alloc; if (nalloc) blocks += gfs2_rg_blocks(ip, reserved) + nalloc * ind_blocks + RES_STATFS; error = gfs2_trans_begin(sdp, blocks, 0); if (error) goto out_ipres; for (x = 0; x < num_qd; x++) { qd = qda[x]; offset = qd2offset(qd); error = gfs2_adjust_quota(sdp, offset, qd->qd_change_sync, qd, NULL); if (error) goto out_end_trans; do_qc(qd, -qd->qd_change_sync); set_bit(QDF_REFRESH, &qd->qd_flags); } out_end_trans: gfs2_trans_end(sdp); out_ipres: gfs2_inplace_release(ip); out_alloc: gfs2_glock_dq_uninit(&i_gh); out_dq: while (qx--) gfs2_glock_dq_uninit(&ghs[qx]); inode_unlock(&ip->i_inode); kfree(ghs); gfs2_log_flush(glock_sbd(ip->i_gl), ip->i_gl, GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_DO_SYNC); if (!error) { for (x = 0; x < num_qd; x++) { qd = qda[x]; spin_lock(&qd->qd_lockref.lock); if (qd->qd_sync_gen < sync_gen) qd->qd_sync_gen = sync_gen; spin_unlock(&qd->qd_lockref.lock); } } return error; } static int update_qd(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd) { struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); struct gfs2_quota q; struct gfs2_quota_lvb *qlvb; loff_t pos; int error; memset(&q, 0, sizeof(struct gfs2_quota)); pos = qd2offset(qd); error = gfs2_internal_read(ip, (char *)&q, &pos, sizeof(q)); if (error < 0) return error; qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr; qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC); qlvb->__pad = 0; qlvb->qb_limit = q.qu_limit; qlvb->qb_warn = q.qu_warn; qlvb->qb_value = q.qu_value; spin_lock(&qd->qd_lockref.lock); qd->qd_qb = *qlvb; spin_unlock(&qd->qd_lockref.lock); return 0; } static int do_glock(struct gfs2_quota_data *qd, int force_refresh, struct gfs2_holder *q_gh) { struct gfs2_sbd *sdp = qd->qd_sbd; struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); struct gfs2_holder i_gh; int error; gfs2_assert_warn(sdp, sdp == glock_sbd(qd->qd_gl)); restart: error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh); if (error) return error; if (test_and_clear_bit(QDF_REFRESH, &qd->qd_flags)) force_refresh = FORCE; spin_lock(&qd->qd_lockref.lock); qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr; spin_unlock(&qd->qd_lockref.lock); if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) { gfs2_glock_dq_uninit(q_gh); error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE, GL_NOCACHE, q_gh); if (error) return error; error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); if (error) goto fail; error = update_qd(sdp, qd); if (error) goto fail_gunlock; gfs2_glock_dq_uninit(&i_gh); gfs2_glock_dq_uninit(q_gh); force_refresh = 0; goto restart; } return 0; fail_gunlock: gfs2_glock_dq_uninit(&i_gh); fail: gfs2_glock_dq_uninit(q_gh); return error; } int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_quota_data *qd; u32 x; int error; if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) return 0; error = gfs2_quota_hold(ip, uid, gid); if (error) return error; sort(ip->i_qadata->qa_qd, ip->i_qadata->qa_qd_num, sizeof(struct gfs2_quota_data *), sort_qd, NULL); for (x = 0; x < ip->i_qadata->qa_qd_num; x++) { qd = ip->i_qadata->qa_qd[x]; error = do_glock(qd, NO_FORCE, &ip->i_qadata->qa_qd_ghs[x]); if (error) break; } if (!error) set_bit(GIF_QD_LOCKED, &ip->i_flags); else { while (x--) gfs2_glock_dq_uninit(&ip->i_qadata->qa_qd_ghs[x]); gfs2_quota_unhold(ip); } return error; } static bool need_sync(struct gfs2_quota_data *qd) { struct gfs2_sbd *sdp = qd->qd_sbd; struct gfs2_tune *gt = &sdp->sd_tune; s64 value, change, limit; unsigned int num, den; int ret = false; spin_lock(&qd->qd_lockref.lock); if (!qd->qd_qb.qb_limit) goto out; change = qd->qd_change; if (change <= 0) goto out; value = (s64)be64_to_cpu(qd->qd_qb.qb_value); limit = (s64)be64_to_cpu(qd->qd_qb.qb_limit); if (value >= limit) goto out; spin_lock(>->gt_spin); num = gt->gt_quota_scale_num; den = gt->gt_quota_scale_den; spin_unlock(>->gt_spin); change *= gfs2_jindex_size(sdp) * num; change = div_s64(change, den); if (value + change < limit) goto out; ret = true; out: spin_unlock(&qd->qd_lockref.lock); return ret; } void gfs2_quota_unlock(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_quota_data *qda[2 * GFS2_MAXQUOTAS]; unsigned int count = 0; u32 x; if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags)) return; for (x = 0; x < ip->i_qadata->qa_qd_num; x++) { struct gfs2_quota_data *qd; bool sync; int error; qd = ip->i_qadata->qa_qd[x]; sync = need_sync(qd); gfs2_glock_dq_uninit(&ip->i_qadata->qa_qd_ghs[x]); if (!sync) continue; spin_lock(&qd_lock); sync = qd_grab_sync(sdp, qd, U64_MAX); spin_unlock(&qd_lock); if (!sync) continue; gfs2_assert_warn(sdp, qd->qd_change_sync); error = bh_get(qd); if (error) { qd_ungrab_sync(qd); continue; } qda[count++] = qd; } if (count) { u64 sync_gen = READ_ONCE(sdp->sd_quota_sync_gen); do_sync(count, qda, sync_gen); for (x = 0; x < count; x++) qd_unlock(qda[x]); } gfs2_quota_unhold(ip); } #define MAX_LINE 256 static void print_message(struct gfs2_quota_data *qd, char *type) { struct gfs2_sbd *sdp = qd->qd_sbd; if (sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET) { fs_info(sdp, "quota %s for %s %u\n", type, (qd->qd_id.type == USRQUOTA) ? "user" : "group", from_kqid(&init_user_ns, qd->qd_id)); } } /** * gfs2_quota_check - check if allocating new blocks will exceed quota * @ip: The inode for which this check is being performed * @uid: The uid to check against * @gid: The gid to check against * @ap: The allocation parameters. ap->target contains the requested * blocks. ap->min_target, if set, contains the minimum blks * requested. * * Returns: 0 on success. * min_req = ap->min_target ? ap->min_target : ap->target; * quota must allow at least min_req blks for success and * ap->allowed is set to the number of blocks allowed * * -EDQUOT otherwise, quota violation. ap->allowed is set to number * of blocks available. */ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid, struct gfs2_alloc_parms *ap) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_quota_data *qd; s64 value, warn, limit; u32 x; int error = 0; ap->allowed = UINT_MAX; /* Assume we are permitted a whole lot */ if (!test_bit(GIF_QD_LOCKED, &ip->i_flags)) return 0; for (x = 0; x < ip->i_qadata->qa_qd_num; x++) { qd = ip->i_qadata->qa_qd[x]; if (!(qid_eq(qd->qd_id, make_kqid_uid(uid)) || qid_eq(qd->qd_id, make_kqid_gid(gid)))) continue; spin_lock(&qd->qd_lockref.lock); warn = (s64)be64_to_cpu(qd->qd_qb.qb_warn); limit = (s64)be64_to_cpu(qd->qd_qb.qb_limit); value = (s64)be64_to_cpu(qd->qd_qb.qb_value); value += qd->qd_change; spin_unlock(&qd->qd_lockref.lock); if (limit > 0 && (limit - value) < ap->allowed) ap->allowed = limit - value; /* If we can't meet the target */ if (limit && limit < (value + (s64)ap->target)) { /* If no min_target specified or we don't meet * min_target, return -EDQUOT */ if (!ap->min_target || ap->min_target > ap->allowed) { if (!test_and_set_bit(QDF_QMSG_QUIET, &qd->qd_flags)) { print_message(qd, "exceeded"); quota_send_warning(qd->qd_id, sdp->sd_vfs->s_dev, QUOTA_NL_BHARDWARN); } error = -EDQUOT; break; } } else if (warn && warn < value && time_after_eq(jiffies, qd->qd_last_warn + gfs2_tune_get(sdp, gt_quota_warn_period) * HZ)) { quota_send_warning(qd->qd_id, sdp->sd_vfs->s_dev, QUOTA_NL_BSOFTWARN); print_message(qd, "warning"); error = 0; qd->qd_last_warn = jiffies; } } return error; } void gfs2_quota_change(struct gfs2_inode *ip, s64 change, kuid_t uid, kgid_t gid) { struct gfs2_quota_data *qd; u32 x; struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF || gfs2_assert_warn(sdp, change)) return; if (ip->i_diskflags & GFS2_DIF_SYSTEM) return; if (gfs2_assert_withdraw(sdp, ip->i_qadata && ip->i_qadata->qa_ref > 0)) return; for (x = 0; x < ip->i_qadata->qa_qd_num; x++) { qd = ip->i_qadata->qa_qd[x]; if (qid_eq(qd->qd_id, make_kqid_uid(uid)) || qid_eq(qd->qd_id, make_kqid_gid(gid))) { do_qc(qd, change); } } } int gfs2_quota_sync(struct super_block *sb, int type) { struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_quota_data **qda; unsigned int max_qd = PAGE_SIZE / sizeof(struct gfs2_holder); u64 sync_gen; int error = 0; if (sb_rdonly(sdp->sd_vfs)) return 0; qda = kzalloc_objs(struct gfs2_quota_data *, max_qd); if (!qda) return -ENOMEM; mutex_lock(&sdp->sd_quota_sync_mutex); sync_gen = sdp->sd_quota_sync_gen + 1; do { struct gfs2_quota_data *iter; unsigned int num_qd = 0; unsigned int x; spin_lock(&qd_lock); list_for_each_entry(iter, &sdp->sd_quota_list, qd_list) { if (qd_grab_sync(sdp, iter, sync_gen)) { qda[num_qd++] = iter; if (num_qd == max_qd) break; } } spin_unlock(&qd_lock); if (!num_qd) break; for (x = 0; x < num_qd; x++) { error = bh_get(qda[x]); if (!error) continue; while (x < num_qd) qd_ungrab_sync(qda[--num_qd]); break; } if (!error) { WRITE_ONCE(sdp->sd_quota_sync_gen, sync_gen); error = do_sync(num_qd, qda, sync_gen); } for (x = 0; x < num_qd; x++) qd_unlock(qda[x]); } while (!error); mutex_unlock(&sdp->sd_quota_sync_mutex); kfree(qda); return error; } int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid) { struct gfs2_quota_data *qd; struct gfs2_holder q_gh; int error; error = qd_get(sdp, qid, &qd); if (error) return error; error = do_glock(qd, FORCE, &q_gh); if (!error) gfs2_glock_dq_uninit(&q_gh); qd_put(qd); return error; } int gfs2_quota_init(struct gfs2_sbd *sdp) { struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); u64 size = i_size_read(sdp->sd_qc_inode); unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift; unsigned int x, slot = 0; unsigned int found = 0; unsigned int hash; unsigned int bm_size; struct buffer_head *bh; u64 dblock; u32 extlen = 0; int error; if (gfs2_check_internal_file_size(sdp->sd_qc_inode, 1, 64 << 20)) return -EIO; sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block; bm_size = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * sizeof(unsigned long)); bm_size *= sizeof(unsigned long); error = -ENOMEM; sdp->sd_quota_bitmap = kzalloc(bm_size, GFP_NOFS | __GFP_NOWARN); if (sdp->sd_quota_bitmap == NULL) sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS | __GFP_ZERO); if (!sdp->sd_quota_bitmap) return error; for (x = 0; x < blocks; x++) { struct gfs2_quota_change *qc; unsigned int y; if (!extlen) { extlen = 32; error = gfs2_get_extent(&ip->i_inode, x, &dblock, &extlen); if (error) goto fail; } error = -EIO; bh = gfs2_meta_ra(ip->i_gl, dblock, extlen); if (!bh) goto fail; if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC)) goto fail_brelse; qc = (struct gfs2_quota_change *)(bh->b_data + sizeof(struct gfs2_meta_header)); for (y = 0; y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots; y++, slot++) { struct gfs2_quota_data *old_qd, *qd; s64 qc_change = be64_to_cpu(qc->qc_change); u32 qc_flags = be32_to_cpu(qc->qc_flags); enum quota_type qtype = (qc_flags & GFS2_QCF_USER) ? USRQUOTA : GRPQUOTA; struct kqid qc_id = make_kqid(&init_user_ns, qtype, be32_to_cpu(qc->qc_id)); qc++; if (!qc_change) continue; hash = gfs2_qd_hash(sdp, qc_id); qd = qd_alloc(hash, sdp, qc_id); if (qd == NULL) goto fail_brelse; qd->qd_lockref.count = 0; set_bit(QDF_CHANGE, &qd->qd_flags); qd->qd_change = qc_change; qd->qd_slot = slot; qd->qd_slot_ref = 1; spin_lock(&qd_lock); spin_lock_bucket(hash); old_qd = gfs2_qd_search_bucket(hash, sdp, qc_id); if (old_qd) { fs_err(sdp, "Corruption found in quota_change%u" "file: duplicate identifier in " "slot %u\n", sdp->sd_jdesc->jd_jid, slot); spin_unlock_bucket(hash); spin_unlock(&qd_lock); qd_put(old_qd); gfs2_glock_put(qd->qd_gl); kmem_cache_free(gfs2_quotad_cachep, qd); /* zero out the duplicate slot */ lock_buffer(bh); memset(qc, 0, sizeof(*qc)); mark_buffer_dirty(bh); unlock_buffer(bh); continue; } BUG_ON(test_and_set_bit(slot, sdp->sd_quota_bitmap)); list_add(&qd->qd_list, &sdp->sd_quota_list); atomic_inc(&sdp->sd_quota_count); hlist_bl_add_head_rcu(&qd->qd_hlist, &qd_hash_table[hash]); spin_unlock_bucket(hash); spin_unlock(&qd_lock); found++; } if (buffer_dirty(bh)) sync_dirty_buffer(bh); brelse(bh); dblock++; extlen--; } if (found) fs_info(sdp, "found %u quota changes\n", found); return 0; fail_brelse: if (buffer_dirty(bh)) sync_dirty_buffer(bh); brelse(bh); fail: gfs2_quota_cleanup(sdp); return error; } void gfs2_quota_cleanup(struct gfs2_sbd *sdp) { struct gfs2_quota_data *qd; LIST_HEAD(dispose); int count; BUG_ON(!test_bit(SDF_NORECOVERY, &sdp->sd_flags) && test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)); spin_lock(&qd_lock); list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { spin_lock(&qd->qd_lockref.lock); if (qd->qd_lockref.count != 0) { spin_unlock(&qd->qd_lockref.lock); continue; } lockref_mark_dead(&qd->qd_lockref); spin_unlock(&qd->qd_lockref.lock); list_lru_del_obj(&gfs2_qd_lru, &qd->qd_lru); list_add(&qd->qd_lru, &dispose); } spin_unlock(&qd_lock); gfs2_qd_list_dispose(&dispose); wait_event_timeout(sdp->sd_kill_wait, (count = atomic_read(&sdp->sd_quota_count)) == 0, HZ * 60); if (count != 0) fs_err(sdp, "%d left-over quota data objects\n", count); kvfree(sdp->sd_quota_bitmap); sdp->sd_quota_bitmap = NULL; } static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error) { if (error == 0 || error == -EROFS) return; if (!gfs2_withdrawn(sdp)) { if (!cmpxchg(&sdp->sd_log_error, 0, error)) fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error); wake_up(&sdp->sd_logd_waitq); } } void gfs2_wake_up_statfs(struct gfs2_sbd *sdp) { if (!sdp->sd_statfs_force_sync) { sdp->sd_statfs_force_sync = 1; wake_up(&sdp->sd_quota_wait); } } /** * gfs2_quotad - Write cached quota changes into the quota file * @data: Pointer to GFS2 superblock * */ int gfs2_quotad(void *data) { struct gfs2_sbd *sdp = data; unsigned long now = jiffies; unsigned long statfs_deadline = now; unsigned long quotad_deadline = now; set_freezable(); while (!kthread_should_stop()) { unsigned long t; if (gfs2_withdrawn(sdp)) break; now = jiffies; if (sdp->sd_statfs_force_sync || time_after(now, statfs_deadline)) { unsigned int quantum; int error; /* Update the master statfs file */ error = gfs2_statfs_sync(sdp->sd_vfs, 0); quotad_error(sdp, "statfs", error); quantum = gfs2_tune_get(sdp, gt_statfs_quantum); statfs_deadline = now + quantum * HZ; } if (time_after(now, quotad_deadline)) { unsigned int quantum; int error; /* Update the quota file */ error = gfs2_quota_sync(sdp->sd_vfs, 0); quotad_error(sdp, "sync", error); quantum = gfs2_tune_get(sdp, gt_quota_quantum); quotad_deadline = now + quantum * HZ; } t = min(statfs_deadline - now, quotad_deadline - now); wait_event_freezable_timeout(sdp->sd_quota_wait, sdp->sd_statfs_force_sync || gfs2_withdrawn(sdp) || kthread_should_stop(), t); if (sdp->sd_statfs_force_sync) t = 0; } return 0; } static int gfs2_quota_get_state(struct super_block *sb, struct qc_state *state) { struct gfs2_sbd *sdp = sb->s_fs_info; memset(state, 0, sizeof(*state)); switch (sdp->sd_args.ar_quota) { case GFS2_QUOTA_QUIET: fallthrough; case GFS2_QUOTA_ON: state->s_state[USRQUOTA].flags |= QCI_LIMITS_ENFORCED; state->s_state[GRPQUOTA].flags |= QCI_LIMITS_ENFORCED; fallthrough; case GFS2_QUOTA_ACCOUNT: state->s_state[USRQUOTA].flags |= QCI_ACCT_ENABLED | QCI_SYSFILE; state->s_state[GRPQUOTA].flags |= QCI_ACCT_ENABLED | QCI_SYSFILE; break; case GFS2_QUOTA_OFF: break; } if (sdp->sd_quota_inode) { state->s_state[USRQUOTA].ino = GFS2_I(sdp->sd_quota_inode)->i_no_addr; state->s_state[USRQUOTA].blocks = sdp->sd_quota_inode->i_blocks; } state->s_state[USRQUOTA].nextents = 1; /* unsupported */ state->s_state[GRPQUOTA] = state->s_state[USRQUOTA]; state->s_incoredqs = list_lru_count(&gfs2_qd_lru); return 0; } static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid, struct qc_dqblk *fdq) { struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_quota_lvb *qlvb; struct gfs2_quota_data *qd; struct gfs2_holder q_gh; int error; memset(fdq, 0, sizeof(*fdq)); if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) return -ESRCH; /* Crazy XFS error code */ if ((qid.type != USRQUOTA) && (qid.type != GRPQUOTA)) return -EINVAL; error = qd_get(sdp, qid, &qd); if (error) return error; error = do_glock(qd, FORCE, &q_gh); if (error) goto out; qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr; fdq->d_spc_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_sb.sb_bsize_shift; fdq->d_spc_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_sb.sb_bsize_shift; fdq->d_space = be64_to_cpu(qlvb->qb_value) << sdp->sd_sb.sb_bsize_shift; gfs2_glock_dq_uninit(&q_gh); out: qd_put(qd); return error; } /* GFS2 only supports a subset of the XFS fields */ #define GFS2_FIELDMASK (QC_SPC_SOFT|QC_SPC_HARD|QC_SPACE) static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid, struct qc_dqblk *fdq) { struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); struct gfs2_quota_data *qd; struct gfs2_holder q_gh, i_gh; unsigned int data_blocks, ind_blocks; unsigned int blocks = 0; int alloc_required; loff_t offset; int error; if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) return -ESRCH; /* Crazy XFS error code */ if ((qid.type != USRQUOTA) && (qid.type != GRPQUOTA)) return -EINVAL; if (fdq->d_fieldmask & ~GFS2_FIELDMASK) return -EINVAL; error = qd_get(sdp, qid, &qd); if (error) return error; error = gfs2_qa_get(ip); if (error) goto out_put; inode_lock(&ip->i_inode); error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE, 0, &q_gh); if (error) goto out_unlockput; error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); if (error) goto out_q; /* Check for existing entry, if none then alloc new blocks */ error = update_qd(sdp, qd); if (error) goto out_i; /* If nothing has changed, this is a no-op */ if ((fdq->d_fieldmask & QC_SPC_SOFT) && ((fdq->d_spc_softlimit >> sdp->sd_sb.sb_bsize_shift) == be64_to_cpu(qd->qd_qb.qb_warn))) fdq->d_fieldmask ^= QC_SPC_SOFT; if ((fdq->d_fieldmask & QC_SPC_HARD) && ((fdq->d_spc_hardlimit >> sdp->sd_sb.sb_bsize_shift) == be64_to_cpu(qd->qd_qb.qb_limit))) fdq->d_fieldmask ^= QC_SPC_HARD; if ((fdq->d_fieldmask & QC_SPACE) && ((fdq->d_space >> sdp->sd_sb.sb_bsize_shift) == be64_to_cpu(qd->qd_qb.qb_value))) fdq->d_fieldmask ^= QC_SPACE; if (fdq->d_fieldmask == 0) goto out_i; offset = qd2offset(qd); alloc_required = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota)); if (gfs2_is_stuffed(ip)) alloc_required = 1; if (alloc_required) { struct gfs2_alloc_parms ap = {}; gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), &data_blocks, &ind_blocks); blocks = 1 + data_blocks + ind_blocks; ap.target = blocks; error = gfs2_inplace_reserve(ip, &ap); if (error) goto out_i; blocks += gfs2_rg_blocks(ip, blocks); } /* Some quotas span block boundaries and can update two blocks, adding an extra block to the transaction to handle such quotas */ error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 2, 0); if (error) goto out_release; /* Apply changes */ error = gfs2_adjust_quota(sdp, offset, 0, qd, fdq); if (!error) clear_bit(QDF_QMSG_QUIET, &qd->qd_flags); gfs2_trans_end(sdp); out_release: if (alloc_required) gfs2_inplace_release(ip); out_i: gfs2_glock_dq_uninit(&i_gh); out_q: gfs2_glock_dq_uninit(&q_gh); out_unlockput: gfs2_qa_put(ip); inode_unlock(&ip->i_inode); out_put: qd_put(qd); return error; } const struct quotactl_ops gfs2_quotactl_ops = { .quota_sync = gfs2_quota_sync, .get_state = gfs2_quota_get_state, .get_dqblk = gfs2_get_dqblk, .set_dqblk = gfs2_set_dqblk, }; void __init gfs2_quota_hash_init(void) { unsigned i; for(i = 0; i < GFS2_QD_HASH_SIZE; i++) INIT_HLIST_BL_HEAD(&qd_hash_table[i]); } |
| 67 66 64 11 2 11 9 55 46 64 64 12 5 61 2 59 55 59 54 31 59 67 24 289 289 291 291 289 2 2 2 291 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 | // SPDX-License-Identifier: GPL-2.0-or-later /* * common UDP/RAW code * Linux INET implementation * * Authors: * Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org> */ #include <linux/types.h> #include <linux/module.h> #include <linux/in.h> #include <net/ip.h> #include <net/sock.h> #include <net/route.h> #include <net/tcp_states.h> #include <net/sock_reuseport.h> int __ip4_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct sockaddr_in *usin = (struct sockaddr_in *) uaddr; struct flowi4 *fl4; struct rtable *rt; __be32 saddr; int oif; int err; if (addr_len < sizeof(*usin)) return -EINVAL; if (usin->sin_family != AF_INET) return -EAFNOSUPPORT; sk_dst_reset(sk); oif = sk->sk_bound_dev_if; saddr = inet->inet_saddr; if (ipv4_is_multicast(usin->sin_addr.s_addr)) { if (!oif || netif_index_is_l3_master(sock_net(sk), oif)) oif = READ_ONCE(inet->mc_index); if (!saddr) saddr = READ_ONCE(inet->mc_addr); } else if (!oif) { oif = READ_ONCE(inet->uc_index); } fl4 = &inet->cork.fl.u.ip4; rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr, oif, sk->sk_protocol, inet->inet_sport, usin->sin_port, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); if (err == -ENETUNREACH) IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); goto out; } if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) { ip_rt_put(rt); err = -EACCES; goto out; } /* Update addresses before rehashing */ inet->inet_daddr = fl4->daddr; inet->inet_dport = usin->sin_port; if (!inet->inet_saddr) inet->inet_saddr = fl4->saddr; if (!inet->inet_rcv_saddr) { inet->inet_rcv_saddr = fl4->saddr; if (sk->sk_prot->rehash) sk->sk_prot->rehash(sk); } reuseport_has_conns_set(sk); sk->sk_state = TCP_ESTABLISHED; sk_set_txhash(sk); atomic_set(&inet->inet_id, get_random_u16()); sk_dst_set(sk, &rt->dst); err = 0; out: return err; } EXPORT_SYMBOL(__ip4_datagram_connect); int ip4_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { int res; lock_sock(sk); res = __ip4_datagram_connect(sk, uaddr, addr_len); release_sock(sk); return res; } EXPORT_SYMBOL(ip4_datagram_connect); /* Because UDP xmit path can manipulate sk_dst_cache without holding * socket lock, we need to use sk_dst_set() here, * even if we own the socket lock. */ void ip4_datagram_release_cb(struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); struct dst_entry *dst; struct flowi4 fl4; struct rtable *rt; rcu_read_lock(); dst = __sk_dst_get(sk); if (!dst || !READ_ONCE(dst->obsolete) || dst->ops->check(dst, 0)) { rcu_read_unlock(); return; } inet_sk_init_flowi4(inet, &fl4); rt = ip_route_output_flow(sock_net(sk), &fl4, sk); dst = !IS_ERR(rt) ? &rt->dst : NULL; sk_dst_set(sk, dst); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ip4_datagram_release_cb); |
| 3 3 3 3 3 3 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 1 2 3 3 3 3 3 3 3 3 3 3 2 2 1 1 3 3 3 1 1 2 3 3 3 1 1 1 1 1 1 1 3 1 4 4 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. * * This file is released under the GPL. */ #include "dm-core.h" #include "dm-rq.h" #include "dm-uevent.h" #include "dm-ima.h" #include <linux/bio-integrity.h> #include <linux/init.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/blkpg.h> #include <linux/bio.h> #include <linux/mempool.h> #include <linux/dax.h> #include <linux/slab.h> #include <linux/idr.h> #include <linux/uio.h> #include <linux/hdreg.h> #include <linux/delay.h> #include <linux/wait.h> #include <linux/pr.h> #include <linux/refcount.h> #include <linux/part_stat.h> #include <linux/blk-crypto.h> #include <linux/blk-crypto-profile.h> #define DM_MSG_PREFIX "core" /* * Cookies are numeric values sent with CHANGE and REMOVE * uevents while resuming, removing or renaming the device. */ #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" #define DM_COOKIE_LENGTH 24 /* * For REQ_POLLED fs bio, this flag is set if we link mapped underlying * dm_io into one list, and reuse bio->bi_private as the list head. Before * ending this fs bio, we will recover its ->bi_private. */ #define REQ_DM_POLL_LIST REQ_DRV static const char *_name = DM_NAME; static unsigned int major; static unsigned int _major; static DEFINE_IDR(_minor_idr); static DEFINE_SPINLOCK(_minor_lock); static void do_deferred_remove(struct work_struct *w); static DECLARE_WORK(deferred_remove_work, do_deferred_remove); static struct workqueue_struct *deferred_remove_workqueue; atomic_t dm_global_event_nr = ATOMIC_INIT(0); DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq); void dm_issue_global_event(void) { atomic_inc(&dm_global_event_nr); wake_up(&dm_global_eventq); } DEFINE_STATIC_KEY_FALSE(stats_enabled); DEFINE_STATIC_KEY_FALSE(swap_bios_enabled); DEFINE_STATIC_KEY_FALSE(zoned_enabled); /* * One of these is allocated (on-stack) per original bio. */ struct clone_info { struct dm_table *map; struct bio *bio; struct dm_io *io; sector_t sector; unsigned int sector_count; bool is_abnormal_io:1; bool submit_as_polled:1; }; static inline struct dm_target_io *clone_to_tio(struct bio *clone) { return container_of(clone, struct dm_target_io, clone); } void *dm_per_bio_data(struct bio *bio, size_t data_size) { if (!dm_tio_flagged(clone_to_tio(bio), DM_TIO_INSIDE_DM_IO)) return (char *)bio - DM_TARGET_IO_BIO_OFFSET - data_size; return (char *)bio - DM_IO_BIO_OFFSET - data_size; } EXPORT_SYMBOL_GPL(dm_per_bio_data); struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size) { struct dm_io *io = (struct dm_io *)((char *)data + data_size); if (io->magic == DM_IO_MAGIC) return (struct bio *)((char *)io + DM_IO_BIO_OFFSET); BUG_ON(io->magic != DM_TIO_MAGIC); return (struct bio *)((char *)io + DM_TARGET_IO_BIO_OFFSET); } EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data); unsigned int dm_bio_get_target_bio_nr(const struct bio *bio) { return container_of(bio, struct dm_target_io, clone)->target_bio_nr; } EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr); #define MINOR_ALLOCED ((void *)-1) #define DM_NUMA_NODE NUMA_NO_NODE static int dm_numa_node = DM_NUMA_NODE; #define DEFAULT_SWAP_BIOS (8 * 1048576 / PAGE_SIZE) static int swap_bios = DEFAULT_SWAP_BIOS; static int get_swap_bios(void) { int latch = READ_ONCE(swap_bios); if (unlikely(latch <= 0)) latch = DEFAULT_SWAP_BIOS; return latch; } struct table_device { struct list_head list; refcount_t count; struct dm_dev dm_dev; }; /* * Bio-based DM's mempools' reserved IOs set by the user. */ #define RESERVED_BIO_BASED_IOS 16 static unsigned int reserved_bio_based_ios = RESERVED_BIO_BASED_IOS; static int __dm_get_module_param_int(int *module_param, int min, int max) { int param = READ_ONCE(*module_param); int modified_param = 0; bool modified = true; if (param < min) modified_param = min; else if (param > max) modified_param = max; else modified = false; if (modified) { (void)cmpxchg(module_param, param, modified_param); param = modified_param; } return param; } unsigned int __dm_get_module_param(unsigned int *module_param, unsigned int def, unsigned int max) { unsigned int param = READ_ONCE(*module_param); unsigned int modified_param = 0; if (!param) modified_param = def; else if (param > max) modified_param = max; if (modified_param) { (void)cmpxchg(module_param, param, modified_param); param = modified_param; } return param; } unsigned int dm_get_reserved_bio_based_ios(void) { return __dm_get_module_param(&reserved_bio_based_ios, RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS); } EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios); static unsigned int dm_get_numa_node(void) { return __dm_get_module_param_int(&dm_numa_node, DM_NUMA_NODE, num_online_nodes() - 1); } static int __init local_init(void) { int r; r = dm_uevent_init(); if (r) return r; deferred_remove_workqueue = alloc_ordered_workqueue("kdmremove", 0); if (!deferred_remove_workqueue) { r = -ENOMEM; goto out_uevent_exit; } _major = major; r = register_blkdev(_major, _name); if (r < 0) goto out_free_workqueue; if (!_major) _major = r; return 0; out_free_workqueue: destroy_workqueue(deferred_remove_workqueue); out_uevent_exit: dm_uevent_exit(); return r; } static void local_exit(void) { destroy_workqueue(deferred_remove_workqueue); unregister_blkdev(_major, _name); dm_uevent_exit(); _major = 0; DMINFO("cleaned up"); } static int (*_inits[])(void) __initdata = { local_init, dm_target_init, dm_linear_init, dm_stripe_init, dm_io_init, dm_kcopyd_init, dm_interface_init, dm_statistics_init, }; static void (*_exits[])(void) = { local_exit, dm_target_exit, dm_linear_exit, dm_stripe_exit, dm_io_exit, dm_kcopyd_exit, dm_interface_exit, dm_statistics_exit, }; static int __init dm_init(void) { const int count = ARRAY_SIZE(_inits); int r, i; #if (IS_ENABLED(CONFIG_IMA) && !IS_ENABLED(CONFIG_IMA_DISABLE_HTABLE)) DMINFO("CONFIG_IMA_DISABLE_HTABLE is disabled." " Duplicate IMA measurements will not be recorded in the IMA log."); #endif for (i = 0; i < count; i++) { r = _inits[i](); if (r) goto bad; } return 0; bad: while (i--) _exits[i](); return r; } static void __exit dm_exit(void) { int i = ARRAY_SIZE(_exits); while (i--) _exits[i](); /* * Should be empty by this point. */ idr_destroy(&_minor_idr); } /* * Block device functions */ int dm_deleting_md(struct mapped_device *md) { return test_bit(DMF_DELETING, &md->flags); } static int dm_blk_open(struct gendisk *disk, blk_mode_t mode) { struct mapped_device *md; spin_lock(&_minor_lock); md = disk->private_data; if (!md) goto out; if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) { md = NULL; goto out; } dm_get(md); atomic_inc(&md->open_count); out: spin_unlock(&_minor_lock); return md ? 0 : -ENXIO; } static void dm_blk_close(struct gendisk *disk) { struct mapped_device *md; spin_lock(&_minor_lock); md = disk->private_data; if (WARN_ON(!md)) goto out; if (atomic_dec_and_test(&md->open_count) && (test_bit(DMF_DEFERRED_REMOVE, &md->flags))) queue_work(deferred_remove_workqueue, &deferred_remove_work); dm_put(md); out: spin_unlock(&_minor_lock); } int dm_open_count(struct mapped_device *md) { return atomic_read(&md->open_count); } /* * Guarantees nothing is using the device before it's deleted. */ int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred) { int r = 0; spin_lock(&_minor_lock); if (dm_open_count(md)) { r = -EBUSY; if (mark_deferred) set_bit(DMF_DEFERRED_REMOVE, &md->flags); } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags)) r = -EEXIST; else set_bit(DMF_DELETING, &md->flags); spin_unlock(&_minor_lock); return r; } int dm_cancel_deferred_remove(struct mapped_device *md) { int r = 0; spin_lock(&_minor_lock); if (test_bit(DMF_DELETING, &md->flags)) r = -EBUSY; else clear_bit(DMF_DEFERRED_REMOVE, &md->flags); spin_unlock(&_minor_lock); return r; } static void do_deferred_remove(struct work_struct *w) { dm_deferred_remove(); } static int dm_blk_getgeo(struct gendisk *disk, struct hd_geometry *geo) { struct mapped_device *md = disk->private_data; return dm_get_geometry(md, geo); } static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx, struct block_device **bdev, unsigned int cmd, unsigned long arg, bool *forward) { struct dm_target *ti; struct dm_table *map; int r; retry: r = -ENOTTY; map = dm_get_live_table(md, srcu_idx); if (!map || !dm_table_get_size(map)) return r; /* We only support devices that have a single target */ if (map->num_targets != 1) return r; ti = dm_table_get_target(map, 0); if (!ti->type->prepare_ioctl) return r; if (dm_suspended_md(md)) return -EAGAIN; r = ti->type->prepare_ioctl(ti, bdev, cmd, arg, forward); if (r == -ENOTCONN && *forward && !fatal_signal_pending(current)) { dm_put_live_table(md, *srcu_idx); fsleep(10000); goto retry; } return r; } static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx) { dm_put_live_table(md, srcu_idx); } static int dm_blk_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { struct mapped_device *md = bdev->bd_disk->private_data; int r, srcu_idx; bool forward = true; r = dm_prepare_ioctl(md, &srcu_idx, &bdev, cmd, arg, &forward); if (!forward || r < 0) goto out; if (r > 0) { /* * Target determined this ioctl is being issued against a * subset of the parent bdev; require extra privileges. */ if (!capable(CAP_SYS_RAWIO)) { DMDEBUG_LIMIT( "%s: sending ioctl %x to DM device without required privilege.", current->comm, cmd); r = -ENOIOCTLCMD; goto out; } } if (!bdev->bd_disk->fops->ioctl) r = -ENOTTY; else r = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg); out: dm_unprepare_ioctl(md, srcu_idx); return r; } u64 dm_start_time_ns_from_clone(struct bio *bio) { return jiffies_to_nsecs(clone_to_tio(bio)->io->start_time); } EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone); static inline unsigned int dm_io_sectors(struct dm_io *io, struct bio *bio) { /* * If REQ_PREFLUSH set, don't account payload, it will be * submitted (and accounted) after this flush completes. */ if (io->requeue_flush_with_data) return 0; if (unlikely(dm_io_flagged(io, DM_IO_WAS_SPLIT))) return io->sectors; return bio_sectors(bio); } static void dm_io_acct(struct dm_io *io, bool end) { struct bio *bio = io->orig_bio; if (dm_io_flagged(io, DM_IO_BLK_STAT)) { if (!end) bdev_start_io_acct(bio->bi_bdev, bio_op(bio), io->start_time); else bdev_end_io_acct(bio->bi_bdev, bio_op(bio), dm_io_sectors(io, bio), io->start_time); } if (static_branch_unlikely(&stats_enabled) && unlikely(dm_stats_used(&io->md->stats))) { sector_t sector; if (unlikely(dm_io_flagged(io, DM_IO_WAS_SPLIT))) sector = bio_end_sector(bio) - io->sector_offset; else sector = bio->bi_iter.bi_sector; dm_stats_account_io(&io->md->stats, bio_data_dir(bio), sector, dm_io_sectors(io, bio), end, io->start_time, &io->stats_aux); } } static void __dm_start_io_acct(struct dm_io *io) { dm_io_acct(io, false); } static void dm_start_io_acct(struct dm_io *io, struct bio *clone) { /* * Ensure IO accounting is only ever started once. */ if (dm_io_flagged(io, DM_IO_ACCOUNTED)) return; /* Expect no possibility for race unless DM_TIO_IS_DUPLICATE_BIO. */ if (!clone || likely(dm_tio_is_normal(clone_to_tio(clone)))) { dm_io_set_flag(io, DM_IO_ACCOUNTED); } else { unsigned long flags; /* Can afford locking given DM_TIO_IS_DUPLICATE_BIO */ spin_lock_irqsave(&io->lock, flags); if (dm_io_flagged(io, DM_IO_ACCOUNTED)) { spin_unlock_irqrestore(&io->lock, flags); return; } dm_io_set_flag(io, DM_IO_ACCOUNTED); spin_unlock_irqrestore(&io->lock, flags); } __dm_start_io_acct(io); } static void dm_end_io_acct(struct dm_io *io) { dm_io_acct(io, true); } static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio, gfp_t gfp_mask) { struct dm_io *io; struct dm_target_io *tio; struct bio *clone; clone = bio_alloc_clone(NULL, bio, gfp_mask, &md->mempools->io_bs); if (unlikely(!clone)) return NULL; tio = clone_to_tio(clone); tio->flags = 0; dm_tio_set_flag(tio, DM_TIO_INSIDE_DM_IO); tio->io = NULL; io = container_of(tio, struct dm_io, tio); io->magic = DM_IO_MAGIC; io->status = BLK_STS_OK; io->requeue_flush_with_data = false; /* one ref is for submission, the other is for completion */ atomic_set(&io->io_count, 2); this_cpu_inc(*md->pending_io); io->orig_bio = bio; io->md = md; spin_lock_init(&io->lock); io->start_time = jiffies; io->flags = 0; if (blk_queue_io_stat(md->queue)) dm_io_set_flag(io, DM_IO_BLK_STAT); if (static_branch_unlikely(&stats_enabled) && unlikely(dm_stats_used(&md->stats))) dm_stats_record_start(&md->stats, &io->stats_aux); return io; } static void free_io(struct dm_io *io) { bio_put(&io->tio.clone); } static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti, unsigned int target_bio_nr, unsigned int *len, gfp_t gfp_mask) { struct mapped_device *md = ci->io->md; struct dm_target_io *tio; struct bio *clone; if (!ci->io->tio.io) { /* the dm_target_io embedded in ci->io is available */ tio = &ci->io->tio; /* alloc_io() already initialized embedded clone */ clone = &tio->clone; } else { clone = bio_alloc_clone(NULL, ci->bio, gfp_mask, &md->mempools->bs); if (!clone) return NULL; /* REQ_DM_POLL_LIST shouldn't be inherited */ clone->bi_opf &= ~REQ_DM_POLL_LIST; tio = clone_to_tio(clone); tio->flags = 0; /* also clears DM_TIO_INSIDE_DM_IO */ } tio->magic = DM_TIO_MAGIC; tio->io = ci->io; tio->ti = ti; tio->target_bio_nr = target_bio_nr; tio->len_ptr = len; tio->old_sector = 0; /* Set default bdev, but target must bio_set_dev() before issuing IO */ clone->bi_bdev = md->disk->part0; if (likely(ti != NULL) && unlikely(ti->needs_bio_set_dev)) bio_set_dev(clone, md->disk->part0); if (len) { clone->bi_iter.bi_size = to_bytes(*len); if (bio_integrity(clone)) bio_integrity_trim(clone); } return clone; } static void free_tio(struct bio *clone) { if (dm_tio_flagged(clone_to_tio(clone), DM_TIO_INSIDE_DM_IO)) return; bio_put(clone); } /* * Add the bio to the list of deferred io. */ static void queue_io(struct mapped_device *md, struct bio *bio) { unsigned long flags; spin_lock_irqsave(&md->deferred_lock, flags); bio_list_add(&md->deferred, bio); spin_unlock_irqrestore(&md->deferred_lock, flags); queue_work(md->wq, &md->work); } /* * Everyone (including functions in this file), should use this * function to access the md->map field, and make sure they call * dm_put_live_table() when finished. */ struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier) { *srcu_idx = srcu_read_lock(&md->io_barrier); return srcu_dereference(md->map, &md->io_barrier); } void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier) { srcu_read_unlock(&md->io_barrier, srcu_idx); } void dm_sync_table(struct mapped_device *md) { synchronize_srcu(&md->io_barrier); synchronize_rcu_expedited(); } /* * A fast alternative to dm_get_live_table/dm_put_live_table. * The caller must not block between these two functions. */ static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU) { rcu_read_lock(); return rcu_dereference(md->map); } static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU) { rcu_read_unlock(); } static char *_dm_claim_ptr = "I belong to device-mapper"; /* * Open a table device so we can use it as a map destination. */ static struct table_device *open_table_device(struct mapped_device *md, dev_t dev, blk_mode_t mode) { struct table_device *td; struct file *bdev_file; struct block_device *bdev; u64 part_off; int r; td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id); if (!td) return ERR_PTR(-ENOMEM); refcount_set(&td->count, 1); bdev_file = bdev_file_open_by_dev(dev, mode, _dm_claim_ptr, NULL); if (IS_ERR(bdev_file)) { r = PTR_ERR(bdev_file); goto out_free_td; } bdev = file_bdev(bdev_file); /* * We can be called before the dm disk is added. In that case we can't * register the holder relation here. It will be done once add_disk was * called. */ if (md->disk->slave_dir) { r = bd_link_disk_holder(bdev, md->disk); if (r) goto out_blkdev_put; } td->dm_dev.mode = mode; td->dm_dev.bdev = bdev; td->dm_dev.bdev_file = bdev_file; td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off, NULL, NULL); format_dev_t(td->dm_dev.name, dev); list_add(&td->list, &md->table_devices); return td; out_blkdev_put: __fput_sync(bdev_file); out_free_td: kfree(td); return ERR_PTR(r); } /* * Close a table device that we've been using. */ static void close_table_device(struct table_device *td, struct mapped_device *md) { if (md->disk->slave_dir) bd_unlink_disk_holder(td->dm_dev.bdev, md->disk); /* Leverage async fput() if DMF_DEFERRED_REMOVE set */ if (unlikely(test_bit(DMF_DEFERRED_REMOVE, &md->flags))) fput(td->dm_dev.bdev_file); else __fput_sync(td->dm_dev.bdev_file); put_dax(td->dm_dev.dax_dev); list_del(&td->list); kfree(td); } static struct table_device *find_table_device(struct list_head *l, dev_t dev, blk_mode_t mode) { struct table_device *td; list_for_each_entry(td, l, list) if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode) return td; return NULL; } int dm_get_table_device(struct mapped_device *md, dev_t dev, blk_mode_t mode, struct dm_dev **result) { struct table_device *td; mutex_lock(&md->table_devices_lock); td = find_table_device(&md->table_devices, dev, mode); if (!td) { td = open_table_device(md, dev, mode); if (IS_ERR(td)) { mutex_unlock(&md->table_devices_lock); return PTR_ERR(td); } } else { refcount_inc(&td->count); } mutex_unlock(&md->table_devices_lock); *result = &td->dm_dev; return 0; } void dm_put_table_device(struct mapped_device *md, struct dm_dev *d) { struct table_device *td = container_of(d, struct table_device, dm_dev); mutex_lock(&md->table_devices_lock); if (refcount_dec_and_test(&td->count)) close_table_device(td, md); mutex_unlock(&md->table_devices_lock); } /* * Get the geometry associated with a dm device */ int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo) { *geo = md->geometry; return 0; } /* * Set the geometry of a device. */ int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo) { sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors; if (geo->start > sz) { DMERR("Start sector is beyond the geometry limits."); return -EINVAL; } md->geometry = *geo; return 0; } static int __noflush_suspending(struct mapped_device *md) { return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); } static void dm_requeue_add_io(struct dm_io *io, bool first_stage) { struct mapped_device *md = io->md; if (first_stage) { struct dm_io *next = md->requeue_list; md->requeue_list = io; io->next = next; } else { bio_list_add_head(&md->deferred, io->orig_bio); } } static void dm_kick_requeue(struct mapped_device *md, bool first_stage) { if (first_stage) queue_work(md->wq, &md->requeue_work); else queue_work(md->wq, &md->work); } /* * Return true if the dm_io's original bio is requeued. * io->status is updated with error if requeue disallowed. */ static bool dm_handle_requeue(struct dm_io *io, bool first_stage) { struct bio *bio = io->orig_bio; bool handle_requeue = (io->status == BLK_STS_DM_REQUEUE); bool handle_polled_eagain = ((io->status == BLK_STS_AGAIN) && (bio->bi_opf & REQ_POLLED)); struct mapped_device *md = io->md; bool requeued = false; if (handle_requeue || handle_polled_eagain) { unsigned long flags; if (bio->bi_opf & REQ_POLLED) { /* * Upper layer won't help us poll split bio * (io->orig_bio may only reflect a subset of the * pre-split original) so clear REQ_POLLED. */ bio_clear_polled(bio); } /* * Target requested pushing back the I/O or * polled IO hit BLK_STS_AGAIN. */ spin_lock_irqsave(&md->deferred_lock, flags); if ((__noflush_suspending(md) && !WARN_ON_ONCE(dm_is_zone_write(md, bio))) || handle_polled_eagain || first_stage) { dm_requeue_add_io(io, first_stage); requeued = true; } else { /* * noflush suspend was interrupted or this is * a write to a zoned target. */ io->status = BLK_STS_IOERR; } spin_unlock_irqrestore(&md->deferred_lock, flags); } if (requeued) dm_kick_requeue(md, first_stage); return requeued; } static void __dm_io_complete(struct dm_io *io, bool first_stage) { struct bio *bio = io->orig_bio; struct mapped_device *md = io->md; blk_status_t io_error; bool requeued; bool requeue_flush_with_data; requeued = dm_handle_requeue(io, first_stage); if (requeued && first_stage) return; io_error = io->status; if (dm_io_flagged(io, DM_IO_ACCOUNTED)) dm_end_io_acct(io); else if (!io_error) { /* * Must handle target that DM_MAPIO_SUBMITTED only to * then bio_endio() rather than dm_submit_bio_remap() */ __dm_start_io_acct(io); dm_end_io_acct(io); } requeue_flush_with_data = io->requeue_flush_with_data; free_io(io); smp_wmb(); this_cpu_dec(*md->pending_io); /* nudge anyone waiting on suspend queue */ if (unlikely(wq_has_sleeper(&md->wait))) wake_up(&md->wait); /* Return early if the original bio was requeued */ if (requeued) return; if (unlikely(requeue_flush_with_data)) { /* * Preflush done for flush with data, reissue * without REQ_PREFLUSH. */ bio->bi_opf &= ~REQ_PREFLUSH; queue_io(md, bio); } else { /* done with normal IO or empty flush */ if (io_error) bio->bi_status = io_error; bio_endio(bio); } } static void dm_wq_requeue_work(struct work_struct *work) { struct mapped_device *md = container_of(work, struct mapped_device, requeue_work); unsigned long flags; struct dm_io *io; /* reuse deferred lock to simplify dm_handle_requeue */ spin_lock_irqsave(&md->deferred_lock, flags); io = md->requeue_list; md->requeue_list = NULL; spin_unlock_irqrestore(&md->deferred_lock, flags); while (io) { struct dm_io *next = io->next; dm_io_rewind(io, &md->disk->bio_split); io->next = NULL; __dm_io_complete(io, false); io = next; cond_resched(); } } /* * Two staged requeue: * * 1) io->orig_bio points to the real original bio, and the part mapped to * this io must be requeued, instead of other parts of the original bio. * * 2) io->orig_bio points to new cloned bio which matches the requeued dm_io. */ static inline void dm_io_complete(struct dm_io *io) { /* * Only dm_io that has been split needs two stage requeue, otherwise * we may run into long bio clone chain during suspend and OOM could * be triggered. * * Also flush data dm_io won't be marked as DM_IO_WAS_SPLIT, so they * also aren't handled via the first stage requeue. */ __dm_io_complete(io, dm_io_flagged(io, DM_IO_WAS_SPLIT)); } /* * Decrements the number of outstanding ios that a bio has been * cloned into, completing the original io if necc. */ static inline void __dm_io_dec_pending(struct dm_io *io) { if (atomic_dec_and_test(&io->io_count)) dm_io_complete(io); } static void dm_io_set_error(struct dm_io *io, blk_status_t error) { unsigned long flags; /* Push-back supersedes any I/O errors */ spin_lock_irqsave(&io->lock, flags); if (!(io->status == BLK_STS_DM_REQUEUE && __noflush_suspending(io->md))) { io->status = error; } spin_unlock_irqrestore(&io->lock, flags); } static void dm_io_dec_pending(struct dm_io *io, blk_status_t error) { if (unlikely(error)) dm_io_set_error(io, error); __dm_io_dec_pending(io); } /* * The queue_limits are only valid as long as you have a reference * count on 'md'. But _not_ imposing verification to avoid atomic_read(), */ static inline struct queue_limits *dm_get_queue_limits(struct mapped_device *md) { return &md->queue->limits; } static bool swap_bios_limit(struct dm_target *ti, struct bio *bio) { return unlikely((bio->bi_opf & REQ_SWAP) != 0) && unlikely(ti->limit_swap_bios); } static void clone_endio(struct bio *bio) { blk_status_t error = bio->bi_status; struct dm_target_io *tio = clone_to_tio(bio); struct dm_target *ti = tio->ti; dm_endio_fn endio = likely(ti != NULL) ? ti->type->end_io : NULL; struct dm_io *io = tio->io; struct mapped_device *md = io->md; if (unlikely(error == BLK_STS_TARGET)) { if (bio_op(bio) == REQ_OP_DISCARD && !bdev_max_discard_sectors(bio->bi_bdev)) blk_queue_disable_discard(md->queue); else if (bio_op(bio) == REQ_OP_WRITE_ZEROES && !bdev_write_zeroes_sectors(bio->bi_bdev)) blk_queue_disable_write_zeroes(md->queue); } if (static_branch_unlikely(&zoned_enabled) && unlikely(bdev_is_zoned(bio->bi_bdev))) dm_zone_endio(io, bio); if (endio) { int r = endio(ti, bio, &error); switch (r) { case DM_ENDIO_REQUEUE: if (static_branch_unlikely(&zoned_enabled)) { /* * Requeuing writes to a sequential zone of a zoned * target will break the sequential write pattern: * fail such IO. */ if (WARN_ON_ONCE(dm_is_zone_write(md, bio))) error = BLK_STS_IOERR; else error = BLK_STS_DM_REQUEUE; } else error = BLK_STS_DM_REQUEUE; fallthrough; case DM_ENDIO_DONE: break; case DM_ENDIO_INCOMPLETE: /* The target will handle the io */ return; default: DMCRIT("unimplemented target endio return value: %d", r); BUG(); } } if (static_branch_unlikely(&swap_bios_enabled) && likely(ti != NULL) && unlikely(swap_bios_limit(ti, bio))) up(&md->swap_bios_semaphore); free_tio(bio); dm_io_dec_pending(io, error); } /* * Return maximum size of I/O possible at the supplied sector up to the current * target boundary. */ static inline sector_t max_io_len_target_boundary(struct dm_target *ti, sector_t target_offset) { return ti->len - target_offset; } static sector_t __max_io_len(struct dm_target *ti, sector_t sector, unsigned int max_granularity, unsigned int max_sectors) { sector_t target_offset = dm_target_offset(ti, sector); sector_t len = max_io_len_target_boundary(ti, target_offset); /* * Does the target need to split IO even further? * - varied (per target) IO splitting is a tenet of DM; this * explains why stacked chunk_sectors based splitting via * bio_split_to_limits() isn't possible here. */ if (!max_granularity) return len; return min_t(sector_t, len, min(max_sectors ? : queue_max_sectors(ti->table->md->queue), blk_boundary_sectors_left(target_offset, max_granularity))); } static inline sector_t max_io_len(struct dm_target *ti, sector_t sector) { return __max_io_len(ti, sector, ti->max_io_len, 0); } int dm_set_target_max_io_len(struct dm_target *ti, sector_t len) { if (len > UINT_MAX) { DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)", (unsigned long long)len, UINT_MAX); ti->error = "Maximum size of target IO is too large"; return -EINVAL; } ti->max_io_len = (uint32_t) len; return 0; } EXPORT_SYMBOL_GPL(dm_set_target_max_io_len); static struct dm_target *dm_dax_get_live_target(struct mapped_device *md, sector_t sector, int *srcu_idx) __acquires(md->io_barrier) { struct dm_table *map; struct dm_target *ti; map = dm_get_live_table(md, srcu_idx); if (!map) return NULL; ti = dm_table_find_target(map, sector); if (!ti) return NULL; return ti; } static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, enum dax_access_mode mode, void **kaddr, unsigned long *pfn) { struct mapped_device *md = dax_get_private(dax_dev); sector_t sector = pgoff * PAGE_SECTORS; struct dm_target *ti; long len, ret = -EIO; int srcu_idx; ti = dm_dax_get_live_target(md, sector, &srcu_idx); if (!ti) goto out; if (!ti->type->direct_access) goto out; len = max_io_len(ti, sector) / PAGE_SECTORS; if (len < 1) goto out; nr_pages = min(len, nr_pages); ret = ti->type->direct_access(ti, pgoff, nr_pages, mode, kaddr, pfn); out: dm_put_live_table(md, srcu_idx); return ret; } static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, size_t nr_pages) { struct mapped_device *md = dax_get_private(dax_dev); sector_t sector = pgoff * PAGE_SECTORS; struct dm_target *ti; int ret = -EIO; int srcu_idx; ti = dm_dax_get_live_target(md, sector, &srcu_idx); if (!ti) goto out; if (WARN_ON(!ti->type->dax_zero_page_range)) { /* * ->zero_page_range() is mandatory dax operation. If we are * here, something is wrong. */ goto out; } ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages); out: dm_put_live_table(md, srcu_idx); return ret; } static size_t dm_dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) { struct mapped_device *md = dax_get_private(dax_dev); sector_t sector = pgoff * PAGE_SECTORS; struct dm_target *ti; int srcu_idx; long ret = 0; ti = dm_dax_get_live_target(md, sector, &srcu_idx); if (!ti || !ti->type->dax_recovery_write) goto out; ret = ti->type->dax_recovery_write(ti, pgoff, addr, bytes, i); out: dm_put_live_table(md, srcu_idx); return ret; } /* * A target may call dm_accept_partial_bio only from the map routine. It is * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management * operations, zone append writes (native with REQ_OP_ZONE_APPEND or emulated * with write BIOs flagged with BIO_EMULATES_ZONE_APPEND) and any bio serviced * by __send_duplicate_bios(). * * dm_accept_partial_bio informs the dm that the target only wants to process * additional n_sectors sectors of the bio and the rest of the data should be * sent in a next bio. * * A diagram that explains the arithmetics: * +--------------------+---------------+-------+ * | 1 | 2 | 3 | * +--------------------+---------------+-------+ * * <-------------- *tio->len_ptr ---------------> * <----- bio_sectors -----> * <-- n_sectors --> * * Region 1 was already iterated over with bio_advance or similar function. * (it may be empty if the target doesn't use bio_advance) * Region 2 is the remaining bio size that the target wants to process. * (it may be empty if region 1 is non-empty, although there is no reason * to make it empty) * The target requires that region 3 is to be sent in the next bio. * * If the target wants to receive multiple copies of the bio (via num_*bios, etc), * the partially processed part (the sum of regions 1+2) must be the same for all * copies of the bio. */ void dm_accept_partial_bio(struct bio *bio, unsigned int n_sectors) { struct dm_target_io *tio = clone_to_tio(bio); struct dm_io *io = tio->io; unsigned int bio_sectors = bio_sectors(bio); BUG_ON(dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO)); BUG_ON(bio_sectors > *tio->len_ptr); BUG_ON(n_sectors > bio_sectors); BUG_ON(bio->bi_opf & REQ_ATOMIC); if (static_branch_unlikely(&zoned_enabled) && unlikely(bdev_is_zoned(bio->bi_bdev))) { enum req_op op = bio_op(bio); BUG_ON(op_is_zone_mgmt(op)); BUG_ON(op == REQ_OP_WRITE); BUG_ON(op == REQ_OP_WRITE_ZEROES); BUG_ON(op == REQ_OP_ZONE_APPEND); } *tio->len_ptr -= bio_sectors - n_sectors; bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT; /* * __split_and_process_bio() may have already saved mapped part * for accounting but it is being reduced so update accordingly. */ dm_io_set_flag(io, DM_IO_WAS_SPLIT); io->sectors = n_sectors; io->sector_offset = bio_sectors(io->orig_bio); } EXPORT_SYMBOL_GPL(dm_accept_partial_bio); /* * @clone: clone bio that DM core passed to target's .map function * @tgt_clone: clone of @clone bio that target needs submitted * * Targets should use this interface to submit bios they take * ownership of when returning DM_MAPIO_SUBMITTED. * * Target should also enable ti->accounts_remapped_io */ void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone) { struct dm_target_io *tio = clone_to_tio(clone); struct dm_io *io = tio->io; /* establish bio that will get submitted */ if (!tgt_clone) tgt_clone = clone; bio_clone_blkg_association(tgt_clone, io->orig_bio); /* * Account io->origin_bio to DM dev on behalf of target * that took ownership of IO with DM_MAPIO_SUBMITTED. */ dm_start_io_acct(io, clone); trace_block_bio_remap(tgt_clone, disk_devt(io->md->disk), tio->old_sector); submit_bio_noacct(tgt_clone); } EXPORT_SYMBOL_GPL(dm_submit_bio_remap); static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch) { mutex_lock(&md->swap_bios_lock); while (latch < md->swap_bios) { cond_resched(); down(&md->swap_bios_semaphore); md->swap_bios--; } while (latch > md->swap_bios) { cond_resched(); up(&md->swap_bios_semaphore); md->swap_bios++; } mutex_unlock(&md->swap_bios_lock); } static void __map_bio(struct bio *clone) { struct dm_target_io *tio = clone_to_tio(clone); struct dm_target *ti = tio->ti; struct dm_io *io = tio->io; struct mapped_device *md = io->md; int r; clone->bi_end_io = clone_endio; /* * Map the clone. */ tio->old_sector = clone->bi_iter.bi_sector; if (static_branch_unlikely(&swap_bios_enabled) && unlikely(swap_bios_limit(ti, clone))) { int latch = get_swap_bios(); if (unlikely(latch != md->swap_bios)) __set_swap_bios_limit(md, latch); down(&md->swap_bios_semaphore); } if (likely(ti->type->map == linear_map)) r = linear_map(ti, clone); else if (ti->type->map == stripe_map) r = stripe_map(ti, clone); else r = ti->type->map(ti, clone); switch (r) { case DM_MAPIO_SUBMITTED: /* target has assumed ownership of this io */ if (!ti->accounts_remapped_io) dm_start_io_acct(io, clone); break; case DM_MAPIO_REMAPPED: dm_submit_bio_remap(clone, NULL); break; case DM_MAPIO_KILL: case DM_MAPIO_REQUEUE: if (static_branch_unlikely(&swap_bios_enabled) && unlikely(swap_bios_limit(ti, clone))) up(&md->swap_bios_semaphore); free_tio(clone); if (r == DM_MAPIO_KILL) dm_io_dec_pending(io, BLK_STS_IOERR); else dm_io_dec_pending(io, BLK_STS_DM_REQUEUE); break; default: DMCRIT("unimplemented target map return value: %d", r); BUG(); } } static void setup_split_accounting(struct clone_info *ci, unsigned int len) { struct dm_io *io = ci->io; if (ci->sector_count > len) { /* * Split needed, save the mapped part for accounting. * NOTE: dm_accept_partial_bio() will update accordingly. */ dm_io_set_flag(io, DM_IO_WAS_SPLIT); io->sectors = len; io->sector_offset = bio_sectors(ci->bio); } } static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, struct dm_target *ti, unsigned int num_bios, unsigned *len) { struct bio *bio; int try; for (try = 0; try < 2; try++) { int bio_nr; if (try && num_bios > 1) mutex_lock(&ci->io->md->table_devices_lock); for (bio_nr = 0; bio_nr < num_bios; bio_nr++) { bio = alloc_tio(ci, ti, bio_nr, len, try ? GFP_NOIO : GFP_NOWAIT); if (!bio) break; bio_list_add(blist, bio); } if (try && num_bios > 1) mutex_unlock(&ci->io->md->table_devices_lock); if (bio_nr == num_bios) return; while ((bio = bio_list_pop(blist))) free_tio(bio); } } static unsigned int __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, unsigned int num_bios, unsigned int *len) { struct bio_list blist = BIO_EMPTY_LIST; struct bio *clone; unsigned int ret = 0; if (WARN_ON_ONCE(num_bios == 0)) /* num_bios = 0 is a bug in caller */ return 0; /* dm_accept_partial_bio() is not supported with shared tio->len_ptr */ if (len) setup_split_accounting(ci, *len); /* * Using alloc_multiple_bios(), even if num_bios is 1, to consistently * support allocating using GFP_NOWAIT with GFP_NOIO fallback. */ alloc_multiple_bios(&blist, ci, ti, num_bios, len); while ((clone = bio_list_pop(&blist))) { if (num_bios > 1) dm_tio_set_flag(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO); __map_bio(clone); ret += 1; } return ret; } static void __send_empty_flush(struct clone_info *ci) { struct dm_table *t = ci->map; struct bio flush_bio; blk_opf_t opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; if ((ci->io->orig_bio->bi_opf & (REQ_IDLE | REQ_SYNC)) == (REQ_IDLE | REQ_SYNC)) opf |= REQ_IDLE; /* * Use an on-stack bio for this, it's safe since we don't * need to reference it after submit. It's just used as * the basis for the clone(s). */ bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0, opf); ci->bio = &flush_bio; ci->sector_count = 0; ci->io->tio.clone.bi_iter.bi_size = 0; if (!t->flush_bypasses_map) { for (unsigned int i = 0; i < t->num_targets; i++) { unsigned int bios; struct dm_target *ti = dm_table_get_target(t, i); if (unlikely(ti->num_flush_bios == 0)) continue; atomic_add(ti->num_flush_bios, &ci->io->io_count); bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count); } } else { /* * Note that there's no need to grab t->devices_lock here * because the targets that support flush optimization don't * modify the list of devices. */ struct list_head *devices = dm_table_get_devices(t); unsigned int len = 0; struct dm_dev_internal *dd; list_for_each_entry(dd, devices, list) { struct bio *clone; /* * Note that the structure dm_target_io is not * associated with any target (because the device may be * used by multiple targets), so we set tio->ti = NULL. * We must check for NULL in the I/O processing path, to * avoid NULL pointer dereference. */ clone = alloc_tio(ci, NULL, 0, &len, GFP_NOIO); atomic_add(1, &ci->io->io_count); bio_set_dev(clone, dd->dm_dev->bdev); clone->bi_end_io = clone_endio; dm_submit_bio_remap(clone, NULL); } } /* * alloc_io() takes one extra reference for submission, so the * reference won't reach 0 without the following subtraction */ atomic_sub(1, &ci->io->io_count); bio_uninit(ci->bio); } static void __send_abnormal_io(struct clone_info *ci, struct dm_target *ti, unsigned int num_bios, unsigned int max_granularity, unsigned int max_sectors) { unsigned int len, bios; len = min_t(sector_t, ci->sector_count, __max_io_len(ti, ci->sector, max_granularity, max_sectors)); atomic_add(num_bios, &ci->io->io_count); bios = __send_duplicate_bios(ci, ti, num_bios, &len); /* * alloc_io() takes one extra reference for submission, so the * reference won't reach 0 without the following (+1) subtraction */ atomic_sub(num_bios - bios + 1, &ci->io->io_count); ci->sector += len; ci->sector_count -= len; } static bool is_abnormal_io(struct bio *bio) { switch (bio_op(bio)) { case REQ_OP_READ: case REQ_OP_WRITE: case REQ_OP_FLUSH: return false; case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: case REQ_OP_WRITE_ZEROES: case REQ_OP_ZONE_RESET_ALL: return true; default: return false; } } static blk_status_t __process_abnormal_io(struct clone_info *ci, struct dm_target *ti) { unsigned int num_bios = 0; unsigned int max_granularity = 0; unsigned int max_sectors = 0; struct queue_limits *limits = dm_get_queue_limits(ti->table->md); switch (bio_op(ci->bio)) { case REQ_OP_DISCARD: num_bios = ti->num_discard_bios; max_sectors = limits->max_discard_sectors; if (ti->max_discard_granularity) max_granularity = max_sectors; break; case REQ_OP_SECURE_ERASE: num_bios = ti->num_secure_erase_bios; max_sectors = limits->max_secure_erase_sectors; break; case REQ_OP_WRITE_ZEROES: num_bios = ti->num_write_zeroes_bios; max_sectors = limits->max_write_zeroes_sectors; break; default: break; } /* * Even though the device advertised support for this type of * request, that does not mean every target supports it, and * reconfiguration might also have changed that since the * check was performed. */ if (unlikely(!num_bios)) return BLK_STS_NOTSUPP; __send_abnormal_io(ci, ti, num_bios, max_granularity, max_sectors); return BLK_STS_OK; } /* * Reuse ->bi_private as dm_io list head for storing all dm_io instances * associated with this bio, and this bio's bi_private needs to be * stored in dm_io->data before the reuse. * * bio->bi_private is owned by fs or upper layer, so block layer won't * touch it after splitting. Meantime it won't be changed by anyone after * bio is submitted. So this reuse is safe. */ static inline struct dm_io **dm_poll_list_head(struct bio *bio) { return (struct dm_io **)&bio->bi_private; } static void dm_queue_poll_io(struct bio *bio, struct dm_io *io) { struct dm_io **head = dm_poll_list_head(bio); if (!(bio->bi_opf & REQ_DM_POLL_LIST)) { bio->bi_opf |= REQ_DM_POLL_LIST; /* * Save .bi_private into dm_io, so that we can reuse * .bi_private as dm_io list head for storing dm_io list */ io->data = bio->bi_private; /* tell block layer to poll for completion */ bio->bi_cookie = ~BLK_QC_T_NONE; io->next = NULL; } else { /* * bio recursed due to split, reuse original poll list, * and save bio->bi_private too. */ io->data = (*head)->data; io->next = *head; } *head = io; } /* * Select the correct strategy for processing a non-flush bio. */ static blk_status_t __split_and_process_bio(struct clone_info *ci) { struct bio *clone; struct dm_target *ti; unsigned int len; ti = dm_table_find_target(ci->map, ci->sector); if (unlikely(!ti)) return BLK_STS_IOERR; if (unlikely(ci->is_abnormal_io)) return __process_abnormal_io(ci, ti); /* * Only support bio polling for normal IO, and the target io is * exactly inside the dm_io instance (verified in dm_poll_dm_io) */ ci->submit_as_polled = !!(ci->bio->bi_opf & REQ_POLLED); len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count); if (ci->bio->bi_opf & REQ_ATOMIC) { if (unlikely(!dm_target_supports_atomic_writes(ti->type))) return BLK_STS_IOERR; if (unlikely(len != ci->sector_count)) return BLK_STS_IOERR; } setup_split_accounting(ci, len); if (unlikely(ci->bio->bi_opf & REQ_NOWAIT)) { if (unlikely(!dm_target_supports_nowait(ti->type))) return BLK_STS_NOTSUPP; clone = alloc_tio(ci, ti, 0, &len, GFP_NOWAIT); if (unlikely(!clone)) return BLK_STS_AGAIN; } else { clone = alloc_tio(ci, ti, 0, &len, GFP_NOIO); } __map_bio(clone); ci->sector += len; ci->sector_count -= len; return BLK_STS_OK; } static void init_clone_info(struct clone_info *ci, struct dm_io *io, struct dm_table *map, struct bio *bio, bool is_abnormal) { ci->map = map; ci->io = io; ci->bio = bio; ci->is_abnormal_io = is_abnormal; ci->submit_as_polled = false; ci->sector = bio->bi_iter.bi_sector; ci->sector_count = bio_sectors(bio); /* Shouldn't happen but sector_count was being set to 0 so... */ if (static_branch_unlikely(&zoned_enabled) && WARN_ON_ONCE(op_is_zone_mgmt(bio_op(bio)) && ci->sector_count)) ci->sector_count = 0; } #ifdef CONFIG_BLK_DEV_ZONED static inline bool dm_zone_bio_needs_split(struct bio *bio) { /* * Special case the zone operations that cannot or should not be split. */ switch (bio_op(bio)) { case REQ_OP_ZONE_APPEND: case REQ_OP_ZONE_FINISH: case REQ_OP_ZONE_RESET: case REQ_OP_ZONE_RESET_ALL: return false; default: break; } /* * When mapped devices use the block layer zone write plugging, we must * split any large BIO to the mapped device limits to not submit BIOs * that span zone boundaries and to avoid potential deadlocks with * queue freeze operations. */ return bio_needs_zone_write_plugging(bio) || bio_straddles_zones(bio); } static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio) { if (!bio_needs_zone_write_plugging(bio)) return false; return blk_zone_plug_bio(bio, 0); } static blk_status_t __send_zone_reset_all_emulated(struct clone_info *ci, struct dm_target *ti) { struct bio_list blist = BIO_EMPTY_LIST; struct mapped_device *md = ci->io->md; unsigned int zone_sectors = md->disk->queue->limits.chunk_sectors; unsigned long *need_reset; unsigned int i, nr_zones, nr_reset; unsigned int num_bios = 0; blk_status_t sts = BLK_STS_OK; sector_t sector = ti->begin; struct bio *clone; int ret; nr_zones = ti->len >> ilog2(zone_sectors); need_reset = bitmap_zalloc(nr_zones, GFP_NOIO); if (!need_reset) return BLK_STS_RESOURCE; ret = dm_zone_get_reset_bitmap(md, ci->map, ti->begin, nr_zones, need_reset); if (ret) { sts = BLK_STS_IOERR; goto free_bitmap; } /* If we have no zone to reset, we are done. */ nr_reset = bitmap_weight(need_reset, nr_zones); if (!nr_reset) goto free_bitmap; atomic_add(nr_zones, &ci->io->io_count); for (i = 0; i < nr_zones; i++) { if (!test_bit(i, need_reset)) { sector += zone_sectors; continue; } if (bio_list_empty(&blist)) { /* This may take a while, so be nice to others */ if (num_bios) cond_resched(); /* * We may need to reset thousands of zones, so let's * not go crazy with the clone allocation. */ alloc_multiple_bios(&blist, ci, ti, min(nr_reset, 32), NULL); } /* Get a clone and change it to a regular reset operation. */ clone = bio_list_pop(&blist); clone->bi_opf &= ~REQ_OP_MASK; clone->bi_opf |= REQ_OP_ZONE_RESET | REQ_SYNC; clone->bi_iter.bi_sector = sector; clone->bi_iter.bi_size = 0; __map_bio(clone); sector += zone_sectors; num_bios++; nr_reset--; } WARN_ON_ONCE(!bio_list_empty(&blist)); atomic_sub(nr_zones - num_bios, &ci->io->io_count); ci->sector_count = 0; free_bitmap: bitmap_free(need_reset); return sts; } static void __send_zone_reset_all_native(struct clone_info *ci, struct dm_target *ti) { unsigned int bios; atomic_add(1, &ci->io->io_count); bios = __send_duplicate_bios(ci, ti, 1, NULL); atomic_sub(1 - bios, &ci->io->io_count); ci->sector_count = 0; } static blk_status_t __send_zone_reset_all(struct clone_info *ci) { struct dm_table *t = ci->map; blk_status_t sts = BLK_STS_OK; for (unsigned int i = 0; i < t->num_targets; i++) { struct dm_target *ti = dm_table_get_target(t, i); if (ti->zone_reset_all_supported) { __send_zone_reset_all_native(ci, ti); continue; } sts = __send_zone_reset_all_emulated(ci, ti); if (sts != BLK_STS_OK) break; } /* Release the reference that alloc_io() took for submission. */ atomic_sub(1, &ci->io->io_count); return sts; } #else static inline bool dm_zone_bio_needs_split(struct bio *bio) { return false; } static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio) { return false; } static blk_status_t __send_zone_reset_all(struct clone_info *ci) { return BLK_STS_NOTSUPP; } #endif /* * Entry point to split a bio into clones and submit them to the targets. */ static void dm_split_and_process_bio(struct mapped_device *md, struct dm_table *map, struct bio *bio) { struct clone_info ci; struct dm_io *io; blk_status_t error = BLK_STS_OK; bool is_abnormal, need_split; is_abnormal = is_abnormal_io(bio); if (static_branch_unlikely(&zoned_enabled)) { need_split = is_abnormal || dm_zone_bio_needs_split(bio); } else { need_split = is_abnormal; } if (unlikely(need_split)) { /* * Use bio_split_to_limits() for abnormal IO (e.g. discard, etc) * otherwise associated queue_limits won't be imposed. * Also split the BIO for mapped devices needing zone append * emulation to ensure that the BIO does not cross zone * boundaries. */ bio = bio_split_to_limits(bio); if (!bio) return; } /* * Use the block layer zone write plugging for mapped devices that * need zone append emulation (e.g. dm-crypt). */ if (static_branch_unlikely(&zoned_enabled) && dm_zone_plug_bio(md, bio)) return; /* Only support nowait for normal IO */ if (unlikely(bio->bi_opf & REQ_NOWAIT) && !is_abnormal) { /* * Don't support NOWAIT for FLUSH because it may allocate * multiple bios and there's no easy way how to undo the * allocations. */ if (bio->bi_opf & REQ_PREFLUSH) { bio_wouldblock_error(bio); return; } io = alloc_io(md, bio, GFP_NOWAIT); if (unlikely(!io)) { /* Unable to do anything without dm_io. */ bio_wouldblock_error(bio); return; } } else { io = alloc_io(md, bio, GFP_NOIO); } init_clone_info(&ci, io, map, bio, is_abnormal); if (unlikely((bio->bi_opf & REQ_PREFLUSH) != 0)) { /* * The "flush_bypasses_map" is set on targets where it is safe * to skip the map function and submit bios directly to the * underlying block devices - currently, it is set for dm-linear * and dm-stripe. * * If we have just one underlying device (i.e. there is one * linear target or multiple linear targets pointing to the same * device), we can send the flush with data directly to it. */ if (bio->bi_iter.bi_size && map->flush_bypasses_map) { struct list_head *devices = dm_table_get_devices(map); if (devices->next == devices->prev) goto send_preflush_with_data; } if (bio->bi_iter.bi_size) io->requeue_flush_with_data = true; __send_empty_flush(&ci); /* dm_io_complete submits any data associated with flush */ goto out; } send_preflush_with_data: if (static_branch_unlikely(&zoned_enabled) && (bio_op(bio) == REQ_OP_ZONE_RESET_ALL)) { error = __send_zone_reset_all(&ci); goto out; } error = __split_and_process_bio(&ci); if (error || !ci.sector_count) goto out; /* * Remainder must be passed to submit_bio_noacct() so it gets handled * *after* bios already submitted have been completely processed. */ bio_trim(bio, io->sectors, ci.sector_count); trace_block_split(bio, bio->bi_iter.bi_sector); bio_inc_remaining(bio); submit_bio_noacct(bio); out: /* * Drop the extra reference count for non-POLLED bio, and hold one * reference for POLLED bio, which will be released in dm_poll_bio * * Add every dm_io instance into the dm_io list head which is stored * in bio->bi_private, so that dm_poll_bio can poll them all. */ if (error || !ci.submit_as_polled) { /* * In case of submission failure, the extra reference for * submitting io isn't consumed yet */ if (error) atomic_dec(&io->io_count); dm_io_dec_pending(io, error); } else dm_queue_poll_io(bio, io); } static void dm_submit_bio(struct bio *bio) { struct mapped_device *md = bio->bi_bdev->bd_disk->private_data; int srcu_idx; struct dm_table *map; map = dm_get_live_table(md, &srcu_idx); if (unlikely(!map)) { DMERR_LIMIT("%s: mapping table unavailable, erroring io", dm_device_name(md)); bio_io_error(bio); goto out; } /* If suspended, queue this IO for later */ if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); else if (bio->bi_opf & REQ_RAHEAD) bio_io_error(bio); else queue_io(md, bio); goto out; } dm_split_and_process_bio(md, map, bio); out: dm_put_live_table(md, srcu_idx); } static bool dm_poll_dm_io(struct dm_io *io, struct io_comp_batch *iob, unsigned int flags) { WARN_ON_ONCE(!dm_tio_is_normal(&io->tio)); /* don't poll if the mapped io is done */ if (atomic_read(&io->io_count) > 1) bio_poll(&io->tio.clone, iob, flags); /* bio_poll holds the last reference */ return atomic_read(&io->io_count) == 1; } static int dm_poll_bio(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) { struct dm_io **head = dm_poll_list_head(bio); struct dm_io *list = *head; struct dm_io *tmp = NULL; struct dm_io *curr, *next; /* Only poll normal bio which was marked as REQ_DM_POLL_LIST */ if (!(bio->bi_opf & REQ_DM_POLL_LIST)) return 0; WARN_ON_ONCE(!list); /* * Restore .bi_private before possibly completing dm_io. * * bio_poll() is only possible once @bio has been completely * submitted via submit_bio_noacct()'s depth-first submission. * So there is no dm_queue_poll_io() race associated with * clearing REQ_DM_POLL_LIST here. */ bio->bi_opf &= ~REQ_DM_POLL_LIST; bio->bi_private = list->data; for (curr = list, next = curr->next; curr; curr = next, next = curr ? curr->next : NULL) { if (dm_poll_dm_io(curr, iob, flags)) { /* * clone_endio() has already occurred, so no * error handling is needed here. */ __dm_io_dec_pending(curr); } else { curr->next = tmp; tmp = curr; } } /* Not done? */ if (tmp) { bio->bi_opf |= REQ_DM_POLL_LIST; /* Reset bio->bi_private to dm_io list head */ *head = tmp; return 0; } return 1; } /* *--------------------------------------------------------------- * An IDR is used to keep track of allocated minor numbers. *--------------------------------------------------------------- */ static void free_minor(int minor) { spin_lock(&_minor_lock); idr_remove(&_minor_idr, minor); spin_unlock(&_minor_lock); } /* * See if the device with a specific minor # is free. */ static int specific_minor(int minor) { int r; if (minor >= (1 << MINORBITS)) return -EINVAL; idr_preload(GFP_KERNEL); spin_lock(&_minor_lock); r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT); spin_unlock(&_minor_lock); idr_preload_end(); if (r < 0) return r == -ENOSPC ? -EBUSY : r; return 0; } static int next_free_minor(int *minor) { int r; idr_preload(GFP_KERNEL); spin_lock(&_minor_lock); r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT); spin_unlock(&_minor_lock); idr_preload_end(); if (r < 0) return r; *minor = r; return 0; } static const struct block_device_operations dm_blk_dops; static const struct block_device_operations dm_rq_blk_dops; static const struct dax_operations dm_dax_ops; static void dm_wq_work(struct work_struct *work); #ifdef CONFIG_BLK_INLINE_ENCRYPTION static void dm_queue_destroy_crypto_profile(struct request_queue *q) { dm_destroy_crypto_profile(q->crypto_profile); } #else /* CONFIG_BLK_INLINE_ENCRYPTION */ static inline void dm_queue_destroy_crypto_profile(struct request_queue *q) { } #endif /* !CONFIG_BLK_INLINE_ENCRYPTION */ static void cleanup_mapped_device(struct mapped_device *md) { if (md->wq) destroy_workqueue(md->wq); dm_free_md_mempools(md->mempools); if (md->dax_dev) { dax_remove_host(md->disk); kill_dax(md->dax_dev); put_dax(md->dax_dev); md->dax_dev = NULL; } if (md->disk) { spin_lock(&_minor_lock); md->disk->private_data = NULL; spin_unlock(&_minor_lock); if (dm_get_md_type(md) != DM_TYPE_NONE) { struct table_device *td; dm_sysfs_exit(md); list_for_each_entry(td, &md->table_devices, list) { bd_unlink_disk_holder(td->dm_dev.bdev, md->disk); } /* * Hold lock to make sure del_gendisk() won't concurrent * with open/close_table_device(). */ mutex_lock(&md->table_devices_lock); del_gendisk(md->disk); mutex_unlock(&md->table_devices_lock); } dm_queue_destroy_crypto_profile(md->queue); put_disk(md->disk); } if (md->pending_io) { free_percpu(md->pending_io); md->pending_io = NULL; } cleanup_srcu_struct(&md->io_barrier); mutex_destroy(&md->suspend_lock); mutex_destroy(&md->type_lock); mutex_destroy(&md->table_devices_lock); mutex_destroy(&md->swap_bios_lock); dm_mq_cleanup_mapped_device(md); } /* * Allocate and initialise a blank device with a given minor. */ static struct mapped_device *alloc_dev(int minor) { int r, numa_node_id = dm_get_numa_node(); struct dax_device *dax_dev; struct mapped_device *md; void *old_md; md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id); if (!md) { DMERR("unable to allocate device, out of memory."); return NULL; } if (!try_module_get(THIS_MODULE)) goto bad_module_get; /* get a minor number for the dev */ if (minor == DM_ANY_MINOR) r = next_free_minor(&minor); else r = specific_minor(minor); if (r < 0) goto bad_minor; r = init_srcu_struct(&md->io_barrier); if (r < 0) goto bad_io_barrier; md->numa_node_id = numa_node_id; md->init_tio_pdu = false; md->type = DM_TYPE_NONE; mutex_init(&md->suspend_lock); mutex_init(&md->type_lock); mutex_init(&md->table_devices_lock); spin_lock_init(&md->deferred_lock); atomic_set(&md->holders, 1); atomic_set(&md->open_count, 0); atomic_set(&md->event_nr, 0); atomic_set(&md->uevent_seq, 0); INIT_LIST_HEAD(&md->uevent_list); INIT_LIST_HEAD(&md->table_devices); spin_lock_init(&md->uevent_lock); /* * default to bio-based until DM table is loaded and md->type * established. If request-based table is loaded: blk-mq will * override accordingly. */ md->disk = blk_alloc_disk(NULL, md->numa_node_id); if (IS_ERR(md->disk)) { md->disk = NULL; goto bad; } md->queue = md->disk->queue; init_waitqueue_head(&md->wait); INIT_WORK(&md->work, dm_wq_work); INIT_WORK(&md->requeue_work, dm_wq_requeue_work); init_waitqueue_head(&md->eventq); init_completion(&md->kobj_holder.completion); md->requeue_list = NULL; md->swap_bios = get_swap_bios(); sema_init(&md->swap_bios_semaphore, md->swap_bios); mutex_init(&md->swap_bios_lock); md->disk->major = _major; md->disk->first_minor = minor; md->disk->minors = 1; md->disk->flags |= GENHD_FL_NO_PART; md->disk->fops = &dm_blk_dops; md->disk->private_data = md; sprintf(md->disk->disk_name, "dm-%d", minor); dax_dev = alloc_dax(md, &dm_dax_ops); if (IS_ERR(dax_dev)) { if (PTR_ERR(dax_dev) != -EOPNOTSUPP) goto bad; } else { set_dax_nocache(dax_dev); set_dax_nomc(dax_dev); md->dax_dev = dax_dev; if (dax_add_host(dax_dev, md->disk)) goto bad; } format_dev_t(md->name, MKDEV(_major, minor)); md->wq = alloc_workqueue("kdmflush/%s", WQ_MEM_RECLAIM | WQ_PERCPU, 0, md->name); if (!md->wq) goto bad; md->pending_io = alloc_percpu(unsigned long); if (!md->pending_io) goto bad; r = dm_stats_init(&md->stats); if (r < 0) goto bad; /* Populate the mapping, nobody knows we exist yet */ spin_lock(&_minor_lock); old_md = idr_replace(&_minor_idr, md, minor); spin_unlock(&_minor_lock); BUG_ON(old_md != MINOR_ALLOCED); return md; bad: cleanup_mapped_device(md); bad_io_barrier: free_minor(minor); bad_minor: module_put(THIS_MODULE); bad_module_get: kvfree(md); return NULL; } static void unlock_fs(struct mapped_device *md); static void free_dev(struct mapped_device *md) { int minor = MINOR(disk_devt(md->disk)); unlock_fs(md); cleanup_mapped_device(md); WARN_ON_ONCE(!list_empty(&md->table_devices)); dm_stats_cleanup(&md->stats); free_minor(minor); module_put(THIS_MODULE); kvfree(md); } /* * Bind a table to the device. */ static void event_callback(void *context) { unsigned long flags; LIST_HEAD(uevents); struct mapped_device *md = context; spin_lock_irqsave(&md->uevent_lock, flags); list_splice_init(&md->uevent_list, &uevents); spin_unlock_irqrestore(&md->uevent_lock, flags); dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj); atomic_inc(&md->event_nr); wake_up(&md->eventq); dm_issue_global_event(); } /* * Returns old map, which caller must destroy. */ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, struct queue_limits *limits) { struct dm_table *old_map; sector_t size, old_size; lockdep_assert_held(&md->suspend_lock); size = dm_table_get_size(t); old_size = dm_get_size(md); if (!dm_table_supports_size_change(t, old_size, size)) { old_map = ERR_PTR(-EINVAL); goto out; } set_capacity(md->disk, size); if (limits) { int ret = dm_table_set_restrictions(t, md->queue, limits); if (ret) { set_capacity(md->disk, old_size); old_map = ERR_PTR(ret); goto out; } } /* * Wipe any geometry if the size of the table changed. */ if (size != old_size) memset(&md->geometry, 0, sizeof(md->geometry)); dm_table_event_callback(t, event_callback, md); if (dm_table_request_based(t)) { /* * Leverage the fact that request-based DM targets are * immutable singletons - used to optimize dm_mq_queue_rq. */ md->immutable_target = dm_table_get_immutable_target(t); /* * There is no need to reload with request-based dm because the * size of front_pad doesn't change. * * Note for future: If you are to reload bioset, prep-ed * requests in the queue may refer to bio from the old bioset, * so you must walk through the queue to unprep. */ if (!md->mempools) md->mempools = t->mempools; else dm_free_md_mempools(t->mempools); } else { /* * The md may already have mempools that need changing. * If so, reload bioset because front_pad may have changed * because a different table was loaded. */ dm_free_md_mempools(md->mempools); md->mempools = t->mempools; } t->mempools = NULL; old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); rcu_assign_pointer(md->map, (void *)t); md->immutable_target_type = dm_table_get_immutable_target_type(t); if (old_map) dm_sync_table(md); out: return old_map; } /* * Returns unbound table for the caller to free. */ static struct dm_table *__unbind(struct mapped_device *md) { struct dm_table *map = rcu_dereference_protected(md->map, 1); if (!map) return NULL; dm_table_event_callback(map, NULL, NULL); RCU_INIT_POINTER(md->map, NULL); dm_sync_table(md); return map; } /* * Constructor for a new device. */ int dm_create(int minor, struct mapped_device **result) { struct mapped_device *md; md = alloc_dev(minor); if (!md) return -ENXIO; dm_ima_reset_data(md); *result = md; return 0; } /* * Functions to manage md->type. * All are required to hold md->type_lock. */ void dm_lock_md_type(struct mapped_device *md) { mutex_lock(&md->type_lock); } void dm_unlock_md_type(struct mapped_device *md) { mutex_unlock(&md->type_lock); } enum dm_queue_mode dm_get_md_type(struct mapped_device *md) { return md->type; } struct target_type *dm_get_immutable_target_type(struct mapped_device *md) { return md->immutable_target_type; } /* * Setup the DM device's queue based on md's type */ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) { enum dm_queue_mode type = dm_table_get_type(t); struct queue_limits limits; struct table_device *td; int r; WARN_ON_ONCE(type == DM_TYPE_NONE); if (type == DM_TYPE_REQUEST_BASED) { md->disk->fops = &dm_rq_blk_dops; r = dm_mq_init_request_queue(md, t); if (r) { DMERR("Cannot initialize queue for request-based dm mapped device"); return r; } } r = dm_calculate_queue_limits(t, &limits); if (r) { DMERR("Cannot calculate initial queue limits"); return r; } r = dm_table_set_restrictions(t, md->queue, &limits); if (r) return r; /* * Hold lock to make sure add_disk() and del_gendisk() won't concurrent * with open_table_device() and close_table_device(). */ mutex_lock(&md->table_devices_lock); r = add_disk(md->disk); mutex_unlock(&md->table_devices_lock); if (r) return r; /* * Register the holder relationship for devices added before the disk * was live. */ list_for_each_entry(td, &md->table_devices, list) { r = bd_link_disk_holder(td->dm_dev.bdev, md->disk); if (r) goto out_undo_holders; } r = dm_sysfs_init(md); if (r) goto out_undo_holders; md->type = type; return 0; out_undo_holders: list_for_each_entry_continue_reverse(td, &md->table_devices, list) bd_unlink_disk_holder(td->dm_dev.bdev, md->disk); mutex_lock(&md->table_devices_lock); del_gendisk(md->disk); mutex_unlock(&md->table_devices_lock); return r; } struct mapped_device *dm_get_md(dev_t dev) { struct mapped_device *md; unsigned int minor = MINOR(dev); if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) return NULL; spin_lock(&_minor_lock); md = idr_find(&_minor_idr, minor); if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) || test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) { md = NULL; goto out; } dm_get(md); out: spin_unlock(&_minor_lock); return md; } EXPORT_SYMBOL_GPL(dm_get_md); void *dm_get_mdptr(struct mapped_device *md) { return md->interface_ptr; } void dm_set_mdptr(struct mapped_device *md, void *ptr) { md->interface_ptr = ptr; } void dm_get(struct mapped_device *md) { atomic_inc(&md->holders); BUG_ON(test_bit(DMF_FREEING, &md->flags)); } int dm_hold(struct mapped_device *md) { spin_lock(&_minor_lock); if (test_bit(DMF_FREEING, &md->flags)) { spin_unlock(&_minor_lock); return -EBUSY; } dm_get(md); spin_unlock(&_minor_lock); return 0; } EXPORT_SYMBOL_GPL(dm_hold); const char *dm_device_name(struct mapped_device *md) { return md->name; } EXPORT_SYMBOL_GPL(dm_device_name); static void __dm_destroy(struct mapped_device *md, bool wait) { struct dm_table *map; int srcu_idx; might_sleep(); spin_lock(&_minor_lock); idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md)))); set_bit(DMF_FREEING, &md->flags); spin_unlock(&_minor_lock); blk_mark_disk_dead(md->disk); /* * Take suspend_lock so that presuspend and postsuspend methods * do not race with internal suspend. */ mutex_lock(&md->suspend_lock); map = dm_get_live_table(md, &srcu_idx); if (!dm_suspended_md(md)) { dm_table_presuspend_targets(map); set_bit(DMF_SUSPENDED, &md->flags); set_bit(DMF_POST_SUSPENDING, &md->flags); dm_table_postsuspend_targets(map); } /* dm_put_live_table must be before fsleep, otherwise deadlock is possible */ dm_put_live_table(md, srcu_idx); mutex_unlock(&md->suspend_lock); /* * Rare, but there may be I/O requests still going to complete, * for example. Wait for all references to disappear. * No one should increment the reference count of the mapped_device, * after the mapped_device state becomes DMF_FREEING. */ if (wait) while (atomic_read(&md->holders)) fsleep(1000); else if (atomic_read(&md->holders)) DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)", dm_device_name(md), atomic_read(&md->holders)); dm_table_destroy(__unbind(md)); free_dev(md); } void dm_destroy(struct mapped_device *md) { __dm_destroy(md, true); } void dm_destroy_immediate(struct mapped_device *md) { __dm_destroy(md, false); } void dm_put(struct mapped_device *md) { atomic_dec(&md->holders); } EXPORT_SYMBOL_GPL(dm_put); static bool dm_in_flight_bios(struct mapped_device *md) { int cpu; unsigned long sum = 0; for_each_possible_cpu(cpu) sum += *per_cpu_ptr(md->pending_io, cpu); return sum != 0; } static int dm_wait_for_bios_completion(struct mapped_device *md, unsigned int task_state) { int r = 0; DEFINE_WAIT(wait); while (true) { prepare_to_wait(&md->wait, &wait, task_state); if (!dm_in_flight_bios(md)) break; if (signal_pending_state(task_state, current)) { r = -ERESTARTSYS; break; } io_schedule(); } finish_wait(&md->wait, &wait); smp_rmb(); return r; } static int dm_wait_for_completion(struct mapped_device *md, unsigned int task_state) { int r = 0; if (!queue_is_mq(md->queue)) return dm_wait_for_bios_completion(md, task_state); while (true) { if (!blk_mq_queue_inflight(md->queue)) break; if (signal_pending_state(task_state, current)) { r = -ERESTARTSYS; break; } fsleep(5000); } return r; } /* * Process the deferred bios */ static void dm_wq_work(struct work_struct *work) { struct mapped_device *md = container_of(work, struct mapped_device, work); struct bio *bio; while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { spin_lock_irq(&md->deferred_lock); bio = bio_list_pop(&md->deferred); spin_unlock_irq(&md->deferred_lock); if (!bio) break; submit_bio_noacct(bio); cond_resched(); } } static void dm_queue_flush(struct mapped_device *md) { clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); smp_mb__after_atomic(); queue_work(md->wq, &md->work); } /* * Swap in a new table, returning the old one for the caller to destroy. */ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) { struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL); struct queue_limits limits; bool update_limits = true; int r; mutex_lock(&md->suspend_lock); /* device must be suspended */ if (!dm_suspended_md(md)) goto out; /* * To avoid a potential deadlock locking the queue limits, disallow * updating the queue limits during a table swap, when updating an * immutable request-based dm device (dm-multipath) during a noflush * suspend. It is userspace's responsibility to make sure that the new * table uses the same limits as the existing table, if it asks for a * noflush suspend. */ if (dm_request_based(md) && md->immutable_target && __noflush_suspending(md)) update_limits = false; /* * If the new table has no data devices, retain the existing limits. * This helps multipath with queue_if_no_path if all paths disappear, * then new I/O is queued based on these limits, and then some paths * reappear. */ else if (dm_table_has_no_data_devices(table)) { live_map = dm_get_live_table_fast(md); if (live_map) limits = md->queue->limits; dm_put_live_table_fast(md); } if (update_limits && !live_map) { r = dm_calculate_queue_limits(table, &limits); if (r) { map = ERR_PTR(r); goto out; } } map = __bind(md, table, update_limits ? &limits : NULL); dm_issue_global_event(); out: mutex_unlock(&md->suspend_lock); return map; } /* * Functions to lock and unlock any filesystem running on the * device. */ static int lock_fs(struct mapped_device *md) { int r; WARN_ON(test_bit(DMF_FROZEN, &md->flags)); r = bdev_freeze(md->disk->part0); if (!r) set_bit(DMF_FROZEN, &md->flags); return r; } static void unlock_fs(struct mapped_device *md) { if (!test_bit(DMF_FROZEN, &md->flags)) return; bdev_thaw(md->disk->part0); clear_bit(DMF_FROZEN, &md->flags); } /* * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG * @task_state: e.g. TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE * @dmf_suspended_flag: DMF_SUSPENDED or DMF_SUSPENDED_INTERNALLY * * If __dm_suspend returns 0, the device is completely quiescent * now. There is no request-processing activity. All new requests * are being added to md->deferred list. */ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, unsigned int suspend_flags, unsigned int task_state, int dmf_suspended_flag) { bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG; bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG; int r = 0; lockdep_assert_held(&md->suspend_lock); /* * DMF_NOFLUSH_SUSPENDING must be set before presuspend. */ if (noflush) set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); else DMDEBUG("%s: suspending with flush", dm_device_name(md)); /* * This gets reverted if there's an error later and the targets * provide the .presuspend_undo hook. */ dm_table_presuspend_targets(map); /* * Flush I/O to the device. * Any I/O submitted after lock_fs() may not be flushed. * noflush takes precedence over do_lockfs. * (lock_fs() flushes I/Os and waits for them to complete.) */ if (!noflush && do_lockfs) { r = lock_fs(md); if (r) { dm_table_presuspend_undo_targets(map); return r; } } /* * Here we must make sure that no processes are submitting requests * to target drivers i.e. no one may be executing * dm_split_and_process_bio from dm_submit_bio. * * To get all processes out of dm_split_and_process_bio in dm_submit_bio, * we take the write lock. To prevent any process from reentering * dm_split_and_process_bio from dm_submit_bio and quiesce the thread * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call * flush_workqueue(md->wq). */ set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); if (map) synchronize_srcu(&md->io_barrier); /* * Stop md->queue before flushing md->wq in case request-based * dm defers requests to md->wq from md->queue. */ if (map && dm_request_based(md)) { dm_stop_queue(md->queue); set_bit(DMF_QUEUE_STOPPED, &md->flags); } flush_workqueue(md->wq); /* * At this point no more requests are entering target request routines. * We call dm_wait_for_completion to wait for all existing requests * to finish. */ if (map) r = dm_wait_for_completion(md, task_state); if (!r) set_bit(dmf_suspended_flag, &md->flags); if (map) synchronize_srcu(&md->io_barrier); /* were we interrupted ? */ if (r < 0) { dm_queue_flush(md); if (test_and_clear_bit(DMF_QUEUE_STOPPED, &md->flags)) dm_start_queue(md->queue); unlock_fs(md); dm_table_presuspend_undo_targets(map); /* pushback list is already flushed, so skip flush */ } return r; } /* * We need to be able to change a mapping table under a mounted * filesystem. For example we might want to move some data in * the background. Before the table can be swapped with * dm_bind_table, dm_suspend must be called to flush any in * flight bios and ensure that any further io gets deferred. */ /* * Suspend mechanism in request-based dm. * * 1. Flush all I/Os by lock_fs() if needed. * 2. Stop dispatching any I/O by stopping the request_queue. * 3. Wait for all in-flight I/Os to be completed or requeued. * * To abort suspend, start the request_queue. */ int dm_suspend(struct mapped_device *md, unsigned int suspend_flags) { struct dm_table *map = NULL; int r = 0; retry: mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); if (dm_suspended_md(md)) { r = -EINVAL; goto out_unlock; } if (dm_suspended_internally_md(md)) { /* already internally suspended, wait for internal resume */ mutex_unlock(&md->suspend_lock); r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); if (r) return r; goto retry; } map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); if (!map) { /* avoid deadlock with fs/namespace.c:do_mount() */ suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG; } r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED); if (r) goto out_unlock; set_bit(DMF_POST_SUSPENDING, &md->flags); dm_table_postsuspend_targets(map); clear_bit(DMF_POST_SUSPENDING, &md->flags); out_unlock: mutex_unlock(&md->suspend_lock); return r; } static int __dm_resume(struct mapped_device *md, struct dm_table *map) { if (map) { int r = dm_table_resume_targets(map); if (r) return r; } dm_queue_flush(md); /* * Flushing deferred I/Os must be done after targets are resumed * so that mapping of targets can work correctly. * Request-based dm is queueing the deferred I/Os in its request_queue. */ if (test_and_clear_bit(DMF_QUEUE_STOPPED, &md->flags)) dm_start_queue(md->queue); unlock_fs(md); return 0; } int dm_resume(struct mapped_device *md) { int r; struct dm_table *map = NULL; retry: r = -EINVAL; mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); if (!dm_suspended_md(md)) goto out; if (dm_suspended_internally_md(md)) { /* already internally suspended, wait for internal resume */ mutex_unlock(&md->suspend_lock); r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); if (r) return r; goto retry; } map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); if (!map || !dm_table_get_size(map)) goto out; r = __dm_resume(md, map); if (r) goto out; clear_bit(DMF_SUSPENDED, &md->flags); out: mutex_unlock(&md->suspend_lock); return r; } /* * Internal suspend/resume works like userspace-driven suspend. It waits * until all bios finish and prevents issuing new bios to the target drivers. * It may be used only from the kernel. */ static void __dm_internal_suspend(struct mapped_device *md, unsigned int suspend_flags) { struct dm_table *map = NULL; lockdep_assert_held(&md->suspend_lock); if (md->internal_suspend_count++) return; /* nested internal suspend */ if (dm_suspended_md(md)) { set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); return; /* nest suspend */ } map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); /* * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is * supported. Properly supporting a TASK_INTERRUPTIBLE internal suspend * would require changing .presuspend to return an error -- avoid this * until there is a need for more elaborate variants of internal suspend. */ (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE, DMF_SUSPENDED_INTERNALLY); set_bit(DMF_POST_SUSPENDING, &md->flags); dm_table_postsuspend_targets(map); clear_bit(DMF_POST_SUSPENDING, &md->flags); } static void __dm_internal_resume(struct mapped_device *md) { int r; struct dm_table *map; BUG_ON(!md->internal_suspend_count); if (--md->internal_suspend_count) return; /* resume from nested internal suspend */ if (dm_suspended_md(md)) goto done; /* resume from nested suspend */ map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); r = __dm_resume(md, map); if (r) { /* * If a preresume method of some target failed, we are in a * tricky situation. We can't return an error to the caller. We * can't fake success because then the "resume" and * "postsuspend" methods would not be paired correctly, and it * would break various targets, for example it would cause list * corruption in the "origin" target. * * So, we fake normal suspend here, to make sure that the * "resume" and "postsuspend" methods will be paired correctly. */ DMERR("Preresume method failed: %d", r); set_bit(DMF_SUSPENDED, &md->flags); } done: clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); smp_mb__after_atomic(); wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY); } void dm_internal_suspend_noflush(struct mapped_device *md) { mutex_lock(&md->suspend_lock); __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG); mutex_unlock(&md->suspend_lock); } EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush); void dm_internal_resume(struct mapped_device *md) { mutex_lock(&md->suspend_lock); __dm_internal_resume(md); mutex_unlock(&md->suspend_lock); } EXPORT_SYMBOL_GPL(dm_internal_resume); /* * Fast variants of internal suspend/resume hold md->suspend_lock, * which prevents interaction with userspace-driven suspend. */ void dm_internal_suspend_fast(struct mapped_device *md) { mutex_lock(&md->suspend_lock); if (dm_suspended_md(md) || dm_suspended_internally_md(md)) return; set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); synchronize_srcu(&md->io_barrier); flush_workqueue(md->wq); dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL_GPL(dm_internal_suspend_fast); void dm_internal_resume_fast(struct mapped_device *md) { if (dm_suspended_md(md) || dm_suspended_internally_md(md)) goto done; dm_queue_flush(md); done: mutex_unlock(&md->suspend_lock); } EXPORT_SYMBOL_GPL(dm_internal_resume_fast); /* *--------------------------------------------------------------- * Event notification. *--------------------------------------------------------------- */ int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, unsigned int cookie, bool need_resize_uevent) { int r; unsigned int noio_flag; char udev_cookie[DM_COOKIE_LENGTH]; char *envp[3] = { NULL, NULL, NULL }; char **envpp = envp; if (cookie) { snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", DM_COOKIE_ENV_VAR_NAME, cookie); *envpp++ = udev_cookie; } if (need_resize_uevent) { *envpp++ = "RESIZE=1"; } noio_flag = memalloc_noio_save(); r = kobject_uevent_env(&disk_to_dev(md->disk)->kobj, action, envp); memalloc_noio_restore(noio_flag); return r; } uint32_t dm_next_uevent_seq(struct mapped_device *md) { return atomic_add_return(1, &md->uevent_seq); } uint32_t dm_get_event_nr(struct mapped_device *md) { return atomic_read(&md->event_nr); } int dm_wait_event(struct mapped_device *md, int event_nr) { return wait_event_interruptible(md->eventq, (event_nr != atomic_read(&md->event_nr))); } void dm_uevent_add(struct mapped_device *md, struct list_head *elist) { unsigned long flags; spin_lock_irqsave(&md->uevent_lock, flags); list_add(elist, &md->uevent_list); spin_unlock_irqrestore(&md->uevent_lock, flags); } /* * The gendisk is only valid as long as you have a reference * count on 'md'. */ struct gendisk *dm_disk(struct mapped_device *md) { return md->disk; } EXPORT_SYMBOL_GPL(dm_disk); struct kobject *dm_kobject(struct mapped_device *md) { return &md->kobj_holder.kobj; } struct mapped_device *dm_get_from_kobject(struct kobject *kobj) { struct mapped_device *md; md = container_of(kobj, struct mapped_device, kobj_holder.kobj); spin_lock(&_minor_lock); if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) { md = NULL; goto out; } dm_get(md); out: spin_unlock(&_minor_lock); return md; } int dm_suspended_md(struct mapped_device *md) { return test_bit(DMF_SUSPENDED, &md->flags); } static int dm_post_suspending_md(struct mapped_device *md) { return test_bit(DMF_POST_SUSPENDING, &md->flags); } int dm_suspended_internally_md(struct mapped_device *md) { return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); } int dm_test_deferred_remove_flag(struct mapped_device *md) { return test_bit(DMF_DEFERRED_REMOVE, &md->flags); } int dm_suspended(struct dm_target *ti) { return dm_suspended_md(ti->table->md); } EXPORT_SYMBOL_GPL(dm_suspended); int dm_post_suspending(struct dm_target *ti) { return dm_post_suspending_md(ti->table->md); } EXPORT_SYMBOL_GPL(dm_post_suspending); int dm_noflush_suspending(struct dm_target *ti) { return __noflush_suspending(ti->table->md); } EXPORT_SYMBOL_GPL(dm_noflush_suspending); void dm_free_md_mempools(struct dm_md_mempools *pools) { if (!pools) return; bioset_exit(&pools->bs); bioset_exit(&pools->io_bs); kfree(pools); } struct dm_blkdev_id { u8 *id; enum blk_unique_id type; }; static int __dm_get_unique_id(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { struct dm_blkdev_id *dm_id = data; const struct block_device_operations *fops = dev->bdev->bd_disk->fops; if (!fops->get_unique_id) return 0; return fops->get_unique_id(dev->bdev->bd_disk, dm_id->id, dm_id->type); } /* * Allow access to get_unique_id() for the first device returning a * non-zero result. Reasonable use expects all devices to have the * same unique id. */ static int dm_blk_get_unique_id(struct gendisk *disk, u8 *id, enum blk_unique_id type) { struct mapped_device *md = disk->private_data; struct dm_table *table; struct dm_target *ti; int ret = 0, srcu_idx; struct dm_blkdev_id dm_id = { .id = id, .type = type, }; table = dm_get_live_table(md, &srcu_idx); if (!table || !dm_table_get_size(table)) goto out; /* We only support devices that have a single target */ if (table->num_targets != 1) goto out; ti = dm_table_get_target(table, 0); if (!ti->type->iterate_devices) goto out; ret = ti->type->iterate_devices(ti, __dm_get_unique_id, &dm_id); out: dm_put_live_table(md, srcu_idx); return ret; } struct dm_pr { u64 old_key; u64 new_key; u32 flags; bool abort; bool fail_early; int ret; enum pr_type type; struct pr_keys *read_keys; struct pr_held_reservation *rsv; }; static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn, struct dm_pr *pr) { struct mapped_device *md = bdev->bd_disk->private_data; struct dm_table *table; struct dm_target *ti; int ret = -ENOTTY, srcu_idx; table = dm_get_live_table(md, &srcu_idx); if (!table || !dm_table_get_size(table)) goto out; /* We only support devices that have a single target */ if (table->num_targets != 1) goto out; ti = dm_table_get_target(table, 0); if (dm_suspended_md(md)) { ret = -EAGAIN; goto out; } ret = -EINVAL; if (!ti->type->iterate_devices) goto out; ti->type->iterate_devices(ti, fn, pr); ret = 0; out: dm_put_live_table(md, srcu_idx); return ret; } /* * For register / unregister we need to manually call out to every path. */ static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { struct dm_pr *pr = data; const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops; int ret; if (!ops || !ops->pr_register) { pr->ret = -EOPNOTSUPP; return -1; } ret = ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags); if (!ret) return 0; if (!pr->ret) pr->ret = ret; if (pr->fail_early) return -1; return 0; } static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key, u32 flags) { struct dm_pr pr = { .old_key = old_key, .new_key = new_key, .flags = flags, .fail_early = true, .ret = 0, }; int ret; ret = dm_call_pr(bdev, __dm_pr_register, &pr); if (ret) { /* Didn't even get to register a path */ return ret; } if (!pr.ret) return 0; ret = pr.ret; if (!new_key) return ret; /* unregister all paths if we failed to register any path */ pr.old_key = new_key; pr.new_key = 0; pr.flags = 0; pr.fail_early = false; (void) dm_call_pr(bdev, __dm_pr_register, &pr); return ret; } static int __dm_pr_reserve(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { struct dm_pr *pr = data; const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops; if (!ops || !ops->pr_reserve) { pr->ret = -EOPNOTSUPP; return -1; } pr->ret = ops->pr_reserve(dev->bdev, pr->old_key, pr->type, pr->flags); if (!pr->ret) return -1; return 0; } static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type, u32 flags) { struct dm_pr pr = { .old_key = key, .flags = flags, .type = type, .fail_early = false, .ret = 0, }; int ret; ret = dm_call_pr(bdev, __dm_pr_reserve, &pr); if (ret) return ret; return pr.ret; } /* * If there is a non-All Registrants type of reservation, the release must be * sent down the holding path. For the cases where there is no reservation or * the path is not the holder the device will also return success, so we must * try each path to make sure we got the correct path. */ static int __dm_pr_release(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { struct dm_pr *pr = data; const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops; if (!ops || !ops->pr_release) { pr->ret = -EOPNOTSUPP; return -1; } pr->ret = ops->pr_release(dev->bdev, pr->old_key, pr->type); if (pr->ret) return -1; return 0; } static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type) { struct dm_pr pr = { .old_key = key, .type = type, .fail_early = false, }; int ret; ret = dm_call_pr(bdev, __dm_pr_release, &pr); if (ret) return ret; return pr.ret; } static int __dm_pr_preempt(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { struct dm_pr *pr = data; const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops; if (!ops || !ops->pr_preempt) { pr->ret = -EOPNOTSUPP; return -1; } pr->ret = ops->pr_preempt(dev->bdev, pr->old_key, pr->new_key, pr->type, pr->abort); if (!pr->ret) return -1; return 0; } static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key, enum pr_type type, bool abort) { struct dm_pr pr = { .new_key = new_key, .old_key = old_key, .type = type, .fail_early = false, }; int ret; ret = dm_call_pr(bdev, __dm_pr_preempt, &pr); if (ret) return ret; return pr.ret; } static int dm_pr_clear(struct block_device *bdev, u64 key) { struct mapped_device *md = bdev->bd_disk->private_data; const struct pr_ops *ops; int r, srcu_idx; bool forward = true; /* Not a real ioctl, but targets must not interpret non-DM ioctls */ r = dm_prepare_ioctl(md, &srcu_idx, &bdev, 0, 0, &forward); if (r < 0) goto out; WARN_ON_ONCE(!forward); ops = bdev->bd_disk->fops->pr_ops; if (ops && ops->pr_clear) r = ops->pr_clear(bdev, key); else r = -EOPNOTSUPP; out: dm_unprepare_ioctl(md, srcu_idx); return r; } static int __dm_pr_read_keys(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { struct dm_pr *pr = data; const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops; if (!ops || !ops->pr_read_keys) { pr->ret = -EOPNOTSUPP; return -1; } pr->ret = ops->pr_read_keys(dev->bdev, pr->read_keys); if (!pr->ret) return -1; return 0; } static int dm_pr_read_keys(struct block_device *bdev, struct pr_keys *keys) { struct dm_pr pr = { .read_keys = keys, }; int ret; ret = dm_call_pr(bdev, __dm_pr_read_keys, &pr); if (ret) return ret; return pr.ret; } static int __dm_pr_read_reservation(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { struct dm_pr *pr = data; const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops; if (!ops || !ops->pr_read_reservation) { pr->ret = -EOPNOTSUPP; return -1; } pr->ret = ops->pr_read_reservation(dev->bdev, pr->rsv); if (!pr->ret) return -1; return 0; } static int dm_pr_read_reservation(struct block_device *bdev, struct pr_held_reservation *rsv) { struct dm_pr pr = { .rsv = rsv, }; int ret; ret = dm_call_pr(bdev, __dm_pr_read_reservation, &pr); if (ret) return ret; return pr.ret; } static const struct pr_ops dm_pr_ops = { .pr_register = dm_pr_register, .pr_reserve = dm_pr_reserve, .pr_release = dm_pr_release, .pr_preempt = dm_pr_preempt, .pr_clear = dm_pr_clear, .pr_read_keys = dm_pr_read_keys, .pr_read_reservation = dm_pr_read_reservation, }; static const struct block_device_operations dm_blk_dops = { .submit_bio = dm_submit_bio, .poll_bio = dm_poll_bio, .open = dm_blk_open, .release = dm_blk_close, .ioctl = dm_blk_ioctl, .getgeo = dm_blk_getgeo, .report_zones = dm_blk_report_zones, .get_unique_id = dm_blk_get_unique_id, .pr_ops = &dm_pr_ops, .owner = THIS_MODULE }; static const struct block_device_operations dm_rq_blk_dops = { .open = dm_blk_open, .release = dm_blk_close, .ioctl = dm_blk_ioctl, .getgeo = dm_blk_getgeo, .get_unique_id = dm_blk_get_unique_id, .pr_ops = &dm_pr_ops, .owner = THIS_MODULE }; static const struct dax_operations dm_dax_ops = { .direct_access = dm_dax_direct_access, .zero_page_range = dm_dax_zero_page_range, .recovery_write = dm_dax_recovery_write, }; /* * module hooks */ module_init(dm_init); module_exit(dm_exit); module_param(major, uint, 0); MODULE_PARM_DESC(major, "The major number of the device mapper"); module_param(reserved_bio_based_ios, uint, 0644); MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools"); module_param(dm_numa_node, int, 0644); MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations"); module_param(swap_bios, int, 0644); MODULE_PARM_DESC(swap_bios, "Maximum allowed inflight swap IOs"); MODULE_DESCRIPTION(DM_NAME " driver"); MODULE_AUTHOR("Joe Thornber <dm-devel@lists.linux.dev>"); MODULE_LICENSE("GPL"); |
| 13803 16116 17 77 74 1392 13 16115 13803 16206 16116 16106 1375 1378 1379 75 61 75 77 77 75 34 34 34 34 34 34 34 77 77 76 44 38 29 28 3 29 44 76 34 2 77 77 77 77 74 77 77 77 74 77 77 77 77 76 76 2 2 74 77 77 15 15 15 15 15 15 15 15 15 15 15 1 1 1 1 1 1 1 1 1 1 1 2 11952 11952 11963 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 | // SPDX-License-Identifier: GPL-2.0-only #include "cgroup-internal.h" #include <linux/sched/cputime.h> #include <linux/bpf.h> #include <linux/btf.h> #include <linux/btf_ids.h> #include <trace/events/cgroup.h> static DEFINE_SPINLOCK(rstat_base_lock); static DEFINE_PER_CPU(struct llist_head, rstat_backlog_list); static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu); /* * Determines whether a given css can participate in rstat. * css's that are cgroup::self use rstat for base stats. * Other css's associated with a subsystem use rstat only when * they define the ss->css_rstat_flush callback. */ static inline bool css_uses_rstat(struct cgroup_subsys_state *css) { return css_is_self(css) || css->ss->css_rstat_flush != NULL; } static struct css_rstat_cpu *css_rstat_cpu( struct cgroup_subsys_state *css, int cpu) { return per_cpu_ptr(css->rstat_cpu, cpu); } static struct cgroup_rstat_base_cpu *cgroup_rstat_base_cpu( struct cgroup *cgrp, int cpu) { return per_cpu_ptr(cgrp->rstat_base_cpu, cpu); } static spinlock_t *ss_rstat_lock(struct cgroup_subsys *ss) { if (ss) return &ss->rstat_ss_lock; return &rstat_base_lock; } static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu) { if (ss) return per_cpu_ptr(ss->lhead, cpu); return per_cpu_ptr(&rstat_backlog_list, cpu); } /** * css_rstat_updated - keep track of updated rstat_cpu * @css: target cgroup subsystem state * @cpu: cpu on which rstat_cpu was updated * * Atomically inserts the css in the ss's llist for the given cpu. This is * reentrant safe i.e. safe against softirq, hardirq and nmi. The ss's llist * will be processed at the flush time to create the update tree. * * NOTE: if the user needs the guarantee that the updater either add itself in * the lockless list or the concurrent flusher flushes its updated stats, a * memory barrier is needed before the call to css_rstat_updated() i.e. a * barrier after updating the per-cpu stats and before calling * css_rstat_updated(). */ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) { struct llist_head *lhead; struct css_rstat_cpu *rstatc; struct llist_node *self; /* * Since bpf programs can call this function, prevent access to * uninitialized rstat pointers. */ if (!css_uses_rstat(css)) return; lockdep_assert_preemption_disabled(); /* * For archs withnot nmi safe cmpxchg or percpu ops support, ignore * the requests from nmi context. */ if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) || !IS_ENABLED(CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS)) && in_nmi()) return; rstatc = css_rstat_cpu(css, cpu); /* * If already on list return. This check is racy and smp_mb() is needed * to pair it with the smp_mb() in css_process_update_tree() if the * guarantee that the updated stats are visible to concurrent flusher is * needed. */ if (llist_on_list(&rstatc->lnode)) return; /* * This function can be renentered by irqs and nmis for the same cgroup * and may try to insert the same per-cpu lnode into the llist. Note * that llist_add() does not protect against such scenarios. In addition * this same per-cpu lnode can be modified through init_llist_node() * from css_rstat_flush() running on a different CPU. * * To protect against such stacked contexts of irqs/nmis, we use the * fact that lnode points to itself when not on a list and then use * try_cmpxchg() to atomically set to NULL to select the winner * which will call llist_add(). The losers can assume the insertion is * successful and the winner will eventually add the per-cpu lnode to * the llist. * * Please note that we can not use this_cpu_cmpxchg() here as on some * archs it is not safe against modifications from multiple CPUs. */ self = &rstatc->lnode; if (!try_cmpxchg(&rstatc->lnode.next, &self, NULL)) return; lhead = ss_lhead_cpu(css->ss, cpu); llist_add(&rstatc->lnode, lhead); } static void __css_process_update_tree(struct cgroup_subsys_state *css, int cpu) { /* put @css and all ancestors on the corresponding updated lists */ while (true) { struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu); struct cgroup_subsys_state *parent = css->parent; struct css_rstat_cpu *prstatc; /* * Both additions and removals are bottom-up. If a cgroup * is already in the tree, all ancestors are. */ if (rstatc->updated_next) break; /* Root has no parent to link it to, but mark it busy */ if (!parent) { rstatc->updated_next = css; break; } prstatc = css_rstat_cpu(parent, cpu); rstatc->updated_next = prstatc->updated_children; prstatc->updated_children = css; css = parent; } } static void css_process_update_tree(struct cgroup_subsys *ss, int cpu) { struct llist_head *lhead = ss_lhead_cpu(ss, cpu); struct llist_node *lnode; while ((lnode = llist_del_first_init(lhead))) { struct css_rstat_cpu *rstatc; /* * smp_mb() is needed here (more specifically in between * init_llist_node() and per-cpu stats flushing) if the * guarantee is required by a rstat user where etiher the * updater should add itself on the lockless list or the * flusher flush the stats updated by the updater who have * observed that they are already on the list. The * corresponding barrier pair for this one should be before * css_rstat_updated() by the user. * * For now, there aren't any such user, so not adding the * barrier here but if such a use-case arise, please add * smp_mb() here. */ rstatc = container_of(lnode, struct css_rstat_cpu, lnode); __css_process_update_tree(rstatc->owner, cpu); } } /** * css_rstat_push_children - push children css's into the given list * @head: current head of the list (= subtree root) * @child: first child of the root * @cpu: target cpu * Return: A new singly linked list of css's to be flushed * * Iteratively traverse down the css_rstat_cpu updated tree level by * level and push all the parents first before their next level children * into a singly linked list via the rstat_flush_next pointer built from the * tail backward like "pushing" css's into a stack. The root is pushed by * the caller. */ static struct cgroup_subsys_state *css_rstat_push_children( struct cgroup_subsys_state *head, struct cgroup_subsys_state *child, int cpu) { struct cgroup_subsys_state *cnext = child; /* Next head of child css level */ struct cgroup_subsys_state *ghead = NULL; /* Head of grandchild css level */ struct cgroup_subsys_state *parent, *grandchild; struct css_rstat_cpu *crstatc; child->rstat_flush_next = NULL; /* * The subsystem rstat lock must be held for the whole duration from * here as the rstat_flush_next list is being constructed to when * it is consumed later in css_rstat_flush(). */ lockdep_assert_held(ss_rstat_lock(head->ss)); /* * Notation: -> updated_next pointer * => rstat_flush_next pointer * * Assuming the following sample updated_children lists: * P: C1 -> C2 -> P * C1: G11 -> G12 -> C1 * C2: G21 -> G22 -> C2 * * After 1st iteration: * head => C2 => C1 => NULL * ghead => G21 => G11 => NULL * * After 2nd iteration: * head => G12 => G11 => G22 => G21 => C2 => C1 => NULL */ next_level: while (cnext) { child = cnext; cnext = child->rstat_flush_next; parent = child->parent; /* updated_next is parent cgroup terminated if !NULL */ while (child != parent) { child->rstat_flush_next = head; head = child; crstatc = css_rstat_cpu(child, cpu); grandchild = crstatc->updated_children; if (grandchild != child) { /* Push the grand child to the next level */ crstatc->updated_children = child; grandchild->rstat_flush_next = ghead; ghead = grandchild; } child = crstatc->updated_next; crstatc->updated_next = NULL; } } if (ghead) { cnext = ghead; ghead = NULL; goto next_level; } return head; } /** * css_rstat_updated_list - build a list of updated css's to be flushed * @root: root of the css subtree to traverse * @cpu: target cpu * Return: A singly linked list of css's to be flushed * * Walks the updated rstat_cpu tree on @cpu from @root. During traversal, * each returned css is unlinked from the updated tree. * * The only ordering guarantee is that, for a parent and a child pair * covered by a given traversal, the child is before its parent in * the list. * * Note that updated_children is self terminated and points to a list of * child css's if not empty. Whereas updated_next is like a sibling link * within the children list and terminated by the parent css. An exception * here is the css root whose updated_next can be self terminated. */ static struct cgroup_subsys_state *css_rstat_updated_list( struct cgroup_subsys_state *root, int cpu) { struct css_rstat_cpu *rstatc = css_rstat_cpu(root, cpu); struct cgroup_subsys_state *head = NULL, *parent, *child; css_process_update_tree(root->ss, cpu); /* Return NULL if this subtree is not on-list */ if (!rstatc->updated_next) return NULL; /* * Unlink @root from its parent. As the updated_children list is * singly linked, we have to walk it to find the removal point. */ parent = root->parent; if (parent) { struct css_rstat_cpu *prstatc; struct cgroup_subsys_state **nextp; prstatc = css_rstat_cpu(parent, cpu); nextp = &prstatc->updated_children; while (*nextp != root) { struct css_rstat_cpu *nrstatc; nrstatc = css_rstat_cpu(*nextp, cpu); WARN_ON_ONCE(*nextp == parent); nextp = &nrstatc->updated_next; } *nextp = rstatc->updated_next; } rstatc->updated_next = NULL; /* Push @root to the list first before pushing the children */ head = root; root->rstat_flush_next = NULL; child = rstatc->updated_children; rstatc->updated_children = root; if (child != root) head = css_rstat_push_children(head, child, cpu); return head; } /* * A hook for bpf stat collectors to attach to and flush their stats. * Together with providing bpf kfuncs for css_rstat_updated() and * css_rstat_flush(), this enables a complete workflow where bpf progs that * collect cgroup stats can integrate with rstat for efficient flushing. * * A static noinline declaration here could cause the compiler to optimize away * the function. A global noinline declaration will keep the definition, but may * optimize away the callsite. Therefore, __weak is needed to ensure that the * call is still emitted, by telling the compiler that we don't know what the * function might eventually be. */ __bpf_hook_start(); __weak noinline void bpf_rstat_flush(struct cgroup *cgrp, struct cgroup *parent, int cpu) { } __bpf_hook_end(); /* * Helper functions for locking. * * This makes it easier to diagnose locking issues and contention in * production environments. The parameter @cpu_in_loop indicate lock * was released and re-taken when collection data from the CPUs. The * value -1 is used when obtaining the main lock else this is the CPU * number processed last. */ static inline void __css_rstat_lock(struct cgroup_subsys_state *css, int cpu_in_loop) __acquires(ss_rstat_lock(css->ss)) { struct cgroup *cgrp = css->cgroup; spinlock_t *lock; bool contended; lock = ss_rstat_lock(css->ss); contended = !spin_trylock_irq(lock); if (contended) { trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended); spin_lock_irq(lock); } trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended); } static inline void __css_rstat_unlock(struct cgroup_subsys_state *css, int cpu_in_loop) __releases(ss_rstat_lock(css->ss)) { struct cgroup *cgrp = css->cgroup; spinlock_t *lock; lock = ss_rstat_lock(css->ss); trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false); spin_unlock_irq(lock); } /** * css_rstat_flush - flush stats in @css's rstat subtree * @css: target cgroup subsystem state * * Collect all per-cpu stats in @css's subtree into the global counters * and propagate them upwards. After this function returns, all rstat * nodes in the subtree have up-to-date ->stat. * * This also gets all rstat nodes in the subtree including @css off the * ->updated_children lists. * * This function may block. */ __bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css) { int cpu; bool is_self = css_is_self(css); /* * Since bpf programs can call this function, prevent access to * uninitialized rstat pointers. */ if (!css_uses_rstat(css)) return; might_sleep(); for_each_possible_cpu(cpu) { struct cgroup_subsys_state *pos; /* Reacquire for each CPU to avoid disabling IRQs too long */ __css_rstat_lock(css, cpu); pos = css_rstat_updated_list(css, cpu); for (; pos; pos = pos->rstat_flush_next) { if (is_self) { cgroup_base_stat_flush(pos->cgroup, cpu); bpf_rstat_flush(pos->cgroup, cgroup_parent(pos->cgroup), cpu); } else pos->ss->css_rstat_flush(pos, cpu); } __css_rstat_unlock(css, cpu); if (!cond_resched()) cpu_relax(); } } int css_rstat_init(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; int cpu; bool is_self = css_is_self(css); if (is_self) { /* the root cgrp has rstat_base_cpu preallocated */ if (!cgrp->rstat_base_cpu) { cgrp->rstat_base_cpu = alloc_percpu(struct cgroup_rstat_base_cpu); if (!cgrp->rstat_base_cpu) return -ENOMEM; } } else if (css->ss->css_rstat_flush == NULL) return 0; /* the root cgrp's self css has rstat_cpu preallocated */ if (!css->rstat_cpu) { css->rstat_cpu = alloc_percpu(struct css_rstat_cpu); if (!css->rstat_cpu) { if (is_self) free_percpu(cgrp->rstat_base_cpu); return -ENOMEM; } } /* ->updated_children list is self terminated */ for_each_possible_cpu(cpu) { struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu); rstatc->owner = rstatc->updated_children = css; init_llist_node(&rstatc->lnode); if (is_self) { struct cgroup_rstat_base_cpu *rstatbc; rstatbc = cgroup_rstat_base_cpu(cgrp, cpu); u64_stats_init(&rstatbc->bsync); } } return 0; } void css_rstat_exit(struct cgroup_subsys_state *css) { int cpu; if (!css_uses_rstat(css)) return; if (!css->rstat_cpu) return; css_rstat_flush(css); /* sanity check */ for_each_possible_cpu(cpu) { struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu); if (WARN_ON_ONCE(rstatc->updated_children != css) || WARN_ON_ONCE(rstatc->updated_next)) return; } if (css_is_self(css)) { struct cgroup *cgrp = css->cgroup; free_percpu(cgrp->rstat_base_cpu); cgrp->rstat_base_cpu = NULL; } free_percpu(css->rstat_cpu); css->rstat_cpu = NULL; } /** * ss_rstat_init - subsystem-specific rstat initialization * @ss: target subsystem * * If @ss is NULL, the static locks associated with the base stats * are initialized. If @ss is non-NULL, the subsystem-specific locks * are initialized. */ int __init ss_rstat_init(struct cgroup_subsys *ss) { int cpu; if (ss) { ss->lhead = alloc_percpu(struct llist_head); if (!ss->lhead) return -ENOMEM; } spin_lock_init(ss_rstat_lock(ss)); for_each_possible_cpu(cpu) init_llist_head(ss_lhead_cpu(ss, cpu)); return 0; } /* * Functions for cgroup basic resource statistics implemented on top of * rstat. */ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat, struct cgroup_base_stat *src_bstat) { dst_bstat->cputime.utime += src_bstat->cputime.utime; dst_bstat->cputime.stime += src_bstat->cputime.stime; dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime; #ifdef CONFIG_SCHED_CORE dst_bstat->forceidle_sum += src_bstat->forceidle_sum; #endif dst_bstat->ntime += src_bstat->ntime; } static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat, struct cgroup_base_stat *src_bstat) { dst_bstat->cputime.utime -= src_bstat->cputime.utime; dst_bstat->cputime.stime -= src_bstat->cputime.stime; dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime; #ifdef CONFIG_SCHED_CORE dst_bstat->forceidle_sum -= src_bstat->forceidle_sum; #endif dst_bstat->ntime -= src_bstat->ntime; } static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) { struct cgroup_rstat_base_cpu *rstatbc = cgroup_rstat_base_cpu(cgrp, cpu); struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_rstat_base_cpu *prstatbc; struct cgroup_base_stat delta; unsigned seq; /* Root-level stats are sourced from system-wide CPU stats */ if (!parent) return; /* fetch the current per-cpu values */ do { seq = __u64_stats_fetch_begin(&rstatbc->bsync); delta = rstatbc->bstat; } while (__u64_stats_fetch_retry(&rstatbc->bsync, seq)); /* propagate per-cpu delta to cgroup and per-cpu global statistics */ cgroup_base_stat_sub(&delta, &rstatbc->last_bstat); cgroup_base_stat_add(&cgrp->bstat, &delta); cgroup_base_stat_add(&rstatbc->last_bstat, &delta); cgroup_base_stat_add(&rstatbc->subtree_bstat, &delta); /* propagate cgroup and per-cpu global delta to parent (unless that's root) */ if (cgroup_parent(parent)) { delta = cgrp->bstat; cgroup_base_stat_sub(&delta, &cgrp->last_bstat); cgroup_base_stat_add(&parent->bstat, &delta); cgroup_base_stat_add(&cgrp->last_bstat, &delta); delta = rstatbc->subtree_bstat; prstatbc = cgroup_rstat_base_cpu(parent, cpu); cgroup_base_stat_sub(&delta, &rstatbc->last_subtree_bstat); cgroup_base_stat_add(&prstatbc->subtree_bstat, &delta); cgroup_base_stat_add(&rstatbc->last_subtree_bstat, &delta); } } static struct cgroup_rstat_base_cpu * cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags) { struct cgroup_rstat_base_cpu *rstatbc; rstatbc = get_cpu_ptr(cgrp->rstat_base_cpu); *flags = u64_stats_update_begin_irqsave(&rstatbc->bsync); return rstatbc; } static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp, struct cgroup_rstat_base_cpu *rstatbc, unsigned long flags) { u64_stats_update_end_irqrestore(&rstatbc->bsync, flags); css_rstat_updated(&cgrp->self, smp_processor_id()); put_cpu_ptr(rstatbc); } void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) { struct cgroup_rstat_base_cpu *rstatbc; unsigned long flags; rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); rstatbc->bstat.cputime.sum_exec_runtime += delta_exec; cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags); } void __cgroup_account_cputime_field(struct cgroup *cgrp, enum cpu_usage_stat index, u64 delta_exec) { struct cgroup_rstat_base_cpu *rstatbc; unsigned long flags; rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); switch (index) { case CPUTIME_NICE: rstatbc->bstat.ntime += delta_exec; fallthrough; case CPUTIME_USER: rstatbc->bstat.cputime.utime += delta_exec; break; case CPUTIME_SYSTEM: case CPUTIME_IRQ: case CPUTIME_SOFTIRQ: rstatbc->bstat.cputime.stime += delta_exec; break; #ifdef CONFIG_SCHED_CORE case CPUTIME_FORCEIDLE: rstatbc->bstat.forceidle_sum += delta_exec; break; #endif default: break; } cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags); } /* * compute the cputime for the root cgroup by getting the per cpu data * at a global level, then categorizing the fields in a manner consistent * with how it is done by __cgroup_account_cputime_field for each bit of * cpu time attributed to a cgroup. */ static void root_cgroup_cputime(struct cgroup_base_stat *bstat) { struct task_cputime *cputime = &bstat->cputime; int i; memset(bstat, 0, sizeof(*bstat)); for_each_possible_cpu(i) { struct kernel_cpustat kcpustat; u64 *cpustat = kcpustat.cpustat; u64 user = 0; u64 sys = 0; kcpustat_cpu_fetch(&kcpustat, i); user += cpustat[CPUTIME_USER]; user += cpustat[CPUTIME_NICE]; cputime->utime += user; sys += cpustat[CPUTIME_SYSTEM]; sys += cpustat[CPUTIME_IRQ]; sys += cpustat[CPUTIME_SOFTIRQ]; cputime->stime += sys; cputime->sum_exec_runtime += user; cputime->sum_exec_runtime += sys; #ifdef CONFIG_SCHED_CORE bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE]; #endif bstat->ntime += cpustat[CPUTIME_NICE]; } } static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat *bstat) { #ifdef CONFIG_SCHED_CORE u64 forceidle_time = bstat->forceidle_sum; do_div(forceidle_time, NSEC_PER_USEC); seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time); #endif } void cgroup_base_stat_cputime_show(struct seq_file *seq) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct cgroup_base_stat bstat; if (cgroup_parent(cgrp)) { css_rstat_flush(&cgrp->self); __css_rstat_lock(&cgrp->self, -1); bstat = cgrp->bstat; cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &bstat.cputime.utime, &bstat.cputime.stime); __css_rstat_unlock(&cgrp->self, -1); } else { root_cgroup_cputime(&bstat); } do_div(bstat.cputime.sum_exec_runtime, NSEC_PER_USEC); do_div(bstat.cputime.utime, NSEC_PER_USEC); do_div(bstat.cputime.stime, NSEC_PER_USEC); do_div(bstat.ntime, NSEC_PER_USEC); seq_printf(seq, "usage_usec %llu\n" "user_usec %llu\n" "system_usec %llu\n" "nice_usec %llu\n", bstat.cputime.sum_exec_runtime, bstat.cputime.utime, bstat.cputime.stime, bstat.ntime); cgroup_force_idle_show(seq, &bstat); } /* Add bpf kfuncs for css_rstat_updated() and css_rstat_flush() */ BTF_KFUNCS_START(bpf_rstat_kfunc_ids) BTF_ID_FLAGS(func, css_rstat_updated) BTF_ID_FLAGS(func, css_rstat_flush, KF_SLEEPABLE) BTF_KFUNCS_END(bpf_rstat_kfunc_ids) static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = { .owner = THIS_MODULE, .set = &bpf_rstat_kfunc_ids, }; static int __init bpf_rstat_kfunc_init(void) { return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_rstat_kfunc_set); } late_initcall(bpf_rstat_kfunc_init); |
| 14 17 16 14 14 14 3 17 1 1 1 435 435 435 439 460 460 460 460 460 460 460 460 460 460 460 460 460 438 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux NET3: IP/IP protocol decoder. * * Authors: * Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 * * Fixes: * Alan Cox : Merged and made usable non modular (its so tiny its silly as * a module taking up 2 pages). * Alan Cox : Fixed bug with 1.3.18 and IPIP not working (now needs to set skb->h.iph) * to keep ip_forward happy. * Alan Cox : More fixes for 1.3.21, and firewall fix. Maybe this will work soon 8). * Kai Schulte : Fixed #defines for IP_FIREWALL->FIREWALL * David Woodhouse : Perform some basic ICMP handling. * IPIP Routing without decapsulation. * Carlos Picoto : GRE over IP support * Alexey Kuznetsov: Reworked. Really, now it is truncated version of ipv4/ip_gre.c. * I do not want to merge them together. */ /* tunnel.c: an IP tunnel driver The purpose of this driver is to provide an IP tunnel through which you can tunnel network traffic transparently across subnets. This was written by looking at Nick Holloway's dummy driver Thanks for the great code! -Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 Minor tweaks: Cleaned up the code a little and added some pre-1.3.0 tweaks. dev->hard_header/hard_header_len changed to use no headers. Comments/bracketing tweaked. Made the tunnels use dev->name not tunnel: when error reporting. Added tx_dropped stat -Alan Cox (alan@lxorguk.ukuu.org.uk) 21 March 95 Reworked: Changed to tunnel to destination gateway in addition to the tunnel's pointopoint address Almost completely rewritten Note: There is currently no firewall or ICMP handling done. -Sam Lantinga (slouken@cs.ucdavis.edu) 02/13/96 */ /* Things I wish I had known when writing the tunnel driver: When the tunnel_xmit() function is called, the skb contains the packet to be sent (plus a great deal of extra info), and dev contains the tunnel device that _we_ are. When we are passed a packet, we are expected to fill in the source address with our source IP address. What is the proper way to allocate, copy and free a buffer? After you allocate it, it is a "0 length" chunk of memory starting at zero. If you want to add headers to the buffer later, you'll have to call "skb_reserve(skb, amount)" with the amount of memory you want reserved. Then, you call "skb_put(skb, amount)" with the amount of space you want in the buffer. skb_put() returns a pointer to the top (#0) of that buffer. skb->len is set to the amount of space you have "allocated" with skb_put(). You can then write up to skb->len bytes to that buffer. If you need more, you can call skb_put() again with the additional amount of space you need. You can find out how much more space you can allocate by calling "skb_tailroom(skb)". Now, to add header space, call "skb_push(skb, header_len)". This creates space at the beginning of the buffer and returns a pointer to this new space. If later you need to strip a header from a buffer, call "skb_pull(skb, header_len)". skb_headroom() will return how much space is left at the top of the buffer (before the main data). Remember, this headroom space must be reserved before the skb_put() function is called. */ /* This version of net/ipv4/ipip.c is cloned of net/ipv4/ip_gre.c For comments look at net/ipv4/ip_gre.c --ANK */ #include <linux/capability.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/in.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/if_arp.h> #include <linux/init.h> #include <linux/netfilter_ipv4.h> #include <linux/if_ether.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> #include <net/ip_tunnels.h> #include <net/inet_ecn.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/dst_metadata.h> static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static unsigned int ipip_net_id __read_mostly; static int ipip_tunnel_init(struct net_device *dev); static struct rtnl_link_ops ipip_link_ops __read_mostly; static int ipip_err(struct sk_buff *skb, u32 info) { /* All the routers (except for Linux) return only * 8 bytes of packet payload. It means, that precise relaying of * ICMP in the real Internet is absolutely infeasible. */ struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); const struct iphdr *iph = (const struct iphdr *)skb->data; IP_TUNNEL_DECLARE_FLAGS(flags) = { }; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct ip_tunnel *t; int err = 0; __set_bit(IP_TUNNEL_NO_KEY_BIT, flags); t = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->daddr, iph->saddr, 0); if (!t) { err = -ENOENT; goto out; } switch (type) { case ICMP_DEST_UNREACH: switch (code) { case ICMP_SR_FAILED: /* Impossible event. */ goto out; default: /* All others are translated to HOST_UNREACH. * rfc2003 contains "deep thoughts" about NET_UNREACH, * I believe they are just ether pollution. --ANK */ break; } break; case ICMP_TIME_EXCEEDED: if (code != ICMP_EXC_TTL) goto out; break; case ICMP_REDIRECT: break; default: goto out; } if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { ipv4_update_pmtu(skb, net, info, t->parms.link, iph->protocol); goto out; } if (type == ICMP_REDIRECT) { ipv4_redirect(skb, net, t->parms.link, iph->protocol); goto out; } if (t->parms.iph.daddr == 0) { err = -ENOENT; goto out; } if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) goto out; if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) t->err_count++; else t->err_count = 1; t->err_time = jiffies; out: return err; } static const struct tnl_ptk_info ipip_tpi = { /* no tunnel info required for ipip. */ .proto = htons(ETH_P_IP), }; #if IS_ENABLED(CONFIG_MPLS) static const struct tnl_ptk_info mplsip_tpi = { /* no tunnel info required for mplsip. */ .proto = htons(ETH_P_MPLS_UC), }; #endif static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto) { struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); IP_TUNNEL_DECLARE_FLAGS(flags) = { }; struct metadata_dst *tun_dst = NULL; struct ip_tunnel *tunnel; const struct iphdr *iph; __set_bit(IP_TUNNEL_NO_KEY_BIT, flags); iph = ip_hdr(skb); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->saddr, iph->daddr, 0); if (tunnel) { const struct tnl_ptk_info *tpi; if (tunnel->parms.iph.protocol != ipproto && tunnel->parms.iph.protocol != 0) goto drop; if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; #if IS_ENABLED(CONFIG_MPLS) if (ipproto == IPPROTO_MPLS) tpi = &mplsip_tpi; else #endif tpi = &ipip_tpi; if (iptunnel_pull_header(skb, 0, tpi->proto, false)) goto drop; if (tunnel->collect_md) { ip_tunnel_flags_zero(flags); tun_dst = ip_tun_rx_dst(skb, flags, 0, 0); if (!tun_dst) return 0; ip_tunnel_md_udp_encap(skb, &tun_dst->u.tun_info); } skb_reset_mac_header(skb); return ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); } return -1; drop: kfree_skb(skb); return 0; } static int ipip_rcv(struct sk_buff *skb) { return ipip_tunnel_rcv(skb, IPPROTO_IPIP); } #if IS_ENABLED(CONFIG_MPLS) static int mplsip_rcv(struct sk_buff *skb) { return ipip_tunnel_rcv(skb, IPPROTO_MPLS); } #endif /* * This function assumes it is being called from dev_queue_xmit() * and that skb is filled properly by that function. */ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tiph = &tunnel->parms.iph; u8 ipproto; if (!pskb_inet_may_pull(skb)) goto tx_error; switch (skb->protocol) { case htons(ETH_P_IP): ipproto = IPPROTO_IPIP; break; #if IS_ENABLED(CONFIG_MPLS) case htons(ETH_P_MPLS_UC): ipproto = IPPROTO_MPLS; break; #endif default: goto tx_error; } if (tiph->protocol != ipproto && tiph->protocol != 0) goto tx_error; if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4)) goto tx_error; skb_set_inner_ipproto(skb, ipproto); if (tunnel->collect_md) ip_md_tunnel_xmit(skb, dev, ipproto, 0); else ip_tunnel_xmit(skb, dev, tiph, ipproto); return NETDEV_TX_OK; tx_error: kfree_skb(skb); DEV_STATS_INC(dev, tx_errors); return NETDEV_TX_OK; } static bool ipip_tunnel_ioctl_verify_protocol(u8 ipproto) { switch (ipproto) { case 0: case IPPROTO_IPIP: #if IS_ENABLED(CONFIG_MPLS) case IPPROTO_MPLS: #endif return true; } return false; } static int ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd) { if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { if (p->iph.version != 4 || !ipip_tunnel_ioctl_verify_protocol(p->iph.protocol) || p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF))) return -EINVAL; } p->i_key = p->o_key = 0; ip_tunnel_flags_zero(p->i_flags); ip_tunnel_flags_zero(p->o_flags); return ip_tunnel_ctl(dev, p, cmd); } static int ipip_fill_forward_path(struct net_device_path_ctx *ctx, struct net_device_path *path) { struct ip_tunnel *tunnel = netdev_priv(ctx->dev); const struct iphdr *tiph = &tunnel->parms.iph; struct rtable *rt; rt = ip_route_output(dev_net(ctx->dev), tiph->daddr, 0, 0, 0, RT_SCOPE_UNIVERSE); if (IS_ERR(rt)) return PTR_ERR(rt); path->type = DEV_PATH_TUN; path->tun.src_v4.s_addr = tiph->saddr; path->tun.dst_v4.s_addr = tiph->daddr; path->tun.l3_proto = IPPROTO_IPIP; path->dev = ctx->dev; ctx->dev = rt->dst.dev; ip_rt_put(rt); return 0; } static const struct net_device_ops ipip_netdev_ops = { .ndo_init = ipip_tunnel_init, .ndo_uninit = ip_tunnel_uninit, .ndo_start_xmit = ipip_tunnel_xmit, .ndo_siocdevprivate = ip_tunnel_siocdevprivate, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_tunnel_ctl = ipip_tunnel_ctl, .ndo_fill_forward_path = ipip_fill_forward_path, }; #define IPIP_FEATURES (NETIF_F_SG | \ NETIF_F_FRAGLIST | \ NETIF_F_HIGHDMA | \ NETIF_F_GSO_SOFTWARE | \ NETIF_F_HW_CSUM) static void ipip_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &ipip_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->type = ARPHRD_TUNNEL; dev->flags = IFF_NOARP; dev->addr_len = 4; dev->lltx = true; netif_keep_dst(dev); dev->features |= IPIP_FEATURES; dev->hw_features |= IPIP_FEATURES; ip_tunnel_setup(dev, ipip_net_id); } static int ipip_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); __dev_addr_set(dev, &tunnel->parms.iph.saddr, 4); memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); tunnel->tun_hlen = 0; tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; return ip_tunnel_init(dev); } static int ipip_tunnel_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { u8 proto; if (!data || !data[IFLA_IPTUN_PROTO]) return 0; proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); if (proto != IPPROTO_IPIP && proto != IPPROTO_MPLS && proto != 0) return -EINVAL; return 0; } static void ipip_netlink_parms(struct nlattr *data[], struct ip_tunnel_parm_kern *parms, bool *collect_md, __u32 *fwmark) { memset(parms, 0, sizeof(*parms)); parms->iph.version = 4; parms->iph.protocol = IPPROTO_IPIP; parms->iph.ihl = 5; *collect_md = false; if (!data) return; ip_tunnel_netlink_parms(data, parms); if (data[IFLA_IPTUN_COLLECT_METADATA]) *collect_md = true; if (data[IFLA_IPTUN_FWMARK]) *fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); } static int ipip_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); struct nlattr **data = params->data; struct nlattr **tb = params->tb; struct ip_tunnel_encap ipencap; struct ip_tunnel_parm_kern p; __u32 fwmark = 0; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { int err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; } ipip_netlink_parms(data, &p, &t->collect_md, &fwmark); return ip_tunnel_newlink(params->link_net ? : dev_net(dev), dev, tb, &p, fwmark); } static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_encap ipencap; struct ip_tunnel_parm_kern p; bool collect_md; __u32 fwmark = t->fwmark; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { int err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; } ipip_netlink_parms(data, &p, &collect_md, &fwmark); if (collect_md) return -EINVAL; if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) || (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr)) return -EINVAL; return ip_tunnel_changelink(dev, tb, &p, fwmark); } static size_t ipip_get_size(const struct net_device *dev) { return /* IFLA_IPTUN_LINK */ nla_total_size(4) + /* IFLA_IPTUN_LOCAL */ nla_total_size(4) + /* IFLA_IPTUN_REMOTE */ nla_total_size(4) + /* IFLA_IPTUN_TTL */ nla_total_size(1) + /* IFLA_IPTUN_TOS */ nla_total_size(1) + /* IFLA_IPTUN_PROTO */ nla_total_size(1) + /* IFLA_IPTUN_PMTUDISC */ nla_total_size(1) + /* IFLA_IPTUN_ENCAP_TYPE */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_FLAGS */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_SPORT */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_DPORT */ nla_total_size(2) + /* IFLA_IPTUN_COLLECT_METADATA */ nla_total_size(0) + /* IFLA_IPTUN_FWMARK */ nla_total_size(4) + 0; } static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct ip_tunnel_parm_kern *parm = &tunnel->parms; if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || nla_put_in_addr(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) || nla_put_in_addr(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) || nla_put_u8(skb, IFLA_IPTUN_TTL, parm->iph.ttl) || nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) || nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->iph.protocol) || nla_put_u8(skb, IFLA_IPTUN_PMTUDISC, !!(parm->iph.frag_off & htons(IP_DF))) || nla_put_u32(skb, IFLA_IPTUN_FWMARK, tunnel->fwmark)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, tunnel->encap.type) || nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, tunnel->encap.sport) || nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) || nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, tunnel->encap.flags)) goto nla_put_failure; if (tunnel->collect_md) if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_LINK] = { .type = NLA_U32 }, [IFLA_IPTUN_LOCAL] = { .type = NLA_U32 }, [IFLA_IPTUN_REMOTE] = { .type = NLA_U32 }, [IFLA_IPTUN_TTL] = { .type = NLA_U8 }, [IFLA_IPTUN_TOS] = { .type = NLA_U8 }, [IFLA_IPTUN_PROTO] = { .type = NLA_U8 }, [IFLA_IPTUN_PMTUDISC] = { .type = NLA_U8 }, [IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG }, [IFLA_IPTUN_FWMARK] = { .type = NLA_U32 }, }; static struct rtnl_link_ops ipip_link_ops __read_mostly = { .kind = "ipip", .maxtype = IFLA_IPTUN_MAX, .policy = ipip_policy, .priv_size = sizeof(struct ip_tunnel), .setup = ipip_tunnel_setup, .validate = ipip_tunnel_validate, .newlink = ipip_newlink, .changelink = ipip_changelink, .dellink = ip_tunnel_dellink, .get_size = ipip_get_size, .fill_info = ipip_fill_info, .get_link_net = ip_tunnel_get_link_net, }; static struct xfrm_tunnel ipip_handler __read_mostly = { .handler = ipip_rcv, .err_handler = ipip_err, .priority = 1, }; #if IS_ENABLED(CONFIG_MPLS) static struct xfrm_tunnel mplsip_handler __read_mostly = { .handler = mplsip_rcv, .err_handler = ipip_err, .priority = 1, }; #endif static int __net_init ipip_init_net(struct net *net) { return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0"); } static void __net_exit ipip_exit_rtnl(struct net *net, struct list_head *dev_to_kill) { ip_tunnel_delete_net(net, ipip_net_id, &ipip_link_ops, dev_to_kill); } static struct pernet_operations ipip_net_ops = { .init = ipip_init_net, .exit_rtnl = ipip_exit_rtnl, .id = &ipip_net_id, .size = sizeof(struct ip_tunnel_net), }; static int __init ipip_init(void) { int err; pr_info("ipip: IPv4 and MPLS over IPv4 tunneling driver\n"); err = register_pernet_device(&ipip_net_ops); if (err < 0) return err; err = xfrm4_tunnel_register(&ipip_handler, AF_INET); if (err < 0) { pr_info("%s: can't register tunnel\n", __func__); goto xfrm_tunnel_ipip_failed; } #if IS_ENABLED(CONFIG_MPLS) err = xfrm4_tunnel_register(&mplsip_handler, AF_MPLS); if (err < 0) { pr_info("%s: can't register tunnel\n", __func__); goto xfrm_tunnel_mplsip_failed; } #endif err = rtnl_link_register(&ipip_link_ops); if (err < 0) goto rtnl_link_failed; out: return err; rtnl_link_failed: #if IS_ENABLED(CONFIG_MPLS) xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS); xfrm_tunnel_mplsip_failed: #endif xfrm4_tunnel_deregister(&ipip_handler, AF_INET); xfrm_tunnel_ipip_failed: unregister_pernet_device(&ipip_net_ops); goto out; } static void __exit ipip_fini(void) { rtnl_link_unregister(&ipip_link_ops); if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET)) pr_info("%s: can't deregister tunnel\n", __func__); #if IS_ENABLED(CONFIG_MPLS) if (xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS)) pr_info("%s: can't deregister tunnel\n", __func__); #endif unregister_pernet_device(&ipip_net_ops); } module_init(ipip_init); module_exit(ipip_fini); MODULE_DESCRIPTION("IP/IP protocol decoder library"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("ipip"); MODULE_ALIAS_NETDEV("tunl0"); |
| 449 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_BIO_INTEGRITY_H #define _LINUX_BIO_INTEGRITY_H #include <linux/bio.h> enum bip_flags { BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */ BIP_DISK_NOCHECK = 1 << 2, /* disable disk integrity checking */ BIP_IP_CHECKSUM = 1 << 3, /* IP checksum */ BIP_COPY_USER = 1 << 4, /* Kernel bounce buffer in use */ BIP_CHECK_GUARD = 1 << 5, /* guard check */ BIP_CHECK_REFTAG = 1 << 6, /* reftag check */ BIP_CHECK_APPTAG = 1 << 7, /* apptag check */ BIP_MEMPOOL = 1 << 15, /* buffer backed by mempool */ }; struct bio_integrity_payload { struct bvec_iter bip_iter; unsigned short bip_vcnt; /* # of integrity bio_vecs */ unsigned short bip_max_vcnt; /* integrity bio_vec slots */ unsigned short bip_flags; /* control flags */ u16 app_tag; /* application tag value */ struct bio_vec *bip_vec; }; #define BIP_CLONE_FLAGS (BIP_MAPPED_INTEGRITY | BIP_IP_CHECKSUM | \ BIP_CHECK_GUARD | BIP_CHECK_REFTAG | BIP_CHECK_APPTAG) #ifdef CONFIG_BLK_DEV_INTEGRITY #define bip_for_each_vec(bvl, bip, iter) \ for_each_bvec(bvl, (bip)->bip_vec, iter, (bip)->bip_iter) #define bio_for_each_integrity_vec(_bvl, _bio, _iter) \ for_each_bio(_bio) \ bip_for_each_vec(_bvl, _bio->bi_integrity, _iter) static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) { if (bio->bi_opf & REQ_INTEGRITY) return bio->bi_integrity; return NULL; } static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) { struct bio_integrity_payload *bip = bio_integrity(bio); if (bip) return bip->bip_flags & flag; return false; } static inline sector_t bip_get_seed(struct bio_integrity_payload *bip) { return bip->bip_iter.bi_sector; } static inline void bip_set_seed(struct bio_integrity_payload *bip, sector_t seed) { bip->bip_iter.bi_sector = seed; } void bio_integrity_init(struct bio *bio, struct bio_integrity_payload *bip, struct bio_vec *bvecs, unsigned int nr_vecs); struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp, unsigned int nr); int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset); int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter); int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta); void bio_integrity_unmap_user(struct bio *bio); void bio_integrity_prep(struct bio *bio, unsigned int action); void bio_integrity_advance(struct bio *bio, unsigned int bytes_done); void bio_integrity_trim(struct bio *bio); int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask); #else /* CONFIG_BLK_DEV_INTEGRITY */ static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) { return NULL; } static inline int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) { return -EINVAL; } static inline int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta) { return -EINVAL; } static inline void bio_integrity_unmap_user(struct bio *bio) { } static inline void bio_integrity_prep(struct bio *bio, unsigned int action) { } static inline int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask) { return 0; } static inline void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) { } static inline void bio_integrity_trim(struct bio *bio) { } static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) { return false; } static inline struct bio_integrity_payload * bio_integrity_alloc(struct bio *bio, gfp_t gfp, unsigned int nr) { return ERR_PTR(-EINVAL); } static inline int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { return 0; } #endif /* CONFIG_BLK_DEV_INTEGRITY */ void bio_integrity_alloc_buf(struct bio *bio, bool zero_buffer); void bio_integrity_free_buf(struct bio_integrity_payload *bip); void bio_integrity_setup_default(struct bio *bio); unsigned int fs_bio_integrity_alloc(struct bio *bio); void fs_bio_integrity_free(struct bio *bio); void fs_bio_integrity_generate(struct bio *bio); int fs_bio_integrity_verify(struct bio *bio, sector_t sector, unsigned int size); #endif /* _LINUX_BIO_INTEGRITY_H */ |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 | /* SPDX-License-Identifier: GPL-2.0 */ /* * linux/include/linux/sunrpc/svc_xprt.h * * RPC server transport I/O */ #ifndef SUNRPC_SVC_XPRT_H #define SUNRPC_SVC_XPRT_H #include <linux/sunrpc/svc.h> struct module; struct svc_xprt_ops { struct svc_xprt *(*xpo_create)(struct svc_serv *, struct net *net, struct sockaddr *, int, int); struct svc_xprt *(*xpo_accept)(struct svc_xprt *); int (*xpo_has_wspace)(struct svc_xprt *); int (*xpo_recvfrom)(struct svc_rqst *); int (*xpo_sendto)(struct svc_rqst *); int (*xpo_result_payload)(struct svc_rqst *, unsigned int, unsigned int); void (*xpo_release_ctxt)(struct svc_xprt *xprt, void *ctxt); void (*xpo_detach)(struct svc_xprt *); void (*xpo_free)(struct svc_xprt *); void (*xpo_kill_temp_xprt)(struct svc_xprt *); void (*xpo_handshake)(struct svc_xprt *xprt); }; struct svc_xprt_class { const char *xcl_name; struct module *xcl_owner; const struct svc_xprt_ops *xcl_ops; struct list_head xcl_list; u32 xcl_max_payload; int xcl_ident; }; /* * This is embedded in an object that wants a callback before deleting * an xprt; intended for use by NFSv4.1, which needs to know when a * client's tcp connection (and hence possibly a backchannel) goes away. */ struct svc_xpt_user { struct list_head list; void (*callback)(struct svc_xpt_user *); }; struct svc_xprt { struct svc_xprt_class *xpt_class; const struct svc_xprt_ops *xpt_ops; struct kref xpt_ref; ktime_t xpt_qtime; struct list_head xpt_list; struct lwq_node xpt_ready; unsigned long xpt_flags; struct svc_serv *xpt_server; /* service for transport */ atomic_t xpt_reserved; /* space on outq that is rsvd */ atomic_t xpt_nr_rqsts; /* Number of requests */ struct mutex xpt_mutex; /* to serialize sending data */ spinlock_t xpt_lock; /* protects sk_deferred * and xpt_auth_cache */ void *xpt_auth_cache;/* auth cache */ struct list_head xpt_deferred; /* deferred requests that need * to be revisted */ struct sockaddr_storage xpt_local; /* local address */ size_t xpt_locallen; /* length of address */ struct sockaddr_storage xpt_remote; /* remote peer's address */ size_t xpt_remotelen; /* length of address */ char xpt_remotebuf[INET6_ADDRSTRLEN + 10]; struct list_head xpt_users; /* callbacks on free */ struct net *xpt_net; netns_tracker ns_tracker; const struct cred *xpt_cred; struct rpc_xprt *xpt_bc_xprt; /* NFSv4.1 backchannel */ struct rpc_xprt_switch *xpt_bc_xps; /* NFSv4.1 backchannel */ }; /* flag bits for xpt_flags */ enum { XPT_BUSY, /* enqueued/receiving */ XPT_CONN, /* conn pending */ XPT_CLOSE, /* dead or dying */ XPT_DATA, /* data pending */ XPT_TEMP, /* connected transport */ XPT_DEAD, /* transport closed */ XPT_CHNGBUF, /* need to change snd/rcv buf sizes */ XPT_DEFERRED, /* deferred request pending */ XPT_OLD, /* used for xprt aging mark+sweep */ XPT_LISTENER, /* listening endpoint */ XPT_CACHE_AUTH, /* cache auth info */ XPT_LOCAL, /* connection from loopback interface */ XPT_KILL_TEMP, /* call xpo_kill_temp_xprt before closing */ XPT_CONG_CTRL, /* has congestion control */ XPT_HANDSHAKE, /* xprt requests a handshake */ XPT_TLS_SESSION, /* transport-layer security established */ XPT_PEER_AUTH, /* peer has been authenticated */ XPT_PEER_VALID, /* peer has presented a filehandle that * it has access to. It is NOT counted * in ->sv_tmpcnt. */ XPT_RPCB_UNREG, /* transport that needs unregistering * with rpcbind (TCP, UDP) on destroy */ }; /* * Maximum number of "tmp" connections - those without XPT_PEER_VALID - * permitted on any service. */ #define XPT_MAX_TMP_CONN 64 static inline void svc_xprt_set_valid(struct svc_xprt *xpt) { if (test_bit(XPT_TEMP, &xpt->xpt_flags) && !test_and_set_bit(XPT_PEER_VALID, &xpt->xpt_flags)) { struct svc_serv *serv = xpt->xpt_server; spin_lock(&serv->sv_lock); serv->sv_tmpcnt -= 1; spin_unlock(&serv->sv_lock); } } static inline void unregister_xpt_user(struct svc_xprt *xpt, struct svc_xpt_user *u) { spin_lock(&xpt->xpt_lock); list_del_init(&u->list); spin_unlock(&xpt->xpt_lock); } static inline int register_xpt_user(struct svc_xprt *xpt, struct svc_xpt_user *u) { spin_lock(&xpt->xpt_lock); if (test_bit(XPT_CLOSE, &xpt->xpt_flags)) { /* * The connection is about to be deleted soon (or, * worse, may already be deleted--in which case we've * already notified the xpt_users). */ spin_unlock(&xpt->xpt_lock); return -ENOTCONN; } list_add(&u->list, &xpt->xpt_users); spin_unlock(&xpt->xpt_lock); return 0; } static inline bool svc_xprt_is_dead(const struct svc_xprt *xprt) { return (test_bit(XPT_DEAD, &xprt->xpt_flags) != 0) || (test_bit(XPT_CLOSE, &xprt->xpt_flags) != 0); } int svc_reg_xprt_class(struct svc_xprt_class *); void svc_unreg_xprt_class(struct svc_xprt_class *); void svc_xprt_init(struct net *, struct svc_xprt_class *, struct svc_xprt *, struct svc_serv *); int svc_xprt_create_from_sa(struct svc_serv *serv, const char *xprt_name, struct net *net, struct sockaddr *sap, int flags, const struct cred *cred); int svc_xprt_create(struct svc_serv *serv, const char *xprt_name, struct net *net, const int family, const unsigned short port, int flags, const struct cred *cred); void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net, bool unregister); void svc_xprt_received(struct svc_xprt *xprt); void svc_xprt_enqueue(struct svc_xprt *xprt); void svc_xprt_put(struct svc_xprt *xprt); void svc_xprt_copy_addrs(struct svc_rqst *rqstp, struct svc_xprt *xprt); void svc_xprt_close(struct svc_xprt *xprt); int svc_port_is_privileged(struct sockaddr *sin); int svc_print_xprts(char *buf, int maxlen); struct svc_xprt *svc_find_listener(struct svc_serv *serv, const char *xcl_name, struct net *net, const struct sockaddr *sa); struct svc_xprt *svc_find_xprt(struct svc_serv *serv, const char *xcl_name, struct net *net, const sa_family_t af, const unsigned short port); int svc_xprt_names(struct svc_serv *serv, char *buf, const int buflen); void svc_add_new_perm_xprt(struct svc_serv *serv, struct svc_xprt *xprt); void svc_age_temp_xprts_now(struct svc_serv *, struct sockaddr *); void svc_xprt_deferred_close(struct svc_xprt *xprt); static inline void svc_xprt_get(struct svc_xprt *xprt) { kref_get(&xprt->xpt_ref); } static inline void svc_xprt_set_local(struct svc_xprt *xprt, const struct sockaddr *sa, const size_t salen) { memcpy(&xprt->xpt_local, sa, salen); xprt->xpt_locallen = salen; } static inline void svc_xprt_set_remote(struct svc_xprt *xprt, const struct sockaddr *sa, const size_t salen) { memcpy(&xprt->xpt_remote, sa, salen); xprt->xpt_remotelen = salen; snprintf(xprt->xpt_remotebuf, sizeof(xprt->xpt_remotebuf) - 1, "%pISpc", sa); } static inline unsigned short svc_addr_port(const struct sockaddr *sa) { const struct sockaddr_in *sin = (const struct sockaddr_in *)sa; const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)sa; switch (sa->sa_family) { case AF_INET: return ntohs(sin->sin_port); case AF_INET6: return ntohs(sin6->sin6_port); } return 0; } static inline size_t svc_addr_len(const struct sockaddr *sa) { switch (sa->sa_family) { case AF_INET: return sizeof(struct sockaddr_in); case AF_INET6: return sizeof(struct sockaddr_in6); } BUG(); } static inline unsigned short svc_xprt_local_port(const struct svc_xprt *xprt) { return svc_addr_port((const struct sockaddr *)&xprt->xpt_local); } static inline unsigned short svc_xprt_remote_port(const struct svc_xprt *xprt) { return svc_addr_port((const struct sockaddr *)&xprt->xpt_remote); } static inline char *__svc_print_addr(const struct sockaddr *addr, char *buf, const size_t len) { const struct sockaddr_in *sin = (const struct sockaddr_in *)addr; const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)addr; switch (addr->sa_family) { case AF_INET: snprintf(buf, len, "%pI4, port=%u", &sin->sin_addr, ntohs(sin->sin_port)); break; case AF_INET6: snprintf(buf, len, "%pI6, port=%u", &sin6->sin6_addr, ntohs(sin6->sin6_port)); break; default: snprintf(buf, len, "unknown address type: %d", addr->sa_family); break; } return buf; } #endif /* SUNRPC_SVC_XPRT_H */ |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/truncate.h * * Common inline functions needed for truncate support */ /* * Truncate blocks that were not used by write. We have to truncate the * pagecache as well so that corresponding buffers get properly unmapped. */ static inline void ext4_truncate_failed_write(struct inode *inode) { struct address_space *mapping = inode->i_mapping; /* * We don't need to call ext4_break_layouts() because the blocks we * are truncating were never visible to userspace. */ filemap_invalidate_lock(mapping); truncate_inode_pages(mapping, inode->i_size); ext4_truncate(inode); filemap_invalidate_unlock(mapping); } /* * Work out how many blocks we need to proceed with the next chunk of a * truncate transaction. */ static inline unsigned long ext4_blocks_for_truncate(struct inode *inode) { ext4_lblk_t needed; needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); /* Give ourselves just enough room to cope with inodes in which * i_blocks is corrupt: we've seen disk corruptions in the past * which resulted in random data in an inode which looked enough * like a regular file for ext4 to try to delete it. Things * will go a bit crazy if that happens, but at least we should * try not to panic the whole kernel. */ if (needed < 2) needed = 2; /* But we need to bound the transaction so we don't overflow the * journal. */ if (needed > EXT4_MAX_TRANS_DATA) needed = EXT4_MAX_TRANS_DATA; return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; } |
| 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 | // SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 /****************************************************************************** * * Module Name: exregion - ACPI default op_region (address space) handlers * * Copyright (C) 2000 - 2025, Intel Corp. * *****************************************************************************/ #include <acpi/acpi.h> #include "accommon.h" #include "acinterp.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exregion") /******************************************************************************* * * FUNCTION: acpi_ex_system_memory_space_handler * * PARAMETERS: function - Read or Write operation * address - Where in the space to read or write * bit_width - Field width in bits (8, 16, or 32) * value - Pointer to in or out value * handler_context - Pointer to Handler's context * region_context - Pointer to context specific to the * accessed region * * RETURN: Status * * DESCRIPTION: Handler for the System Memory address space (Op Region) * ******************************************************************************/ acpi_status acpi_ex_system_memory_space_handler(u32 function, acpi_physical_address address, u32 bit_width, u64 *value, void *handler_context, void *region_context) { acpi_status status = AE_OK; void *logical_addr_ptr = NULL; struct acpi_mem_space_context *mem_info = region_context; struct acpi_mem_mapping *mm = mem_info->cur_mm; u32 length; acpi_size map_length; #ifdef ACPI_MISALIGNMENT_NOT_SUPPORTED u32 remainder; #endif ACPI_FUNCTION_TRACE(ex_system_memory_space_handler); /* Validate and translate the bit width */ switch (bit_width) { case 8: length = 1; break; case 16: length = 2; break; case 32: length = 4; break; case 64: length = 8; break; default: ACPI_ERROR((AE_INFO, "Invalid SystemMemory width %u", bit_width)); return_ACPI_STATUS(AE_AML_OPERAND_VALUE); } #ifdef ACPI_MISALIGNMENT_NOT_SUPPORTED /* * Hardware does not support non-aligned data transfers, we must verify * the request. */ (void)acpi_ut_short_divide((u64) address, length, NULL, &remainder); if (remainder != 0) { return_ACPI_STATUS(AE_AML_ALIGNMENT); } #endif /* * Does the request fit into the cached memory mapping? * Is 1) Address below the current mapping? OR * 2) Address beyond the current mapping? */ if (!mm || (address < mm->physical_address) || ((u64) address + length > (u64) mm->physical_address + mm->length)) { /* * The request cannot be resolved by the current memory mapping. * * Look for an existing saved mapping covering the address range * at hand. If found, save it as the current one and carry out * the access. */ for (mm = mem_info->first_mm; mm; mm = mm->next_mm) { if (mm == mem_info->cur_mm) continue; if (address < mm->physical_address) continue; if ((u64) address + length > (u64) mm->physical_address + mm->length) continue; mem_info->cur_mm = mm; goto access; } /* Create a new mappings list entry */ mm = ACPI_ALLOCATE_ZEROED(sizeof(*mm)); if (!mm) { ACPI_ERROR((AE_INFO, "Unable to save memory mapping at 0x%8.8X%8.8X, size %u", ACPI_FORMAT_UINT64(address), length)); return_ACPI_STATUS(AE_NO_MEMORY); } /* * October 2009: Attempt to map from the requested address to the * end of the region. However, we will never map more than one * page, nor will we cross a page boundary. */ map_length = (acpi_size) ((mem_info->address + mem_info->length) - address); if (map_length > ACPI_DEFAULT_PAGE_SIZE) map_length = ACPI_DEFAULT_PAGE_SIZE; /* Create a new mapping starting at the address given */ logical_addr_ptr = acpi_os_map_memory(address, map_length); if (!logical_addr_ptr) { ACPI_ERROR((AE_INFO, "Could not map memory at 0x%8.8X%8.8X, size %u", ACPI_FORMAT_UINT64(address), (u32)map_length)); ACPI_FREE(mm); return_ACPI_STATUS(AE_NO_MEMORY); } /* Save the physical address and mapping size */ mm->logical_address = logical_addr_ptr; mm->physical_address = address; mm->length = map_length; /* * Add the new entry to the mappigs list and save it as the * current mapping. */ mm->next_mm = mem_info->first_mm; mem_info->first_mm = mm; mem_info->cur_mm = mm; } access: /* * Generate a logical pointer corresponding to the address we want to * access */ logical_addr_ptr = mm->logical_address + ((u64) address - (u64) mm->physical_address); ACPI_DEBUG_PRINT((ACPI_DB_INFO, "System-Memory (width %u) R/W %u Address=%8.8X%8.8X\n", bit_width, function, ACPI_FORMAT_UINT64(address))); /* * Perform the memory read or write * * Note: For machines that do not support non-aligned transfers, the target * address was checked for alignment above. We do not attempt to break the * transfer up into smaller (byte-size) chunks because the AML specifically * asked for a transfer width that the hardware may require. */ switch (function) { case ACPI_READ: *value = 0; switch (bit_width) { case 8: *value = (u64)ACPI_GET8(logical_addr_ptr); break; case 16: *value = (u64)ACPI_GET16(logical_addr_ptr); break; case 32: *value = (u64)ACPI_GET32(logical_addr_ptr); break; case 64: *value = (u64)ACPI_GET64(logical_addr_ptr); break; default: /* bit_width was already validated */ break; } break; case ACPI_WRITE: switch (bit_width) { case 8: ACPI_SET8(logical_addr_ptr, *value); break; case 16: ACPI_SET16(logical_addr_ptr, *value); break; case 32: ACPI_SET32(logical_addr_ptr, *value); break; case 64: ACPI_SET64(logical_addr_ptr, *value); break; default: /* bit_width was already validated */ break; } break; default: status = AE_BAD_PARAMETER; break; } return_ACPI_STATUS(status); } /******************************************************************************* * * FUNCTION: acpi_ex_system_io_space_handler * * PARAMETERS: function - Read or Write operation * address - Where in the space to read or write * bit_width - Field width in bits (8, 16, or 32) * value - Pointer to in or out value * handler_context - Pointer to Handler's context * region_context - Pointer to context specific to the * accessed region * * RETURN: Status * * DESCRIPTION: Handler for the System IO address space (Op Region) * ******************************************************************************/ acpi_status acpi_ex_system_io_space_handler(u32 function, acpi_physical_address address, u32 bit_width, u64 *value, void *handler_context, void *region_context) { acpi_status status = AE_OK; u32 value32; ACPI_FUNCTION_TRACE(ex_system_io_space_handler); ACPI_DEBUG_PRINT((ACPI_DB_INFO, "System-IO (width %u) R/W %u Address=%8.8X%8.8X\n", bit_width, function, ACPI_FORMAT_UINT64(address))); /* Decode the function parameter */ switch (function) { case ACPI_READ: status = acpi_hw_read_port((acpi_io_address)address, &value32, bit_width); *value = value32; break; case ACPI_WRITE: status = acpi_hw_write_port((acpi_io_address)address, (u32)*value, bit_width); break; default: status = AE_BAD_PARAMETER; break; } return_ACPI_STATUS(status); } #ifdef ACPI_PCI_CONFIGURED /******************************************************************************* * * FUNCTION: acpi_ex_pci_config_space_handler * * PARAMETERS: function - Read or Write operation * address - Where in the space to read or write * bit_width - Field width in bits (8, 16, or 32) * value - Pointer to in or out value * handler_context - Pointer to Handler's context * region_context - Pointer to context specific to the * accessed region * * RETURN: Status * * DESCRIPTION: Handler for the PCI Config address space (Op Region) * ******************************************************************************/ acpi_status acpi_ex_pci_config_space_handler(u32 function, acpi_physical_address address, u32 bit_width, u64 *value, void *handler_context, void *region_context) { acpi_status status = AE_OK; struct acpi_pci_id *pci_id; u16 pci_register; ACPI_FUNCTION_TRACE(ex_pci_config_space_handler); /* * The arguments to acpi_os(Read|Write)pci_configuration are: * * pci_segment is the PCI bus segment range 0-31 * pci_bus is the PCI bus number range 0-255 * pci_device is the PCI device number range 0-31 * pci_function is the PCI device function number * pci_register is the Config space register range 0-255 bytes * * value - input value for write, output address for read * */ pci_id = (struct acpi_pci_id *)region_context; pci_register = (u16) (u32) address; ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Pci-Config %u (%u) Seg(%04x) Bus(%04x) " "Dev(%04x) Func(%04x) Reg(%04x)\n", function, bit_width, pci_id->segment, pci_id->bus, pci_id->device, pci_id->function, pci_register)); switch (function) { case ACPI_READ: *value = 0; status = acpi_os_read_pci_configuration(pci_id, pci_register, value, bit_width); break; case ACPI_WRITE: status = acpi_os_write_pci_configuration(pci_id, pci_register, *value, bit_width); break; default: status = AE_BAD_PARAMETER; break; } return_ACPI_STATUS(status); } #endif /******************************************************************************* * * FUNCTION: acpi_ex_cmos_space_handler * * PARAMETERS: function - Read or Write operation * address - Where in the space to read or write * bit_width - Field width in bits (8, 16, or 32) * value - Pointer to in or out value * handler_context - Pointer to Handler's context * region_context - Pointer to context specific to the * accessed region * * RETURN: Status * * DESCRIPTION: Handler for the CMOS address space (Op Region) * ******************************************************************************/ acpi_status acpi_ex_cmos_space_handler(u32 function, acpi_physical_address address, u32 bit_width, u64 *value, void *handler_context, void *region_context) { acpi_status status = AE_OK; ACPI_FUNCTION_TRACE(ex_cmos_space_handler); return_ACPI_STATUS(status); } #ifdef ACPI_PCI_CONFIGURED /******************************************************************************* * * FUNCTION: acpi_ex_pci_bar_space_handler * * PARAMETERS: function - Read or Write operation * address - Where in the space to read or write * bit_width - Field width in bits (8, 16, or 32) * value - Pointer to in or out value * handler_context - Pointer to Handler's context * region_context - Pointer to context specific to the * accessed region * * RETURN: Status * * DESCRIPTION: Handler for the PCI bar_target address space (Op Region) * ******************************************************************************/ acpi_status acpi_ex_pci_bar_space_handler(u32 function, acpi_physical_address address, u32 bit_width, u64 *value, void *handler_context, void *region_context) { acpi_status status = AE_OK; ACPI_FUNCTION_TRACE(ex_pci_bar_space_handler); return_ACPI_STATUS(status); } #endif /******************************************************************************* * * FUNCTION: acpi_ex_data_table_space_handler * * PARAMETERS: function - Read or Write operation * address - Where in the space to read or write * bit_width - Field width in bits (8, 16, or 32) * value - Pointer to in or out value * handler_context - Pointer to Handler's context * region_context - Pointer to context specific to the * accessed region * * RETURN: Status * * DESCRIPTION: Handler for the Data Table address space (Op Region) * ******************************************************************************/ acpi_status acpi_ex_data_table_space_handler(u32 function, acpi_physical_address address, u32 bit_width, u64 *value, void *handler_context, void *region_context) { struct acpi_data_table_mapping *mapping; char *pointer; ACPI_FUNCTION_TRACE(ex_data_table_space_handler); mapping = (struct acpi_data_table_mapping *) region_context; pointer = ACPI_CAST_PTR(char, mapping->pointer) + (address - ACPI_PTR_TO_PHYSADDR(mapping->pointer)); /* * Perform the memory read or write. The bit_width was already * validated. */ switch (function) { case ACPI_READ: memcpy(ACPI_CAST_PTR(char, value), pointer, ACPI_DIV_8(bit_width)); break; case ACPI_WRITE: memcpy(pointer, ACPI_CAST_PTR(char, value), ACPI_DIV_8(bit_width)); break; default: return_ACPI_STATUS(AE_BAD_PARAMETER); } return_ACPI_STATUS(AE_OK); } |
| 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 | /* * Copyright (c) 1982, 1986 Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * Robert Elz at The University of Melbourne. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _LINUX_QUOTA_ #define _LINUX_QUOTA_ #include <linux/list.h> #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/spinlock.h> #include <linux/wait.h> #include <linux/percpu_counter.h> #include <linux/dqblk_xfs.h> #include <linux/dqblk_v1.h> #include <linux/dqblk_v2.h> #include <linux/atomic.h> #include <linux/uidgid.h> #include <linux/projid.h> #include <uapi/linux/quota.h> #undef USRQUOTA #undef GRPQUOTA #undef PRJQUOTA enum quota_type { USRQUOTA = 0, /* element used for user quotas */ GRPQUOTA = 1, /* element used for group quotas */ PRJQUOTA = 2, /* element used for project quotas */ }; /* Masks for quota types when used as a bitmask */ #define QTYPE_MASK_USR (1 << USRQUOTA) #define QTYPE_MASK_GRP (1 << GRPQUOTA) #define QTYPE_MASK_PRJ (1 << PRJQUOTA) typedef __kernel_uid32_t qid_t; /* Type in which we store ids in memory */ typedef long long qsize_t; /* Type in which we store sizes */ struct kqid { /* Type in which we store the quota identifier */ union { kuid_t uid; kgid_t gid; kprojid_t projid; }; enum quota_type type; /* USRQUOTA (uid) or GRPQUOTA (gid) or PRJQUOTA (projid) */ }; extern bool qid_eq(struct kqid left, struct kqid right); extern bool qid_lt(struct kqid left, struct kqid right); extern qid_t from_kqid(struct user_namespace *to, struct kqid qid); extern qid_t from_kqid_munged(struct user_namespace *to, struct kqid qid); extern bool qid_valid(struct kqid qid); /** * make_kqid - Map a user-namespace, type, qid tuple into a kqid. * @from: User namespace that the qid is in * @type: The type of quota * @qid: Quota identifier * * Maps a user-namespace, type qid tuple into a kernel internal * kqid, and returns that kqid. * * When there is no mapping defined for the user-namespace, type, * qid tuple an invalid kqid is returned. Callers are expected to * test for and handle invalid kqids being returned. * Invalid kqids may be tested for using qid_valid(). */ static inline struct kqid make_kqid(struct user_namespace *from, enum quota_type type, qid_t qid) { struct kqid kqid; kqid.type = type; switch (type) { case USRQUOTA: kqid.uid = make_kuid(from, qid); break; case GRPQUOTA: kqid.gid = make_kgid(from, qid); break; case PRJQUOTA: kqid.projid = make_kprojid(from, qid); break; default: BUG(); } return kqid; } /** * make_kqid_invalid - Explicitly make an invalid kqid * @type: The type of quota identifier * * Returns an invalid kqid with the specified type. */ static inline struct kqid make_kqid_invalid(enum quota_type type) { struct kqid kqid; kqid.type = type; switch (type) { case USRQUOTA: kqid.uid = INVALID_UID; break; case GRPQUOTA: kqid.gid = INVALID_GID; break; case PRJQUOTA: kqid.projid = INVALID_PROJID; break; default: BUG(); } return kqid; } /** * make_kqid_uid - Make a kqid from a kuid * @uid: The kuid to make the quota identifier from */ static inline struct kqid make_kqid_uid(kuid_t uid) { struct kqid kqid; kqid.type = USRQUOTA; kqid.uid = uid; return kqid; } /** * make_kqid_gid - Make a kqid from a kgid * @gid: The kgid to make the quota identifier from */ static inline struct kqid make_kqid_gid(kgid_t gid) { struct kqid kqid; kqid.type = GRPQUOTA; kqid.gid = gid; return kqid; } /** * make_kqid_projid - Make a kqid from a projid * @projid: The kprojid to make the quota identifier from */ static inline struct kqid make_kqid_projid(kprojid_t projid) { struct kqid kqid; kqid.type = PRJQUOTA; kqid.projid = projid; return kqid; } /** * qid_has_mapping - Report if a qid maps into a user namespace. * @ns: The user namespace to see if a value maps into. * @qid: The kernel internal quota identifier to test. */ static inline bool qid_has_mapping(struct user_namespace *ns, struct kqid qid) { return from_kqid(ns, qid) != (qid_t) -1; } extern spinlock_t dq_data_lock; /* Maximal numbers of writes for quota operation (insert/delete/update) * (over VFS all formats) */ #define DQUOT_INIT_ALLOC max(V1_INIT_ALLOC, V2_INIT_ALLOC) #define DQUOT_INIT_REWRITE max(V1_INIT_REWRITE, V2_INIT_REWRITE) #define DQUOT_DEL_ALLOC max(V1_DEL_ALLOC, V2_DEL_ALLOC) #define DQUOT_DEL_REWRITE max(V1_DEL_REWRITE, V2_DEL_REWRITE) /* * Data for one user/group kept in memory */ struct mem_dqblk { qsize_t dqb_bhardlimit; /* absolute limit on disk blks alloc */ qsize_t dqb_bsoftlimit; /* preferred limit on disk blks */ qsize_t dqb_curspace; /* current used space */ qsize_t dqb_rsvspace; /* current reserved space for delalloc*/ qsize_t dqb_ihardlimit; /* absolute limit on allocated inodes */ qsize_t dqb_isoftlimit; /* preferred inode limit */ qsize_t dqb_curinodes; /* current # allocated inodes */ time64_t dqb_btime; /* time limit for excessive disk use */ time64_t dqb_itime; /* time limit for excessive inode use */ }; /* * Data for one quotafile kept in memory */ struct quota_format_type; struct mem_dqinfo { struct quota_format_type *dqi_format; int dqi_fmt_id; /* Id of the dqi_format - used when turning * quotas on after remount RW */ struct list_head dqi_dirty_list; /* List of dirty dquots [dq_list_lock] */ unsigned long dqi_flags; /* DFQ_ flags [dq_data_lock] */ unsigned int dqi_bgrace; /* Space grace time [dq_data_lock] */ unsigned int dqi_igrace; /* Inode grace time [dq_data_lock] */ qsize_t dqi_max_spc_limit; /* Maximum space limit [static] */ qsize_t dqi_max_ino_limit; /* Maximum inode limit [static] */ void *dqi_priv; }; struct super_block; /* Mask for flags passed to userspace */ #define DQF_GETINFO_MASK (DQF_ROOT_SQUASH | DQF_SYS_FILE) /* Mask for flags modifiable from userspace */ #define DQF_SETINFO_MASK DQF_ROOT_SQUASH enum { DQF_INFO_DIRTY_B = DQF_PRIVATE, }; #define DQF_INFO_DIRTY (1 << DQF_INFO_DIRTY_B) /* Is info dirty? */ extern void mark_info_dirty(struct super_block *sb, int type); static inline int info_dirty(struct mem_dqinfo *info) { return test_bit(DQF_INFO_DIRTY_B, &info->dqi_flags); } enum { DQST_LOOKUPS, DQST_DROPS, DQST_READS, DQST_WRITES, DQST_CACHE_HITS, DQST_ALLOC_DQUOTS, DQST_FREE_DQUOTS, DQST_SYNCS, _DQST_DQSTAT_LAST }; struct dqstats { unsigned long stat[_DQST_DQSTAT_LAST]; struct percpu_counter counter[_DQST_DQSTAT_LAST]; }; extern struct dqstats dqstats; static inline void dqstats_inc(unsigned int type) { percpu_counter_inc(&dqstats.counter[type]); } static inline void dqstats_dec(unsigned int type) { percpu_counter_dec(&dqstats.counter[type]); } #define DQ_MOD_B 0 /* dquot modified since read */ #define DQ_BLKS_B 1 /* uid/gid has been warned about blk limit */ #define DQ_INODES_B 2 /* uid/gid has been warned about inode limit */ #define DQ_FAKE_B 3 /* no limits only usage */ #define DQ_READ_B 4 /* dquot was read into memory */ #define DQ_ACTIVE_B 5 /* dquot is active (dquot_release not called) */ #define DQ_RELEASING_B 6 /* dquot is in releasing_dquots list waiting * to be cleaned up */ #define DQ_LASTSET_B 7 /* Following 6 bits (see QIF_) are reserved\ * for the mask of entries set via SETQUOTA\ * quotactl. They are set under dq_data_lock\ * and the quota format handling dquot can\ * clear them when it sees fit. */ struct dquot { struct hlist_node dq_hash; /* Hash list in memory [dq_list_lock] */ struct list_head dq_inuse; /* List of all quotas [dq_list_lock] */ struct list_head dq_free; /* Free list element [dq_list_lock] */ struct list_head dq_dirty; /* List of dirty dquots [dq_list_lock] */ struct mutex dq_lock; /* dquot IO lock */ spinlock_t dq_dqb_lock; /* Lock protecting dq_dqb changes */ atomic_t dq_count; /* Use count */ struct super_block *dq_sb; /* superblock this applies to */ struct kqid dq_id; /* ID this applies to (uid, gid, projid) */ loff_t dq_off; /* Offset of dquot on disk [dq_lock, stable once set] */ unsigned long dq_flags; /* See DQ_* */ struct mem_dqblk dq_dqb; /* Diskquota usage [dq_dqb_lock] */ }; /* Operations which must be implemented by each quota format */ struct quota_format_ops { int (*check_quota_file)(struct super_block *sb, int type); /* Detect whether file is in our format */ int (*read_file_info)(struct super_block *sb, int type); /* Read main info about file - called on quotaon() */ int (*write_file_info)(struct super_block *sb, int type); /* Write main info about file */ int (*free_file_info)(struct super_block *sb, int type); /* Called on quotaoff() */ int (*read_dqblk)(struct dquot *dquot); /* Read structure for one user */ int (*commit_dqblk)(struct dquot *dquot); /* Write structure for one user */ int (*release_dqblk)(struct dquot *dquot); /* Called when last reference to dquot is being dropped */ int (*get_next_id)(struct super_block *sb, struct kqid *qid); /* Get next ID with existing structure in the quota file */ }; /* Operations working with dquots */ struct dquot_operations { int (*write_dquot) (struct dquot *); /* Ordinary dquot write */ struct dquot *(*alloc_dquot)(struct super_block *, int); /* Allocate memory for new dquot */ void (*destroy_dquot)(struct dquot *); /* Free memory for dquot */ int (*acquire_dquot) (struct dquot *); /* Quota is going to be created on disk */ int (*release_dquot) (struct dquot *); /* Quota is going to be deleted from disk */ int (*mark_dirty) (struct dquot *); /* Dquot is marked dirty */ int (*write_info) (struct super_block *, int); /* Write of quota "superblock" */ /* get reserved quota for delayed alloc, value returned is managed by * quota code only */ qsize_t *(*get_reserved_space) (struct inode *); int (*get_projid) (struct inode *, kprojid_t *);/* Get project ID */ /* Get number of inodes that were charged for a given inode */ int (*get_inode_usage) (struct inode *, qsize_t *); /* Get next ID with active quota structure */ int (*get_next_id) (struct super_block *sb, struct kqid *qid); }; struct path; /* Structure for communicating via ->get_dqblk() & ->set_dqblk() */ struct qc_dqblk { int d_fieldmask; /* mask of fields to change in ->set_dqblk() */ u64 d_spc_hardlimit; /* absolute limit on used space */ u64 d_spc_softlimit; /* preferred limit on used space */ u64 d_ino_hardlimit; /* maximum # allocated inodes */ u64 d_ino_softlimit; /* preferred inode limit */ u64 d_space; /* Space owned by the user */ u64 d_ino_count; /* # inodes owned by the user */ s64 d_ino_timer; /* zero if within inode limits */ /* if not, we refuse service */ s64 d_spc_timer; /* similar to above; for space */ int d_ino_warns; /* # warnings issued wrt num inodes */ int d_spc_warns; /* # warnings issued wrt used space */ u64 d_rt_spc_hardlimit; /* absolute limit on realtime space */ u64 d_rt_spc_softlimit; /* preferred limit on RT space */ u64 d_rt_space; /* realtime space owned */ s64 d_rt_spc_timer; /* similar to above; for RT space */ int d_rt_spc_warns; /* # warnings issued wrt RT space */ }; /* * Field specifiers for ->set_dqblk() in struct qc_dqblk and also for * ->set_info() in struct qc_info */ #define QC_INO_SOFT (1<<0) #define QC_INO_HARD (1<<1) #define QC_SPC_SOFT (1<<2) #define QC_SPC_HARD (1<<3) #define QC_RT_SPC_SOFT (1<<4) #define QC_RT_SPC_HARD (1<<5) #define QC_LIMIT_MASK (QC_INO_SOFT | QC_INO_HARD | QC_SPC_SOFT | QC_SPC_HARD | \ QC_RT_SPC_SOFT | QC_RT_SPC_HARD) #define QC_SPC_TIMER (1<<6) #define QC_INO_TIMER (1<<7) #define QC_RT_SPC_TIMER (1<<8) #define QC_TIMER_MASK (QC_SPC_TIMER | QC_INO_TIMER | QC_RT_SPC_TIMER) #define QC_SPC_WARNS (1<<9) #define QC_INO_WARNS (1<<10) #define QC_RT_SPC_WARNS (1<<11) #define QC_WARNS_MASK (QC_SPC_WARNS | QC_INO_WARNS | QC_RT_SPC_WARNS) #define QC_SPACE (1<<12) #define QC_INO_COUNT (1<<13) #define QC_RT_SPACE (1<<14) #define QC_ACCT_MASK (QC_SPACE | QC_INO_COUNT | QC_RT_SPACE) #define QC_FLAGS (1<<15) #define QCI_SYSFILE (1 << 0) /* Quota file is hidden from userspace */ #define QCI_ROOT_SQUASH (1 << 1) /* Root squash turned on */ #define QCI_ACCT_ENABLED (1 << 2) /* Quota accounting enabled */ #define QCI_LIMITS_ENFORCED (1 << 3) /* Quota limits enforced */ /* Structures for communicating via ->get_state */ struct qc_type_state { unsigned int flags; /* Flags QCI_* */ unsigned int spc_timelimit; /* Time after which space softlimit is * enforced */ unsigned int ino_timelimit; /* Ditto for inode softlimit */ unsigned int rt_spc_timelimit; /* Ditto for real-time space */ unsigned int spc_warnlimit; /* Limit for number of space warnings */ unsigned int ino_warnlimit; /* Ditto for inodes */ unsigned int rt_spc_warnlimit; /* Ditto for real-time space */ unsigned long long ino; /* Inode number of quota file */ blkcnt_t blocks; /* Number of 512-byte blocks in the file */ blkcnt_t nextents; /* Number of extents in the file */ }; struct qc_state { unsigned int s_incoredqs; /* Number of dquots in core */ struct qc_type_state s_state[MAXQUOTAS]; /* Per quota type information */ }; /* Structure for communicating via ->set_info */ struct qc_info { int i_fieldmask; /* mask of fields to change in ->set_info() */ unsigned int i_flags; /* Flags QCI_* */ unsigned int i_spc_timelimit; /* Time after which space softlimit is * enforced */ unsigned int i_ino_timelimit; /* Ditto for inode softlimit */ unsigned int i_rt_spc_timelimit;/* Ditto for real-time space */ unsigned int i_spc_warnlimit; /* Limit for number of space warnings */ unsigned int i_ino_warnlimit; /* Limit for number of inode warnings */ unsigned int i_rt_spc_warnlimit; /* Ditto for real-time space */ }; /* Operations handling requests from userspace */ struct quotactl_ops { int (*quota_on)(struct super_block *, int, int, const struct path *); int (*quota_off)(struct super_block *, int); int (*quota_enable)(struct super_block *, unsigned int); int (*quota_disable)(struct super_block *, unsigned int); int (*quota_sync)(struct super_block *, int); int (*set_info)(struct super_block *, int, struct qc_info *); int (*get_dqblk)(struct super_block *, struct kqid, struct qc_dqblk *); int (*get_nextdqblk)(struct super_block *, struct kqid *, struct qc_dqblk *); int (*set_dqblk)(struct super_block *, struct kqid, struct qc_dqblk *); int (*get_state)(struct super_block *, struct qc_state *); int (*rm_xquota)(struct super_block *, unsigned int); }; struct quota_format_type { int qf_fmt_id; /* Quota format id */ const struct quota_format_ops *qf_ops; /* Operations of format */ struct module *qf_owner; /* Module implementing quota format */ struct quota_format_type *qf_next; }; /** * Quota state flags - they come in three flavors - for users, groups and projects. * * Actual typed flags layout: * USRQUOTA GRPQUOTA PRJQUOTA * DQUOT_USAGE_ENABLED 0x0001 0x0002 0x0004 * DQUOT_LIMITS_ENABLED 0x0008 0x0010 0x0020 * DQUOT_SUSPENDED 0x0040 0x0080 0x0100 * * Following bits are used for non-typed flags: * DQUOT_QUOTA_SYS_FILE 0x0200 * DQUOT_NEGATIVE_USAGE 0x0400 * DQUOT_NOLIST_DIRTY 0x0800 */ enum { _DQUOT_USAGE_ENABLED = 0, /* Track disk usage for users */ _DQUOT_LIMITS_ENABLED, /* Enforce quota limits for users */ _DQUOT_SUSPENDED, /* User diskquotas are off, but * we have necessary info in * memory to turn them on */ _DQUOT_STATE_FLAGS }; #define DQUOT_USAGE_ENABLED (1 << _DQUOT_USAGE_ENABLED * MAXQUOTAS) #define DQUOT_LIMITS_ENABLED (1 << _DQUOT_LIMITS_ENABLED * MAXQUOTAS) #define DQUOT_SUSPENDED (1 << _DQUOT_SUSPENDED * MAXQUOTAS) #define DQUOT_STATE_FLAGS (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED | \ DQUOT_SUSPENDED) /* Other quota flags */ #define DQUOT_STATE_LAST (_DQUOT_STATE_FLAGS * MAXQUOTAS) #define DQUOT_QUOTA_SYS_FILE (1 << DQUOT_STATE_LAST) /* Quota file is a special * system file and user cannot * touch it. Filesystem is * responsible for setting * S_NOQUOTA, S_NOATIME flags */ #define DQUOT_NEGATIVE_USAGE (1 << (DQUOT_STATE_LAST + 1)) /* Allow negative quota usage */ /* Do not track dirty dquots in a list */ #define DQUOT_NOLIST_DIRTY (1 << (DQUOT_STATE_LAST + 2)) static inline unsigned int dquot_state_flag(unsigned int flags, int type) { return flags << type; } static inline unsigned int dquot_generic_flag(unsigned int flags, int type) { return (flags >> type) & DQUOT_STATE_FLAGS; } /* Bitmap of quota types where flag is set in flags */ static __always_inline unsigned dquot_state_types(unsigned flags, unsigned flag) { BUILD_BUG_ON_NOT_POWER_OF_2(flag); return (flags / flag) & ((1 << MAXQUOTAS) - 1); } #ifdef CONFIG_QUOTA_NETLINK_INTERFACE extern void quota_send_warning(struct kqid qid, dev_t dev, const char warntype); #else static inline void quota_send_warning(struct kqid qid, dev_t dev, const char warntype) { return; } #endif /* CONFIG_QUOTA_NETLINK_INTERFACE */ struct quota_info { unsigned int flags; /* Flags for diskquotas on this device */ struct rw_semaphore dqio_sem; /* Lock quota file while I/O in progress */ struct inode *files[MAXQUOTAS]; /* inodes of quotafiles */ struct mem_dqinfo info[MAXQUOTAS]; /* Information for each quota type */ const struct quota_format_ops *ops[MAXQUOTAS]; /* Operations for each type */ }; void register_quota_format(struct quota_format_type *fmt); void unregister_quota_format(struct quota_format_type *fmt); struct quota_module_name { int qm_fmt_id; char *qm_mod_name; }; #define INIT_QUOTA_MODULE_NAMES {\ {QFMT_VFS_OLD, "quota_v1"},\ {QFMT_VFS_V0, "quota_v2"},\ {QFMT_VFS_V1, "quota_v2"},\ {0, NULL}} #endif /* _QUOTA_ */ |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_LWTUNNEL_H #define __NET_LWTUNNEL_H 1 #include <linux/lwtunnel.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/types.h> #include <net/route.h> #define LWTUNNEL_HASH_BITS 7 #define LWTUNNEL_HASH_SIZE (1 << LWTUNNEL_HASH_BITS) /* lw tunnel state flags */ #define LWTUNNEL_STATE_OUTPUT_REDIRECT BIT(0) #define LWTUNNEL_STATE_INPUT_REDIRECT BIT(1) #define LWTUNNEL_STATE_XMIT_REDIRECT BIT(2) /* LWTUNNEL_XMIT_CONTINUE should be distinguishable from dst_output return * values (NET_XMIT_xxx and NETDEV_TX_xxx in linux/netdevice.h) for safety. */ enum { LWTUNNEL_XMIT_DONE, LWTUNNEL_XMIT_CONTINUE = 0x100, }; struct lwtunnel_state { __u16 type; __u16 flags; __u16 headroom; atomic_t refcnt; int (*orig_output)(struct net *net, struct sock *sk, struct sk_buff *skb); int (*orig_input)(struct sk_buff *); struct rcu_head rcu; __u8 data[]; }; struct lwtunnel_encap_ops { int (*build_state)(struct net *net, struct nlattr *encap, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack); void (*destroy_state)(struct lwtunnel_state *lws); int (*output)(struct net *net, struct sock *sk, struct sk_buff *skb); int (*input)(struct sk_buff *skb); int (*fill_encap)(struct sk_buff *skb, struct lwtunnel_state *lwtstate); int (*get_encap_size)(struct lwtunnel_state *lwtstate); int (*cmp_encap)(struct lwtunnel_state *a, struct lwtunnel_state *b); int (*xmit)(struct sk_buff *skb); struct module *owner; }; #ifdef CONFIG_LWTUNNEL DECLARE_STATIC_KEY_FALSE(nf_hooks_lwtunnel_enabled); void lwtstate_free(struct lwtunnel_state *lws); static inline struct lwtunnel_state * lwtstate_get(struct lwtunnel_state *lws) { if (lws) atomic_inc(&lws->refcnt); return lws; } static inline void lwtstate_put(struct lwtunnel_state *lws) { if (!lws) return; if (atomic_dec_and_test(&lws->refcnt)) lwtstate_free(lws); } static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate) { if (lwtstate && (lwtstate->flags & LWTUNNEL_STATE_OUTPUT_REDIRECT)) return true; return false; } static inline bool lwtunnel_input_redirect(struct lwtunnel_state *lwtstate) { if (lwtstate && (lwtstate->flags & LWTUNNEL_STATE_INPUT_REDIRECT)) return true; return false; } static inline bool lwtunnel_xmit_redirect(struct lwtunnel_state *lwtstate) { if (lwtstate && (lwtstate->flags & LWTUNNEL_STATE_XMIT_REDIRECT)) return true; return false; } static inline unsigned int lwtunnel_headroom(struct lwtunnel_state *lwtstate, unsigned int mtu) { if ((lwtunnel_xmit_redirect(lwtstate) || lwtunnel_output_redirect(lwtstate)) && lwtstate->headroom < mtu) return lwtstate->headroom; return 0; } int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, unsigned int num); int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, unsigned int num); int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack); int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len, struct netlink_ext_ack *extack); int lwtunnel_build_state(struct net *net, u16 encap_type, struct nlattr *encap, unsigned int family, const void *cfg, struct lwtunnel_state **lws, struct netlink_ext_ack *extack); int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate, int encap_attr, int encap_type_attr); int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate); struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len); int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b); int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb); int lwtunnel_input(struct sk_buff *skb); int lwtunnel_xmit(struct sk_buff *skb); int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress); static inline void lwtunnel_set_redirect(struct dst_entry *dst) { if (lwtunnel_output_redirect(dst->lwtstate)) { dst->lwtstate->orig_output = READ_ONCE(dst->output); WRITE_ONCE(dst->output, lwtunnel_output); } if (lwtunnel_input_redirect(dst->lwtstate)) { dst->lwtstate->orig_input = READ_ONCE(dst->input); WRITE_ONCE(dst->input, lwtunnel_input); } } #else static inline void lwtstate_free(struct lwtunnel_state *lws) { } static inline struct lwtunnel_state * lwtstate_get(struct lwtunnel_state *lws) { return lws; } static inline void lwtstate_put(struct lwtunnel_state *lws) { } static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate) { return false; } static inline bool lwtunnel_input_redirect(struct lwtunnel_state *lwtstate) { return false; } static inline bool lwtunnel_xmit_redirect(struct lwtunnel_state *lwtstate) { return false; } static inline void lwtunnel_set_redirect(struct dst_entry *dst) { } static inline unsigned int lwtunnel_headroom(struct lwtunnel_state *lwtstate, unsigned int mtu) { return 0; } static inline int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, unsigned int num) { return -EOPNOTSUPP; } static inline int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, unsigned int num) { return -EOPNOTSUPP; } static inline int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack) { NL_SET_ERR_MSG(extack, "CONFIG_LWTUNNEL is not enabled in this kernel"); return -EOPNOTSUPP; } static inline int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len, struct netlink_ext_ack *extack) { /* return 0 since we are not walking attr looking for * RTA_ENCAP_TYPE attribute on nexthops. */ return 0; } static inline int lwtunnel_build_state(struct net *net, u16 encap_type, struct nlattr *encap, unsigned int family, const void *cfg, struct lwtunnel_state **lws, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } static inline int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate, int encap_attr, int encap_type_attr) { return 0; } static inline int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate) { return 0; } static inline struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len) { return NULL; } static inline int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) { return 0; } static inline int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb) { return -EOPNOTSUPP; } static inline int lwtunnel_input(struct sk_buff *skb) { return -EOPNOTSUPP; } static inline int lwtunnel_xmit(struct sk_buff *skb) { return -EOPNOTSUPP; } #endif /* CONFIG_LWTUNNEL */ #define MODULE_ALIAS_RTNL_LWT(encap_type) MODULE_ALIAS("rtnl-lwt-" __stringify(encap_type)) #endif /* __NET_LWTUNNEL_H */ |
| 6 6 3 1 1 3 2 2 2 2 440 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 | // SPDX-License-Identifier: GPL-2.0-or-later /* * SR-IPv6 implementation -- HMAC functions * * Author: * David Lebrun <david.lebrun@uclouvain.be> */ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/icmpv6.h> #include <linux/mroute6.h> #include <linux/rhashtable.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/transp_v6.h> #include <net/rawv6.h> #include <net/ndisc.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/xfrm.h> #include <crypto/sha1.h> #include <crypto/sha2.h> #include <crypto/utils.h> #include <net/seg6.h> #include <net/genetlink.h> #include <net/seg6_hmac.h> #include <linux/random.h> struct hmac_storage { local_lock_t bh_lock; char hmac_ring[SEG6_HMAC_RING_SIZE]; }; static DEFINE_PER_CPU(struct hmac_storage, hmac_storage) = { .bh_lock = INIT_LOCAL_LOCK(bh_lock), }; static int seg6_hmac_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) { const struct seg6_hmac_info *hinfo = obj; return (hinfo->hmackeyid != *(__u32 *)arg->key); } static inline void seg6_hinfo_release(struct seg6_hmac_info *hinfo) { kfree_rcu(hinfo, rcu); } static void seg6_free_hi(void *ptr, void *arg) { struct seg6_hmac_info *hinfo = (struct seg6_hmac_info *)ptr; if (hinfo) seg6_hinfo_release(hinfo); } static const struct rhashtable_params rht_params = { .head_offset = offsetof(struct seg6_hmac_info, node), .key_offset = offsetof(struct seg6_hmac_info, hmackeyid), .key_len = sizeof(u32), .automatic_shrinking = true, .obj_cmpfn = seg6_hmac_cmpfn, }; static struct sr6_tlv_hmac *seg6_get_tlv_hmac(struct ipv6_sr_hdr *srh) { struct sr6_tlv_hmac *tlv; if (srh->hdrlen < (srh->first_segment + 1) * 2 + 5) return NULL; if (!sr_has_hmac(srh)) return NULL; tlv = (struct sr6_tlv_hmac *) ((char *)srh + ((srh->hdrlen + 1) << 3) - 40); if (tlv->tlvhdr.type != SR6_TLV_HMAC || tlv->tlvhdr.len != 38) return NULL; return tlv; } int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr, struct in6_addr *saddr, u8 *output) { __be32 hmackeyid = cpu_to_be32(hinfo->hmackeyid); int plen, i, ret = 0; char *ring, *off; /* saddr(16) + first_seg(1) + flags(1) + keyid(4) + seglist(16n) */ plen = 16 + 1 + 1 + 4 + (hdr->first_segment + 1) * 16; /* this limit allows for 14 segments */ if (plen >= SEG6_HMAC_RING_SIZE) return -EMSGSIZE; /* Let's build the HMAC text on the ring buffer. The text is composed * as follows, in order: * * 1. Source IPv6 address (128 bits) * 2. first_segment value (8 bits) * 3. Flags (8 bits) * 4. HMAC Key ID (32 bits) * 5. All segments in the segments list (n * 128 bits) */ local_bh_disable(); local_lock_nested_bh(&hmac_storage.bh_lock); ring = this_cpu_ptr(hmac_storage.hmac_ring); off = ring; /* source address */ memcpy(off, saddr, 16); off += 16; /* first_segment value */ *off++ = hdr->first_segment; /* flags */ *off++ = hdr->flags; /* HMAC Key ID */ memcpy(off, &hmackeyid, 4); off += 4; /* all segments in the list */ for (i = 0; i < hdr->first_segment + 1; i++) { memcpy(off, hdr->segments + i, 16); off += 16; } switch (hinfo->alg_id) { case SEG6_HMAC_ALGO_SHA1: hmac_sha1(&hinfo->key.sha1, ring, plen, output); static_assert(SEG6_HMAC_FIELD_LEN > SHA1_DIGEST_SIZE); memset(&output[SHA1_DIGEST_SIZE], 0, SEG6_HMAC_FIELD_LEN - SHA1_DIGEST_SIZE); break; case SEG6_HMAC_ALGO_SHA256: hmac_sha256(&hinfo->key.sha256, ring, plen, output); static_assert(SEG6_HMAC_FIELD_LEN == SHA256_DIGEST_SIZE); break; default: WARN_ON_ONCE(1); ret = -EINVAL; break; } local_unlock_nested_bh(&hmac_storage.bh_lock); local_bh_enable(); return ret; } EXPORT_SYMBOL(seg6_hmac_compute); /* checks if an incoming SR-enabled packet's HMAC status matches * the incoming policy. * * called with rcu_read_lock() */ bool seg6_hmac_validate_skb(struct sk_buff *skb) { u8 hmac_output[SEG6_HMAC_FIELD_LEN]; struct net *net = dev_net(skb->dev); struct seg6_hmac_info *hinfo; struct sr6_tlv_hmac *tlv; struct ipv6_sr_hdr *srh; struct inet6_dev *idev; int require_hmac; idev = __in6_dev_get(skb->dev); if (!idev) return false; srh = (struct ipv6_sr_hdr *)skb_transport_header(skb); tlv = seg6_get_tlv_hmac(srh); require_hmac = READ_ONCE(idev->cnf.seg6_require_hmac); /* mandatory check but no tlv */ if (require_hmac > 0 && !tlv) return false; /* no check */ if (require_hmac < 0) return true; /* check only if present */ if (require_hmac == 0 && !tlv) return true; /* now, seg6_require_hmac >= 0 && tlv */ hinfo = seg6_hmac_info_lookup(net, be32_to_cpu(tlv->hmackeyid)); if (!hinfo) return false; if (seg6_hmac_compute(hinfo, srh, &ipv6_hdr(skb)->saddr, hmac_output)) return false; if (crypto_memneq(hmac_output, tlv->hmac, SEG6_HMAC_FIELD_LEN)) return false; return true; } EXPORT_SYMBOL(seg6_hmac_validate_skb); /* called with rcu_read_lock() */ struct seg6_hmac_info *seg6_hmac_info_lookup(struct net *net, u32 key) { struct seg6_pernet_data *sdata = seg6_pernet(net); struct seg6_hmac_info *hinfo; hinfo = rhashtable_lookup_fast(&sdata->hmac_infos, &key, rht_params); return hinfo; } EXPORT_SYMBOL(seg6_hmac_info_lookup); int seg6_hmac_info_add(struct net *net, u32 key, struct seg6_hmac_info *hinfo) { struct seg6_pernet_data *sdata = seg6_pernet(net); int err; switch (hinfo->alg_id) { case SEG6_HMAC_ALGO_SHA1: hmac_sha1_preparekey(&hinfo->key.sha1, hinfo->secret, hinfo->slen); break; case SEG6_HMAC_ALGO_SHA256: hmac_sha256_preparekey(&hinfo->key.sha256, hinfo->secret, hinfo->slen); break; default: return -EINVAL; } err = rhashtable_lookup_insert_fast(&sdata->hmac_infos, &hinfo->node, rht_params); return err; } EXPORT_SYMBOL(seg6_hmac_info_add); int seg6_hmac_info_del(struct net *net, u32 key) { struct seg6_pernet_data *sdata = seg6_pernet(net); struct seg6_hmac_info *hinfo; int err = -ENOENT; hinfo = rhashtable_lookup_fast(&sdata->hmac_infos, &key, rht_params); if (!hinfo) goto out; err = rhashtable_remove_fast(&sdata->hmac_infos, &hinfo->node, rht_params); if (err) goto out; seg6_hinfo_release(hinfo); out: return err; } EXPORT_SYMBOL(seg6_hmac_info_del); int seg6_push_hmac(struct net *net, struct in6_addr *saddr, struct ipv6_sr_hdr *srh) { struct seg6_hmac_info *hinfo; struct sr6_tlv_hmac *tlv; int err = -ENOENT; tlv = seg6_get_tlv_hmac(srh); if (!tlv) return -EINVAL; rcu_read_lock(); hinfo = seg6_hmac_info_lookup(net, be32_to_cpu(tlv->hmackeyid)); if (!hinfo) goto out; memset(tlv->hmac, 0, SEG6_HMAC_FIELD_LEN); err = seg6_hmac_compute(hinfo, srh, saddr, tlv->hmac); out: rcu_read_unlock(); return err; } EXPORT_SYMBOL(seg6_push_hmac); int __net_init seg6_hmac_net_init(struct net *net) { struct seg6_pernet_data *sdata = seg6_pernet(net); return rhashtable_init(&sdata->hmac_infos, &rht_params); } void __net_exit seg6_hmac_net_exit(struct net *net) { struct seg6_pernet_data *sdata = seg6_pernet(net); rhashtable_free_and_destroy(&sdata->hmac_infos, seg6_free_hi, NULL); } EXPORT_SYMBOL(seg6_hmac_net_exit); |
| 5 5 4 5 5 5 4 2 1 2 2 2 2 2 6 6 6 6 6 6 6 6 6 6 5 6 6 6 1 6 6 6 6 5 2 6 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 | // SPDX-License-Identifier: GPL-2.0 /* * trace_events_inject - trace event injection * * Copyright (C) 2019 Cong Wang <cwang@twitter.com> */ #include <linux/module.h> #include <linux/ctype.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/rculist.h> #include "trace.h" static int trace_inject_entry(struct trace_event_file *file, void *rec, int len) { struct trace_event_buffer fbuffer; int written = 0; void *entry; rcu_read_lock_sched(); entry = trace_event_buffer_reserve(&fbuffer, file, len); if (entry) { memcpy(entry, rec, len); written = len; trace_event_buffer_commit(&fbuffer); } rcu_read_unlock_sched(); return written; } static int parse_field(char *str, struct trace_event_call *call, struct ftrace_event_field **pf, u64 *pv) { struct ftrace_event_field *field; char *field_name; int s, i = 0; int len; u64 val; if (!str[i]) return 0; /* First find the field to associate to */ while (isspace(str[i])) i++; s = i; while (isalnum(str[i]) || str[i] == '_') i++; len = i - s; if (!len) return -EINVAL; field_name = kmemdup_nul(str + s, len, GFP_KERNEL); if (!field_name) return -ENOMEM; field = trace_find_event_field(call, field_name); kfree(field_name); if (!field) return -ENOENT; *pf = field; while (isspace(str[i])) i++; if (str[i] != '=') return -EINVAL; i++; while (isspace(str[i])) i++; s = i; if (isdigit(str[i]) || str[i] == '-') { char *num, c; int ret; /* Make sure the field is not a string */ if (is_string_field(field)) return -EINVAL; if (str[i] == '-') i++; /* We allow 0xDEADBEEF */ while (isalnum(str[i])) i++; num = str + s; c = str[i]; if (c != '\0' && !isspace(c)) return -EINVAL; str[i] = '\0'; /* Make sure it is a value */ if (field->is_signed) ret = kstrtoll(num, 0, &val); else ret = kstrtoull(num, 0, &val); str[i] = c; if (ret) return ret; *pv = val; return i; } else if (str[i] == '\'' || str[i] == '"') { char q = str[i]; /* Make sure the field is OK for strings */ if (!is_string_field(field)) return -EINVAL; for (i++; str[i]; i++) { if (str[i] == '\\' && str[i + 1]) { i++; continue; } if (str[i] == q) break; } if (!str[i]) return -EINVAL; /* Skip quotes */ s++; len = i - s; if (len >= MAX_FILTER_STR_VAL) return -EINVAL; *pv = (unsigned long)(str + s); str[i] = 0; /* go past the last quote */ i++; return i; } return -EINVAL; } static int trace_get_entry_size(struct trace_event_call *call) { struct ftrace_event_field *field; struct list_head *head; int size = 0; head = trace_get_fields(call); list_for_each_entry(field, head, link) { if (field->size + field->offset > size) size = field->size + field->offset; } return size; } static void *trace_alloc_entry(struct trace_event_call *call, int *size) { int entry_size = trace_get_entry_size(call); struct ftrace_event_field *field; struct list_head *head; void *entry = NULL; /* We need an extra '\0' at the end. */ entry = kzalloc(entry_size + 1, GFP_KERNEL); if (!entry) return NULL; head = trace_get_fields(call); list_for_each_entry(field, head, link) { if (!is_string_field(field)) continue; if (field->filter_type == FILTER_STATIC_STRING) continue; if (field->filter_type == FILTER_DYN_STRING || field->filter_type == FILTER_RDYN_STRING) { u32 *str_item; int str_loc = entry_size & 0xffff; if (field->filter_type == FILTER_RDYN_STRING) str_loc -= field->offset + field->size; str_item = (u32 *)(entry + field->offset); *str_item = str_loc; /* string length is 0. */ } else { char **paddr; paddr = (char **)(entry + field->offset); *paddr = ""; } } *size = entry_size + 1; return entry; } #define INJECT_STRING "STATIC STRING CAN NOT BE INJECTED" /* Caller is responsible to free the *pentry. */ static int parse_entry(char *str, struct trace_event_call *call, void **pentry) { struct ftrace_event_field *field; void *entry = NULL; int entry_size; u64 val = 0; int len; entry = trace_alloc_entry(call, &entry_size); *pentry = entry; if (!entry) return -ENOMEM; tracing_generic_entry_update(entry, call->event.type, tracing_gen_ctx()); while ((len = parse_field(str, call, &field, &val)) > 0) { if (is_function_field(field)) return -EINVAL; if (is_string_field(field)) { char *addr = (char *)(unsigned long) val; if (field->filter_type == FILTER_STATIC_STRING) { strscpy(entry + field->offset, addr, field->size); } else if (field->filter_type == FILTER_DYN_STRING || field->filter_type == FILTER_RDYN_STRING) { int str_len = strlen(addr) + 1; int str_loc = entry_size & 0xffff; u32 *str_item; entry_size += str_len; *pentry = krealloc(entry, entry_size, GFP_KERNEL); if (!*pentry) { kfree(entry); return -ENOMEM; } entry = *pentry; strscpy(entry + (entry_size - str_len), addr, str_len); str_item = (u32 *)(entry + field->offset); if (field->filter_type == FILTER_RDYN_STRING) str_loc -= field->offset + field->size; *str_item = (str_len << 16) | str_loc; } else { char **paddr; paddr = (char **)(entry + field->offset); *paddr = INJECT_STRING; } } else { switch (field->size) { case 1: { u8 tmp = (u8) val; memcpy(entry + field->offset, &tmp, 1); break; } case 2: { u16 tmp = (u16) val; memcpy(entry + field->offset, &tmp, 2); break; } case 4: { u32 tmp = (u32) val; memcpy(entry + field->offset, &tmp, 4); break; } case 8: memcpy(entry + field->offset, &val, 8); break; default: return -EINVAL; } } str += len; } if (len < 0) return len; return entry_size; } static ssize_t event_inject_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_event_call *call; struct trace_event_file *file; int err = -ENODEV, size; void *entry = NULL; char *buf; if (cnt >= PAGE_SIZE) return -EINVAL; buf = memdup_user_nul(ubuf, cnt); if (IS_ERR(buf)) return PTR_ERR(buf); strim(buf); mutex_lock(&event_mutex); file = event_file_file(filp); if (file) { call = file->event_call; size = parse_entry(buf, call, &entry); if (size < 0) err = size; else err = trace_inject_entry(file, entry, size); } mutex_unlock(&event_mutex); kfree(entry); kfree(buf); if (err < 0) return err; *ppos += err; return cnt; } static ssize_t event_inject_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { return -EPERM; } const struct file_operations event_inject_fops = { .open = tracing_open_file_tr, .read = event_inject_read, .write = event_inject_write, .release = tracing_release_file_tr, }; |
| 14 15 55 55 55 55 55 10 52 52 3 52 52 3 52 52 3 52 52 3 52 52 7 6 7 6 7 6 7 6 7 7 25 25 24 25 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2006, Johannes Berg <johannes@sipsolutions.net> */ /* just for IFNAMSIZ */ #include <linux/if.h> #include <linux/slab.h> #include <linux/export.h> #include "led.h" void ieee80211_led_assoc(struct ieee80211_local *local, bool associated) { if (!atomic_read(&local->assoc_led_active)) return; if (associated) led_trigger_event(&local->assoc_led, LED_FULL); else led_trigger_event(&local->assoc_led, LED_OFF); } void ieee80211_led_radio(struct ieee80211_local *local, bool enabled) { if (!atomic_read(&local->radio_led_active)) return; if (enabled) led_trigger_event(&local->radio_led, LED_FULL); else led_trigger_event(&local->radio_led, LED_OFF); } void ieee80211_alloc_led_names(struct ieee80211_local *local) { local->rx_led.name = kasprintf(GFP_KERNEL, "%srx", wiphy_name(local->hw.wiphy)); local->tx_led.name = kasprintf(GFP_KERNEL, "%stx", wiphy_name(local->hw.wiphy)); local->assoc_led.name = kasprintf(GFP_KERNEL, "%sassoc", wiphy_name(local->hw.wiphy)); local->radio_led.name = kasprintf(GFP_KERNEL, "%sradio", wiphy_name(local->hw.wiphy)); } void ieee80211_free_led_names(struct ieee80211_local *local) { kfree(local->rx_led.name); kfree(local->tx_led.name); kfree(local->assoc_led.name); kfree(local->radio_led.name); } static int ieee80211_tx_led_activate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, tx_led); atomic_inc(&local->tx_led_active); return 0; } static void ieee80211_tx_led_deactivate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, tx_led); atomic_dec(&local->tx_led_active); } static int ieee80211_rx_led_activate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, rx_led); atomic_inc(&local->rx_led_active); return 0; } static void ieee80211_rx_led_deactivate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, rx_led); atomic_dec(&local->rx_led_active); } static int ieee80211_assoc_led_activate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, assoc_led); atomic_inc(&local->assoc_led_active); return 0; } static void ieee80211_assoc_led_deactivate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, assoc_led); atomic_dec(&local->assoc_led_active); } static int ieee80211_radio_led_activate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, radio_led); atomic_inc(&local->radio_led_active); return 0; } static void ieee80211_radio_led_deactivate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, radio_led); atomic_dec(&local->radio_led_active); } static int ieee80211_tpt_led_activate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, tpt_led); atomic_inc(&local->tpt_led_active); return 0; } static void ieee80211_tpt_led_deactivate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, tpt_led); atomic_dec(&local->tpt_led_active); } void ieee80211_led_init(struct ieee80211_local *local) { atomic_set(&local->rx_led_active, 0); local->rx_led.activate = ieee80211_rx_led_activate; local->rx_led.deactivate = ieee80211_rx_led_deactivate; if (local->rx_led.name && led_trigger_register(&local->rx_led)) { kfree(local->rx_led.name); local->rx_led.name = NULL; } atomic_set(&local->tx_led_active, 0); local->tx_led.activate = ieee80211_tx_led_activate; local->tx_led.deactivate = ieee80211_tx_led_deactivate; if (local->tx_led.name && led_trigger_register(&local->tx_led)) { kfree(local->tx_led.name); local->tx_led.name = NULL; } atomic_set(&local->assoc_led_active, 0); local->assoc_led.activate = ieee80211_assoc_led_activate; local->assoc_led.deactivate = ieee80211_assoc_led_deactivate; if (local->assoc_led.name && led_trigger_register(&local->assoc_led)) { kfree(local->assoc_led.name); local->assoc_led.name = NULL; } atomic_set(&local->radio_led_active, 0); local->radio_led.activate = ieee80211_radio_led_activate; local->radio_led.deactivate = ieee80211_radio_led_deactivate; if (local->radio_led.name && led_trigger_register(&local->radio_led)) { kfree(local->radio_led.name); local->radio_led.name = NULL; } atomic_set(&local->tpt_led_active, 0); if (local->tpt_led_trigger) { local->tpt_led.activate = ieee80211_tpt_led_activate; local->tpt_led.deactivate = ieee80211_tpt_led_deactivate; if (led_trigger_register(&local->tpt_led)) { kfree(local->tpt_led_trigger); local->tpt_led_trigger = NULL; } } } void ieee80211_led_exit(struct ieee80211_local *local) { if (local->radio_led.name) led_trigger_unregister(&local->radio_led); if (local->assoc_led.name) led_trigger_unregister(&local->assoc_led); if (local->tx_led.name) led_trigger_unregister(&local->tx_led); if (local->rx_led.name) led_trigger_unregister(&local->rx_led); if (local->tpt_led_trigger) { led_trigger_unregister(&local->tpt_led); kfree(local->tpt_led_trigger); } } const char *__ieee80211_get_radio_led_name(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); return local->radio_led.name; } EXPORT_SYMBOL(__ieee80211_get_radio_led_name); const char *__ieee80211_get_assoc_led_name(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); return local->assoc_led.name; } EXPORT_SYMBOL(__ieee80211_get_assoc_led_name); const char *__ieee80211_get_tx_led_name(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); return local->tx_led.name; } EXPORT_SYMBOL(__ieee80211_get_tx_led_name); const char *__ieee80211_get_rx_led_name(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); return local->rx_led.name; } EXPORT_SYMBOL(__ieee80211_get_rx_led_name); static unsigned long tpt_trig_traffic(struct ieee80211_local *local, struct tpt_led_trigger *tpt_trig) { unsigned long traffic, delta; traffic = tpt_trig->tx_bytes + tpt_trig->rx_bytes; delta = traffic - tpt_trig->prev_traffic; tpt_trig->prev_traffic = traffic; return DIV_ROUND_UP(delta, 1024 / 8); } static void tpt_trig_timer(struct timer_list *t) { struct tpt_led_trigger *tpt_trig = timer_container_of(tpt_trig, t, timer); struct ieee80211_local *local = tpt_trig->local; unsigned long on, off, tpt; int i; if (!tpt_trig->running) return; mod_timer(&tpt_trig->timer, round_jiffies(jiffies + HZ)); tpt = tpt_trig_traffic(local, tpt_trig); /* default to just solid on */ on = 1; off = 0; for (i = tpt_trig->blink_table_len - 1; i >= 0; i--) { if (tpt_trig->blink_table[i].throughput < 0 || tpt > tpt_trig->blink_table[i].throughput) { off = tpt_trig->blink_table[i].blink_time / 2; on = tpt_trig->blink_table[i].blink_time - off; break; } } led_trigger_blink(&local->tpt_led, on, off); } const char * __ieee80211_create_tpt_led_trigger(struct ieee80211_hw *hw, unsigned int flags, const struct ieee80211_tpt_blink *blink_table, unsigned int blink_table_len) { struct ieee80211_local *local = hw_to_local(hw); struct tpt_led_trigger *tpt_trig; if (WARN_ON(local->tpt_led_trigger)) return NULL; tpt_trig = kzalloc_obj(struct tpt_led_trigger); if (!tpt_trig) return NULL; snprintf(tpt_trig->name, sizeof(tpt_trig->name), "%stpt", wiphy_name(local->hw.wiphy)); local->tpt_led.name = tpt_trig->name; tpt_trig->blink_table = blink_table; tpt_trig->blink_table_len = blink_table_len; tpt_trig->want = flags; tpt_trig->local = local; timer_setup(&tpt_trig->timer, tpt_trig_timer, 0); local->tpt_led_trigger = tpt_trig; return tpt_trig->name; } EXPORT_SYMBOL(__ieee80211_create_tpt_led_trigger); static void ieee80211_start_tpt_led_trig(struct ieee80211_local *local) { struct tpt_led_trigger *tpt_trig = local->tpt_led_trigger; if (tpt_trig->running) return; /* reset traffic */ tpt_trig_traffic(local, tpt_trig); tpt_trig->running = true; tpt_trig_timer(&tpt_trig->timer); mod_timer(&tpt_trig->timer, round_jiffies(jiffies + HZ)); } static void ieee80211_stop_tpt_led_trig(struct ieee80211_local *local) { struct tpt_led_trigger *tpt_trig = local->tpt_led_trigger; if (!tpt_trig->running) return; tpt_trig->running = false; timer_delete_sync(&tpt_trig->timer); led_trigger_event(&local->tpt_led, LED_OFF); } void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local, unsigned int types_on, unsigned int types_off) { struct tpt_led_trigger *tpt_trig = local->tpt_led_trigger; bool allowed; WARN_ON(types_on & types_off); if (!tpt_trig) return; tpt_trig->active &= ~types_off; tpt_trig->active |= types_on; /* * Regardless of wanted state, we shouldn't blink when * the radio is disabled -- this can happen due to some * code ordering issues with __ieee80211_recalc_idle() * being called before the radio is started. */ allowed = tpt_trig->active & IEEE80211_TPT_LEDTRIG_FL_RADIO; if (!allowed || !(tpt_trig->active & tpt_trig->want)) ieee80211_stop_tpt_led_trig(local); else ieee80211_start_tpt_led_trig(local); } |
| 1 4 1 3 3 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * acpi_bus.h - ACPI Bus Driver ($Revision: 22 $) * * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com> * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> */ #ifndef __ACPI_BUS_H__ #define __ACPI_BUS_H__ #include <linux/completion.h> #include <linux/container_of.h> #include <linux/device.h> #include <linux/kobject.h> #include <linux/mutex.h> #include <linux/property.h> #include <linux/types.h> struct notifier_block; struct acpi_handle_list { u32 count; acpi_handle *handles; }; /* acpi_utils.h */ acpi_status acpi_extract_package(union acpi_object *package, struct acpi_buffer *format, struct acpi_buffer *buffer); acpi_status acpi_evaluate_integer(acpi_handle handle, acpi_string pathname, struct acpi_object_list *arguments, unsigned long long *data); bool acpi_evaluate_reference(acpi_handle handle, acpi_string pathname, struct acpi_object_list *arguments, struct acpi_handle_list *list); bool acpi_handle_list_equal(struct acpi_handle_list *list1, struct acpi_handle_list *list2); void acpi_handle_list_replace(struct acpi_handle_list *dst, struct acpi_handle_list *src); void acpi_handle_list_free(struct acpi_handle_list *list); bool acpi_device_dep(acpi_handle target, acpi_handle match); acpi_status acpi_evaluate_ost(acpi_handle handle, u32 source_event, u32 status_code, struct acpi_buffer *status_buf); bool acpi_has_method(acpi_handle handle, char *name); acpi_status acpi_execute_simple_method(acpi_handle handle, char *method, u64 arg); acpi_status acpi_evaluate_ej0(acpi_handle handle); acpi_status acpi_evaluate_lck(acpi_handle handle, int lock); acpi_status acpi_evaluate_reg(acpi_handle handle, u8 space_id, u32 function); bool acpi_ata_match(acpi_handle handle); bool acpi_bay_match(acpi_handle handle); bool acpi_dock_match(acpi_handle handle); bool acpi_check_dsm(acpi_handle handle, const guid_t *guid, u64 rev, u64 funcs); union acpi_object *acpi_evaluate_dsm(acpi_handle handle, const guid_t *guid, u64 rev, u64 func, union acpi_object *argv4); #ifdef CONFIG_ACPI bool acpi_get_physical_device_location(acpi_handle handle, struct acpi_pld_info **pld); static inline union acpi_object * acpi_evaluate_dsm_typed(acpi_handle handle, const guid_t *guid, u64 rev, u64 func, union acpi_object *argv4, acpi_object_type type) { union acpi_object *obj; obj = acpi_evaluate_dsm(handle, guid, rev, func, argv4); if (obj && obj->type != type) { ACPI_FREE(obj); obj = NULL; } return obj; } #endif #define ACPI_INIT_DSM_ARGV4(cnt, eles) \ { \ .package.type = ACPI_TYPE_PACKAGE, \ .package.count = (cnt), \ .package.elements = (eles) \ } bool acpi_dev_found(const char *hid); bool acpi_dev_present(const char *hid, const char *uid, s64 hrv); bool acpi_reduced_hardware(void); #ifdef CONFIG_ACPI struct proc_dir_entry; #define ACPI_BUS_FILE_ROOT "acpi" extern struct proc_dir_entry *acpi_root_dir; enum acpi_bus_device_type { ACPI_BUS_TYPE_DEVICE = 0, ACPI_BUS_TYPE_POWER, ACPI_BUS_TYPE_PROCESSOR, ACPI_BUS_TYPE_THERMAL, ACPI_BUS_TYPE_POWER_BUTTON, ACPI_BUS_TYPE_SLEEP_BUTTON, ACPI_BUS_TYPE_ECDT_EC, ACPI_BUS_DEVICE_TYPE_COUNT }; struct acpi_driver; struct acpi_device; /* * ACPI Scan Handler * ----------------- */ struct acpi_hotplug_profile { struct kobject kobj; int (*scan_dependent)(struct acpi_device *adev); void (*notify_online)(struct acpi_device *adev); bool enabled:1; bool demand_offline:1; }; static inline struct acpi_hotplug_profile *to_acpi_hotplug_profile( struct kobject *kobj) { return container_of(kobj, struct acpi_hotplug_profile, kobj); } struct acpi_scan_handler { struct list_head list_node; const struct acpi_device_id *ids; bool (*match)(const char *idstr, const struct acpi_device_id **matchid); int (*attach)(struct acpi_device *dev, const struct acpi_device_id *id); void (*detach)(struct acpi_device *dev); void (*post_eject)(struct acpi_device *dev); void (*bind)(struct device *phys_dev); void (*unbind)(struct device *phys_dev); struct acpi_hotplug_profile hotplug; }; /* * ACPI Hotplug Context * -------------------- */ typedef int (*acpi_hp_notify) (struct acpi_device *, u32); typedef void (*acpi_hp_uevent) (struct acpi_device *, u32); typedef void (*acpi_hp_fixup) (struct acpi_device *); struct acpi_hotplug_context { struct acpi_device *self; acpi_hp_notify notify; acpi_hp_uevent uevent; acpi_hp_fixup fixup; }; /* * ACPI Driver * ----------- */ typedef int (*acpi_op_add) (struct acpi_device * device); typedef void (*acpi_op_remove) (struct acpi_device *device); typedef void (*acpi_op_notify) (struct acpi_device * device, u32 event); struct acpi_device_ops { acpi_op_add add; acpi_op_remove remove; acpi_op_notify notify; }; #define ACPI_DRIVER_ALL_NOTIFY_EVENTS 0x1 /* system AND device events */ struct acpi_driver { char name[80]; char class[80]; const struct acpi_device_id *ids; /* Supported Hardware IDs */ unsigned int flags; struct acpi_device_ops ops; struct device_driver drv; }; /* * ACPI Device * ----------- */ /* Status (_STA) */ struct acpi_device_status { u32 present:1; u32 enabled:1; u32 show_in_ui:1; u32 functional:1; u32 battery_present:1; u32 reserved:27; }; /* Flags */ struct acpi_device_flags { u32 dynamic_status:1; u32 removable:1; u32 ejectable:1; u32 power_manageable:1; u32 match_driver:1; u32 initialized:1; u32 visited:1; u32 hotplug_notify:1; u32 is_dock_station:1; u32 of_compatible_ok:1; u32 coherent_dma:1; u32 cca_seen:1; u32 enumeration_by_parent:1; u32 honor_deps:1; u32 reserved:18; }; /* File System */ struct acpi_device_dir { struct proc_dir_entry *entry; }; #define acpi_device_dir(d) ((d)->dir.entry) /* Plug and Play */ #define MAX_ACPI_DEVICE_NAME_LEN 40 #define MAX_ACPI_CLASS_NAME_LEN 20 typedef char acpi_bus_id[8]; typedef u64 acpi_bus_address; typedef char acpi_device_name[MAX_ACPI_DEVICE_NAME_LEN]; typedef char acpi_device_class[MAX_ACPI_CLASS_NAME_LEN]; struct acpi_hardware_id { struct list_head list; const char *id; }; struct acpi_pnp_type { u32 hardware_id:1; u32 bus_address:1; u32 platform_id:1; u32 backlight:1; u32 reserved:28; }; struct acpi_device_pnp { acpi_bus_id bus_id; /* Object name */ int instance_no; /* Instance number of this object */ struct acpi_pnp_type type; /* ID type */ acpi_bus_address bus_address; /* _ADR */ char *unique_id; /* _UID */ struct list_head ids; /* _HID and _CIDs */ acpi_device_name device_name; /* Driver-determined */ acpi_device_class device_class; /* " */ }; #define acpi_device_bid(d) ((d)->pnp.bus_id) #define acpi_device_adr(d) ((d)->pnp.bus_address) const char *acpi_device_hid(struct acpi_device *device); #define acpi_device_uid(d) ((d)->pnp.unique_id) #define acpi_device_name(d) ((d)->pnp.device_name) #define acpi_device_class(d) ((d)->pnp.device_class) /* Power Management */ struct acpi_device_power_flags { u32 explicit_get:1; /* _PSC present? */ u32 power_resources:1; /* Power resources */ u32 inrush_current:1; /* Serialize Dx->D0 */ u32 power_removed:1; /* Optimize Dx->D0 */ u32 ignore_parent:1; /* Power is independent of parent power state */ u32 dsw_present:1; /* _DSW present? */ u32 reserved:26; }; struct acpi_device_power_state { struct list_head resources; /* Power resources referenced */ struct { u8 valid:1; u8 explicit_set:1; /* _PSx present? */ u8 reserved:6; } flags; int power; /* % Power (compared to D0) */ int latency; /* Dx->D0 time (microseconds) */ }; struct acpi_device_power { int state; /* Current state */ struct acpi_device_power_flags flags; struct acpi_device_power_state states[ACPI_D_STATE_COUNT]; /* Power states (D0-D3Cold) */ u8 state_for_enumeration; /* Deepest power state for enumeration */ }; struct acpi_dep_data { struct list_head node; acpi_handle supplier; acpi_handle consumer; bool honor_dep; bool met; bool free_when_met; }; /* Performance Management */ struct acpi_device_perf_flags { u8 reserved:8; }; struct acpi_device_perf_state { struct { u8 valid:1; u8 reserved:7; } flags; u8 power; /* % Power (compared to P0) */ u8 performance; /* % Performance ( " ) */ int latency; /* Px->P0 time (microseconds) */ }; struct acpi_device_perf { int state; struct acpi_device_perf_flags flags; int state_count; struct acpi_device_perf_state *states; }; /* Wakeup Management */ struct acpi_device_wakeup_flags { u8 valid:1; /* Can successfully enable wakeup? */ u8 notifier_present:1; /* Wake-up notify handler has been installed */ }; struct acpi_device_wakeup_context { void (*func)(struct acpi_device_wakeup_context *context); struct device *dev; }; struct acpi_device_wakeup { acpi_handle gpe_device; u64 gpe_number; u64 sleep_state; struct list_head resources; struct acpi_device_wakeup_flags flags; struct acpi_device_wakeup_context context; struct wakeup_source *ws; int prepare_count; int enable_count; }; struct acpi_device_physical_node { struct list_head node; struct device *dev; unsigned int node_id; bool put_online:1; }; struct acpi_device_properties { struct list_head list; const guid_t *guid; union acpi_object *properties; void **bufs; }; /* ACPI Device Specific Data (_DSD) */ struct acpi_device_data { const union acpi_object *pointer; struct list_head properties; const union acpi_object *of_compatible; struct list_head subnodes; }; struct acpi_gpio_mapping; #define ACPI_DEVICE_SWNODE_ROOT 0 /* * The maximum expected number of CSI-2 data lanes. * * This number is not expected to ever have to be equal to or greater than the * number of bits in an unsigned long variable, but if it needs to be increased * above that limit, code will need to be adjusted accordingly. */ #define ACPI_DEVICE_CSI2_DATA_LANES 8 #define ACPI_DEVICE_SWNODE_PORT_NAME_LENGTH 8 enum acpi_device_swnode_dev_props { ACPI_DEVICE_SWNODE_DEV_ROTATION, ACPI_DEVICE_SWNODE_DEV_CLOCK_FREQUENCY, ACPI_DEVICE_SWNODE_DEV_LED_MAX_MICROAMP, ACPI_DEVICE_SWNODE_DEV_FLASH_MAX_MICROAMP, ACPI_DEVICE_SWNODE_DEV_FLASH_MAX_TIMEOUT_US, ACPI_DEVICE_SWNODE_DEV_NUM_OF, ACPI_DEVICE_SWNODE_DEV_NUM_ENTRIES }; enum acpi_device_swnode_port_props { ACPI_DEVICE_SWNODE_PORT_REG, ACPI_DEVICE_SWNODE_PORT_NUM_OF, ACPI_DEVICE_SWNODE_PORT_NUM_ENTRIES }; enum acpi_device_swnode_ep_props { ACPI_DEVICE_SWNODE_EP_REMOTE_EP, ACPI_DEVICE_SWNODE_EP_BUS_TYPE, ACPI_DEVICE_SWNODE_EP_REG, ACPI_DEVICE_SWNODE_EP_CLOCK_LANES, ACPI_DEVICE_SWNODE_EP_DATA_LANES, ACPI_DEVICE_SWNODE_EP_LANE_POLARITIES, /* TX only */ ACPI_DEVICE_SWNODE_EP_LINK_FREQUENCIES, ACPI_DEVICE_SWNODE_EP_NUM_OF, ACPI_DEVICE_SWNODE_EP_NUM_ENTRIES }; /* * Each device has a root software node plus two times as many nodes as the * number of CSI-2 ports. */ #define ACPI_DEVICE_SWNODE_PORT(port) (2 * (port) + 1) #define ACPI_DEVICE_SWNODE_EP(endpoint) \ (ACPI_DEVICE_SWNODE_PORT(endpoint) + 1) /** * struct acpi_device_software_node_port - MIPI DisCo for Imaging CSI-2 port * @port_name: Port name. * @data_lanes: "data-lanes" property values. * @lane_polarities: "lane-polarities" property values. * @link_frequencies: "link_frequencies" property values. * @port_nr: Port number. * @crs_crs2_local: _CRS CSI2 record present (i.e. this is a transmitter one). * @port_props: Port properties. * @ep_props: Endpoint properties. * @remote_ep: Reference to the remote endpoint. */ struct acpi_device_software_node_port { char port_name[ACPI_DEVICE_SWNODE_PORT_NAME_LENGTH + 1]; u32 data_lanes[ACPI_DEVICE_CSI2_DATA_LANES]; u32 lane_polarities[ACPI_DEVICE_CSI2_DATA_LANES + 1 /* clock lane */]; u64 link_frequencies[ACPI_DEVICE_CSI2_DATA_LANES]; unsigned int port_nr; bool crs_csi2_local; struct property_entry port_props[ACPI_DEVICE_SWNODE_PORT_NUM_ENTRIES]; struct property_entry ep_props[ACPI_DEVICE_SWNODE_EP_NUM_ENTRIES]; struct software_node_ref_args remote_ep[1]; }; /** * struct acpi_device_software_nodes - Software nodes for an ACPI device * @dev_props: Device properties. * @nodes: Software nodes for root as well as ports and endpoints. * @nodeprts: Array of software node pointers, for (un)registering them. * @ports: Information related to each port and endpoint within a port. * @num_ports: The number of ports. */ struct acpi_device_software_nodes { struct property_entry dev_props[ACPI_DEVICE_SWNODE_DEV_NUM_ENTRIES]; struct software_node *nodes; const struct software_node **nodeptrs; struct acpi_device_software_node_port *ports; unsigned int num_ports; }; /* Device */ struct acpi_device { u32 pld_crc; int device_type; acpi_handle handle; /* no handle for fixed hardware */ struct fwnode_handle fwnode; struct list_head wakeup_list; struct list_head del_list; struct acpi_device_status status; struct acpi_device_flags flags; struct acpi_device_pnp pnp; struct acpi_device_power power; struct acpi_device_wakeup wakeup; struct acpi_device_perf performance; struct acpi_device_dir dir; struct acpi_device_data data; struct acpi_scan_handler *handler; struct acpi_hotplug_context *hp; struct acpi_device_software_nodes *swnodes; const struct acpi_gpio_mapping *driver_gpios; void *driver_data; struct device dev; unsigned int physical_node_count; unsigned int dep_unmet; struct list_head physical_node_list; struct mutex physical_node_lock; void (*remove)(struct acpi_device *); }; /* Non-device subnode */ struct acpi_data_node { struct list_head sibling; const char *name; acpi_handle handle; struct fwnode_handle fwnode; struct fwnode_handle *parent; struct acpi_device_data data; struct kobject kobj; struct completion kobj_done; }; extern const struct fwnode_operations acpi_device_fwnode_ops; extern const struct fwnode_operations acpi_data_fwnode_ops; extern const struct fwnode_operations acpi_static_fwnode_ops; bool is_acpi_device_node(const struct fwnode_handle *fwnode); bool is_acpi_data_node(const struct fwnode_handle *fwnode); static inline bool is_acpi_node(const struct fwnode_handle *fwnode) { return (is_acpi_device_node(fwnode) || is_acpi_data_node(fwnode)); } #define to_acpi_device_node(__fwnode) \ ({ \ typeof(__fwnode) __to_acpi_device_node_fwnode = __fwnode; \ \ is_acpi_device_node(__to_acpi_device_node_fwnode) ? \ container_of(__to_acpi_device_node_fwnode, \ struct acpi_device, fwnode) : \ NULL; \ }) #define to_acpi_data_node(__fwnode) \ ({ \ typeof(__fwnode) __to_acpi_data_node_fwnode = __fwnode; \ \ is_acpi_data_node(__to_acpi_data_node_fwnode) ? \ container_of(__to_acpi_data_node_fwnode, \ struct acpi_data_node, fwnode) : \ NULL; \ }) static inline bool is_acpi_static_node(const struct fwnode_handle *fwnode) { return !IS_ERR_OR_NULL(fwnode) && fwnode->ops == &acpi_static_fwnode_ops; } static inline bool acpi_data_node_match(const struct fwnode_handle *fwnode, const char *name) { return is_acpi_data_node(fwnode) ? (!strcmp(to_acpi_data_node(fwnode)->name, name)) : false; } static inline struct fwnode_handle *acpi_fwnode_handle(struct acpi_device *adev) { return &adev->fwnode; } static inline void *acpi_driver_data(struct acpi_device *d) { return d->driver_data; } #define to_acpi_device(d) container_of(d, struct acpi_device, dev) #define to_acpi_driver(d) container_of_const(d, struct acpi_driver, drv) static inline struct acpi_device *acpi_dev_parent(struct acpi_device *adev) { if (adev->dev.parent) return to_acpi_device(adev->dev.parent); return NULL; } static inline void acpi_set_device_status(struct acpi_device *adev, u32 sta) { *((u32 *)&adev->status) = sta; } static inline void acpi_set_hp_context(struct acpi_device *adev, struct acpi_hotplug_context *hp) { hp->self = adev; adev->hp = hp; } void acpi_initialize_hp_context(struct acpi_device *adev, struct acpi_hotplug_context *hp, acpi_hp_notify notify, acpi_hp_uevent uevent); /* acpi_device.dev.bus == &acpi_bus_type */ extern const struct bus_type acpi_bus_type; int acpi_bus_for_each_dev(int (*fn)(struct device *, void *), void *data); int acpi_dev_for_each_child(struct acpi_device *adev, int (*fn)(struct acpi_device *, void *), void *data); int acpi_dev_for_each_child_reverse(struct acpi_device *adev, int (*fn)(struct acpi_device *, void *), void *data); /* * Events * ------ */ struct acpi_bus_event { struct list_head node; acpi_device_class device_class; acpi_bus_id bus_id; u32 type; u32 data; }; #define ACPI_AC_CLASS "ac_adapter" extern struct kobject *acpi_kobj; extern int acpi_bus_generate_netlink_event(const char*, const char*, u8, int); void acpi_bus_private_data_handler(acpi_handle, void *); int acpi_bus_get_private_data(acpi_handle, void **); int acpi_bus_attach_private_data(acpi_handle, void *); void acpi_bus_detach_private_data(acpi_handle); int acpi_dev_install_notify_handler(struct acpi_device *adev, u32 handler_type, acpi_notify_handler handler, void *context); void acpi_dev_remove_notify_handler(struct acpi_device *adev, u32 handler_type, acpi_notify_handler handler); extern int acpi_notifier_call_chain(const char *device_class, const char *bus_id, u32 type, u32 data); extern int register_acpi_notifier(struct notifier_block *); extern int unregister_acpi_notifier(struct notifier_block *); /* * External Functions */ acpi_status acpi_bus_get_status_handle(acpi_handle handle, unsigned long long *sta); int acpi_bus_get_status(struct acpi_device *device); int acpi_bus_set_power(acpi_handle handle, int state); const char *acpi_power_state_string(int state); int acpi_device_set_power(struct acpi_device *device, int state); int acpi_bus_init_power(struct acpi_device *device); int acpi_device_fix_up_power(struct acpi_device *device); void acpi_device_fix_up_power_extended(struct acpi_device *adev); void acpi_device_fix_up_power_children(struct acpi_device *adev); int acpi_bus_update_power(acpi_handle handle, int *state_p); int acpi_device_update_power(struct acpi_device *device, int *state_p); bool acpi_bus_power_manageable(acpi_handle handle); void acpi_dev_power_up_children_with_adr(struct acpi_device *adev); u8 acpi_dev_power_state_for_wake(struct acpi_device *adev); int acpi_device_power_add_dependent(struct acpi_device *adev, struct device *dev); void acpi_device_power_remove_dependent(struct acpi_device *adev, struct device *dev); #ifdef CONFIG_PM bool acpi_bus_can_wakeup(acpi_handle handle); #else static inline bool acpi_bus_can_wakeup(acpi_handle handle) { return false; } #endif void acpi_scan_lock_acquire(void); void acpi_scan_lock_release(void); void acpi_lock_hp_context(void); void acpi_unlock_hp_context(void); int acpi_scan_add_handler(struct acpi_scan_handler *handler); /* * use a macro to avoid include chaining to get THIS_MODULE */ #define acpi_bus_register_driver(drv) \ __acpi_bus_register_driver(drv, THIS_MODULE) int __acpi_bus_register_driver(struct acpi_driver *driver, struct module *owner); void acpi_bus_unregister_driver(struct acpi_driver *driver); int acpi_bus_scan(acpi_handle handle); void acpi_bus_trim(struct acpi_device *start); acpi_status acpi_bus_get_ejd(acpi_handle handle, acpi_handle * ejd); int acpi_match_device_ids(struct acpi_device *device, const struct acpi_device_id *ids); void acpi_set_modalias(struct acpi_device *adev, const char *default_id, char *modalias, size_t len); static inline bool acpi_device_enumerated(struct acpi_device *adev) { return adev && adev->flags.initialized && adev->flags.visited; } /** * module_acpi_driver(acpi_driver) - Helper macro for registering an ACPI driver * @__acpi_driver: acpi_driver struct * * Helper macro for ACPI drivers which do not do anything special in module * init/exit. This eliminates a lot of boilerplate. Each module may only * use this macro once, and calling it replaces module_init() and module_exit() */ #define module_acpi_driver(__acpi_driver) \ module_driver(__acpi_driver, acpi_bus_register_driver, \ acpi_bus_unregister_driver) /* * Bind physical devices with ACPI devices */ struct acpi_bus_type { struct list_head list; const char *name; bool (*match)(struct device *dev); struct acpi_device * (*find_companion)(struct device *); void (*setup)(struct device *); }; int register_acpi_bus_type(struct acpi_bus_type *); int unregister_acpi_bus_type(struct acpi_bus_type *); int acpi_bind_one(struct device *dev, struct acpi_device *adev); int acpi_unbind_one(struct device *dev); enum acpi_bridge_type { ACPI_BRIDGE_TYPE_PCIE = 1, ACPI_BRIDGE_TYPE_CXL, }; struct acpi_pci_root { struct acpi_device * device; struct pci_bus *bus; u16 segment; int bridge_type; struct resource secondary; /* downstream bus range */ u32 osc_support_set; /* _OSC state of support bits */ u32 osc_control_set; /* _OSC state of control bits */ u32 osc_ext_support_set; /* _OSC state of extended support bits */ u32 osc_ext_control_set; /* _OSC state of extended control bits */ phys_addr_t mcfg_addr; }; /* helper */ struct iommu_ops; bool acpi_dma_supported(const struct acpi_device *adev); enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev); int acpi_iommu_fwspec_init(struct device *dev, u32 id, struct fwnode_handle *fwnode); int acpi_dma_get_range(struct device *dev, const struct bus_dma_region **map); int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr, const u32 *input_id); static inline int acpi_dma_configure(struct device *dev, enum dev_dma_attr attr) { return acpi_dma_configure_id(dev, attr, NULL); } struct acpi_device *acpi_find_child_device(struct acpi_device *parent, u64 address, bool check_children); struct acpi_device *acpi_find_child_by_adr(struct acpi_device *adev, acpi_bus_address adr); int acpi_is_root_bridge(acpi_handle); struct acpi_pci_root *acpi_pci_find_root(acpi_handle handle); int acpi_enable_wakeup_device_power(struct acpi_device *dev, int state); int acpi_disable_wakeup_device_power(struct acpi_device *dev); #ifdef CONFIG_X86 bool acpi_device_override_status(struct acpi_device *adev, unsigned long long *status); bool acpi_quirk_skip_acpi_ac_and_battery(void); int acpi_quirk_skip_serdev_enumeration(struct device *controller_parent, bool *skip); #else static inline bool acpi_device_override_status(struct acpi_device *adev, unsigned long long *status) { return false; } static inline bool acpi_quirk_skip_acpi_ac_and_battery(void) { return false; } static inline int acpi_quirk_skip_serdev_enumeration(struct device *controller_parent, bool *skip) { *skip = false; return 0; } #endif #if IS_ENABLED(CONFIG_X86_ANDROID_TABLETS) bool acpi_quirk_skip_i2c_client_enumeration(struct acpi_device *adev); bool acpi_quirk_skip_gpio_event_handlers(void); #else static inline bool acpi_quirk_skip_i2c_client_enumeration(struct acpi_device *adev) { return false; } static inline bool acpi_quirk_skip_gpio_event_handlers(void) { return false; } #endif #ifdef CONFIG_PM void acpi_pm_wakeup_event(struct device *dev); acpi_status acpi_add_pm_notifier(struct acpi_device *adev, struct device *dev, void (*func)(struct acpi_device_wakeup_context *context)); acpi_status acpi_remove_pm_notifier(struct acpi_device *adev); bool acpi_pm_device_can_wakeup(struct device *dev); int acpi_pm_device_sleep_state(struct device *, int *, int); int acpi_pm_set_device_wakeup(struct device *dev, bool enable); #else static inline void acpi_pm_wakeup_event(struct device *dev) { } static inline acpi_status acpi_add_pm_notifier(struct acpi_device *adev, struct device *dev, void (*func)(struct acpi_device_wakeup_context *context)) { return AE_SUPPORT; } static inline acpi_status acpi_remove_pm_notifier(struct acpi_device *adev) { return AE_SUPPORT; } static inline bool acpi_pm_device_can_wakeup(struct device *dev) { return false; } static inline int acpi_pm_device_sleep_state(struct device *d, int *p, int m) { if (p) *p = ACPI_STATE_D0; return (m >= ACPI_STATE_D0 && m <= ACPI_STATE_D3_COLD) ? m : ACPI_STATE_D0; } static inline int acpi_pm_set_device_wakeup(struct device *dev, bool enable) { return -ENODEV; } #endif #ifdef CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT bool acpi_sleep_state_supported(u8 sleep_state); #else static inline bool acpi_sleep_state_supported(u8 sleep_state) { return false; } #endif #ifdef CONFIG_ACPI_SLEEP u32 acpi_target_system_state(void); #else static inline u32 acpi_target_system_state(void) { return ACPI_STATE_S0; } #endif static inline bool acpi_device_power_manageable(struct acpi_device *adev) { return adev->flags.power_manageable; } static inline bool acpi_device_can_wakeup(struct acpi_device *adev) { return adev->wakeup.flags.valid; } static inline bool acpi_device_can_poweroff(struct acpi_device *adev) { return adev->power.states[ACPI_STATE_D3_COLD].flags.valid || ((acpi_gbl_FADT.header.revision < 6) && adev->power.states[ACPI_STATE_D3_HOT].flags.explicit_set); } int acpi_dev_uid_to_integer(struct acpi_device *adev, u64 *integer); static inline bool acpi_dev_hid_match(struct acpi_device *adev, const char *hid2) { const char *hid1 = acpi_device_hid(adev); return hid1 && hid2 && !strcmp(hid1, hid2); } static inline bool acpi_str_uid_match(struct acpi_device *adev, const char *uid2) { const char *uid1 = acpi_device_uid(adev); return uid1 && uid2 && !strcmp(uid1, uid2); } static inline bool acpi_int_uid_match(struct acpi_device *adev, u64 uid2) { u64 uid1; return !acpi_dev_uid_to_integer(adev, &uid1) && uid1 == uid2; } #define TYPE_ENTRY(type, x) \ const type: x, \ type: x #define ACPI_STR_TYPES(match) \ TYPE_ENTRY(unsigned char *, match), \ TYPE_ENTRY(signed char *, match), \ TYPE_ENTRY(char *, match), \ TYPE_ENTRY(void *, match) /** * acpi_dev_uid_match - Match device by supplied UID * @adev: ACPI device to match. * @uid2: Unique ID of the device. * * Matches UID in @adev with given @uid2. * * Returns: %true if matches, %false otherwise. */ #define acpi_dev_uid_match(adev, uid2) \ _Generic(uid2, \ /* Treat @uid2 as a string for acpi string types */ \ ACPI_STR_TYPES(acpi_str_uid_match), \ /* Treat as an integer otherwise */ \ default: acpi_int_uid_match)(adev, uid2) /** * acpi_dev_hid_uid_match - Match device by supplied HID and UID * @adev: ACPI device to match. * @hid2: Hardware ID of the device. * @uid2: Unique ID of the device, pass NULL to not check _UID. * * Matches HID and UID in @adev with given @hid2 and @uid2. Absence of @uid2 * will be treated as a match. If user wants to validate @uid2, it should be * done before calling this function. * * Returns: %true if matches or @uid2 is NULL, %false otherwise. */ #define acpi_dev_hid_uid_match(adev, hid2, uid2) \ (acpi_dev_hid_match(adev, hid2) && \ /* Distinguish integer 0 from NULL @uid2 */ \ (_Generic(uid2, ACPI_STR_TYPES(!(uid2)), default: 0) || \ acpi_dev_uid_match(adev, uid2))) void acpi_dev_clear_dependencies(struct acpi_device *supplier); bool acpi_dev_ready_for_enumeration(const struct acpi_device *device); struct acpi_device *acpi_dev_get_next_consumer_dev(struct acpi_device *supplier, struct acpi_device *start); /** * for_each_acpi_consumer_dev - iterate over the consumer ACPI devices for a * given supplier * @supplier: Pointer to the supplier's ACPI device * @consumer: Pointer to &struct acpi_device to hold the consumer, initially NULL */ #define for_each_acpi_consumer_dev(supplier, consumer) \ for (consumer = acpi_dev_get_next_consumer_dev(supplier, NULL); \ consumer; \ consumer = acpi_dev_get_next_consumer_dev(supplier, consumer)) struct acpi_device * acpi_dev_get_next_match_dev(struct acpi_device *adev, const char *hid, const char *uid, s64 hrv); struct acpi_device * acpi_dev_get_first_match_dev(const char *hid, const char *uid, s64 hrv); /** * for_each_acpi_dev_match - iterate over ACPI devices that matching the criteria * @adev: pointer to the matching ACPI device, NULL at the end of the loop * @hid: Hardware ID of the device. * @uid: Unique ID of the device, pass NULL to not check _UID * @hrv: Hardware Revision of the device, pass -1 to not check _HRV * * The caller is responsible for invoking acpi_dev_put() on the returned device. */ #define for_each_acpi_dev_match(adev, hid, uid, hrv) \ for (adev = acpi_dev_get_first_match_dev(hid, uid, hrv); \ adev; \ adev = acpi_dev_get_next_match_dev(adev, hid, uid, hrv)) static inline struct acpi_device *acpi_dev_get(struct acpi_device *adev) { return adev ? to_acpi_device(get_device(&adev->dev)) : NULL; } static inline void acpi_dev_put(struct acpi_device *adev) { if (adev) put_device(&adev->dev); } struct acpi_device *acpi_fetch_acpi_dev(acpi_handle handle); struct acpi_device *acpi_get_acpi_dev(acpi_handle handle); static inline void acpi_put_acpi_dev(struct acpi_device *adev) { acpi_dev_put(adev); } int acpi_wait_for_acpi_ipmi(void); int acpi_scan_add_dep(acpi_handle handle, struct acpi_handle_list *dep_devices); u32 arch_acpi_add_auto_dep(acpi_handle handle); #else /* CONFIG_ACPI */ static inline int register_acpi_bus_type(void *bus) { return 0; } static inline int unregister_acpi_bus_type(void *bus) { return 0; } static inline int acpi_wait_for_acpi_ipmi(void) { return 0; } static inline const char *acpi_device_hid(struct acpi_device *device) { return ""; } static inline bool acpi_get_physical_device_location(acpi_handle handle, struct acpi_pld_info **pld) { return false; } #define for_each_acpi_consumer_dev(supplier, consumer) \ for (consumer = NULL; false && (supplier);) #define for_each_acpi_dev_match(adev, hid, uid, hrv) \ for (adev = NULL; false && (hid) && (uid) && (hrv); ) #endif /* CONFIG_ACPI */ #endif /*__ACPI_BUS_H__*/ |
| 41 41 41 17 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 | // SPDX-License-Identifier: GPL-2.0-only /* * The NFC Controller Interface is the communication protocol between an * NFC Controller (NFCC) and a Device Host (DH). * This is the HCI over NCI implementation, as specified in the 10.2 * section of the NCI 1.1 specification. * * Copyright (C) 2014 STMicroelectronics SAS. All rights reserved. */ #include <linux/skbuff.h> #include "../nfc.h" #include <net/nfc/nci.h> #include <net/nfc/nci_core.h> #include <linux/nfc.h> #include <linux/kcov.h> struct nci_data { u8 conn_id; u8 pipe; u8 cmd; const u8 *data; u32 data_len; } __packed; struct nci_hci_create_pipe_params { u8 src_gate; u8 dest_host; u8 dest_gate; } __packed; struct nci_hci_create_pipe_resp { u8 src_host; u8 src_gate; u8 dest_host; u8 dest_gate; u8 pipe; } __packed; struct nci_hci_delete_pipe_noti { u8 pipe; } __packed; struct nci_hci_all_pipe_cleared_noti { u8 host; } __packed; struct nci_hcp_message { u8 header; /* type -cmd,evt,rsp- + instruction */ u8 data[]; } __packed; struct nci_hcp_packet { u8 header; /* cbit+pipe */ struct nci_hcp_message message; } __packed; #define NCI_HCI_ANY_SET_PARAMETER 0x01 #define NCI_HCI_ANY_GET_PARAMETER 0x02 #define NCI_HCI_ANY_CLOSE_PIPE 0x04 #define NCI_HCI_ADM_CLEAR_ALL_PIPE 0x14 #define NCI_HFP_NO_CHAINING 0x80 #define NCI_NFCEE_ID_HCI 0x80 #define NCI_EVT_HOT_PLUG 0x03 #define NCI_HCI_ADMIN_PARAM_SESSION_IDENTITY 0x01 #define NCI_HCI_ADM_CREATE_PIPE 0x10 #define NCI_HCI_ADM_DELETE_PIPE 0x11 /* HCP headers */ #define NCI_HCI_HCP_PACKET_HEADER_LEN 1 #define NCI_HCI_HCP_MESSAGE_HEADER_LEN 1 #define NCI_HCI_HCP_HEADER_LEN 2 /* HCP types */ #define NCI_HCI_HCP_COMMAND 0x00 #define NCI_HCI_HCP_EVENT 0x01 #define NCI_HCI_HCP_RESPONSE 0x02 #define NCI_HCI_ADM_NOTIFY_PIPE_CREATED 0x12 #define NCI_HCI_ADM_NOTIFY_PIPE_DELETED 0x13 #define NCI_HCI_ADM_NOTIFY_ALL_PIPE_CLEARED 0x15 #define NCI_HCI_FRAGMENT 0x7f #define NCI_HCP_HEADER(type, instr) ((((type) & 0x03) << 6) |\ ((instr) & 0x3f)) #define NCI_HCP_MSG_GET_TYPE(header) ((header & 0xc0) >> 6) #define NCI_HCP_MSG_GET_CMD(header) (header & 0x3f) #define NCI_HCP_MSG_GET_PIPE(header) (header & 0x7f) static int nci_hci_result_to_errno(u8 result) { switch (result) { case NCI_HCI_ANY_OK: return 0; case NCI_HCI_ANY_E_REG_PAR_UNKNOWN: return -EOPNOTSUPP; case NCI_HCI_ANY_E_TIMEOUT: return -ETIME; default: return -1; } } /* HCI core */ static void nci_hci_reset_pipes(struct nci_hci_dev *hdev) { int i; for (i = 0; i < NCI_HCI_MAX_PIPES; i++) { hdev->pipes[i].gate = NCI_HCI_INVALID_GATE; hdev->pipes[i].host = NCI_HCI_INVALID_HOST; } memset(hdev->gate2pipe, NCI_HCI_INVALID_PIPE, sizeof(hdev->gate2pipe)); } static void nci_hci_reset_pipes_per_host(struct nci_dev *ndev, u8 host) { int i; for (i = 0; i < NCI_HCI_MAX_PIPES; i++) { if (ndev->hci_dev->pipes[i].host == host) { ndev->hci_dev->pipes[i].gate = NCI_HCI_INVALID_GATE; ndev->hci_dev->pipes[i].host = NCI_HCI_INVALID_HOST; } } } /* Fragment HCI data over NCI packet. * NFC Forum NCI 10.2.2 Data Exchange: * The payload of the Data Packets sent on the Logical Connection SHALL be * valid HCP packets, as defined within [ETSI_102622]. Each Data Packet SHALL * contain a single HCP packet. NCI Segmentation and Reassembly SHALL NOT be * applied to Data Messages in either direction. The HCI fragmentation mechanism * is used if required. */ static int nci_hci_send_data(struct nci_dev *ndev, u8 pipe, const u8 data_type, const u8 *data, size_t data_len) { const struct nci_conn_info *conn_info; struct sk_buff *skb; int len, i, r; u8 cb = pipe; conn_info = ndev->hci_dev->conn_info; if (!conn_info) return -EPROTO; i = 0; skb = nci_skb_alloc(ndev, conn_info->max_pkt_payload_len + NCI_DATA_HDR_SIZE, GFP_ATOMIC); if (!skb) return -ENOMEM; skb_reserve(skb, NCI_DATA_HDR_SIZE + 2); *(u8 *)skb_push(skb, 1) = data_type; do { /* If last packet add NCI_HFP_NO_CHAINING */ if (i + conn_info->max_pkt_payload_len - (skb->len + 1) >= data_len) { cb |= NCI_HFP_NO_CHAINING; len = data_len - i; } else { len = conn_info->max_pkt_payload_len - skb->len - 1; } *(u8 *)skb_push(skb, 1) = cb; if (len > 0) skb_put_data(skb, data + i, len); r = nci_send_data(ndev, conn_info->conn_id, skb); if (r < 0) return r; i += len; if (i < data_len) { skb = nci_skb_alloc(ndev, conn_info->max_pkt_payload_len + NCI_DATA_HDR_SIZE, GFP_ATOMIC); if (!skb) return -ENOMEM; skb_reserve(skb, NCI_DATA_HDR_SIZE + 1); } } while (i < data_len); return i; } static void nci_hci_send_data_req(struct nci_dev *ndev, const void *opt) { const struct nci_data *data = opt; nci_hci_send_data(ndev, data->pipe, data->cmd, data->data, data->data_len); } int nci_hci_send_event(struct nci_dev *ndev, u8 gate, u8 event, const u8 *param, size_t param_len) { u8 pipe = ndev->hci_dev->gate2pipe[gate]; if (pipe == NCI_HCI_INVALID_PIPE) return -EADDRNOTAVAIL; return nci_hci_send_data(ndev, pipe, NCI_HCP_HEADER(NCI_HCI_HCP_EVENT, event), param, param_len); } EXPORT_SYMBOL(nci_hci_send_event); int nci_hci_send_cmd(struct nci_dev *ndev, u8 gate, u8 cmd, const u8 *param, size_t param_len, struct sk_buff **skb) { const struct nci_hcp_message *message; const struct nci_conn_info *conn_info; struct nci_data data; int r; u8 pipe = ndev->hci_dev->gate2pipe[gate]; if (pipe == NCI_HCI_INVALID_PIPE) return -EADDRNOTAVAIL; conn_info = ndev->hci_dev->conn_info; if (!conn_info) return -EPROTO; data.conn_id = conn_info->conn_id; data.pipe = pipe; data.cmd = NCI_HCP_HEADER(NCI_HCI_HCP_COMMAND, cmd); data.data = param; data.data_len = param_len; r = nci_request(ndev, nci_hci_send_data_req, &data, msecs_to_jiffies(NCI_DATA_TIMEOUT)); if (r == NCI_STATUS_OK) { message = (struct nci_hcp_message *)conn_info->rx_skb->data; r = nci_hci_result_to_errno( NCI_HCP_MSG_GET_CMD(message->header)); skb_pull(conn_info->rx_skb, NCI_HCI_HCP_MESSAGE_HEADER_LEN); if (!r && skb) *skb = conn_info->rx_skb; } return r; } EXPORT_SYMBOL(nci_hci_send_cmd); int nci_hci_clear_all_pipes(struct nci_dev *ndev) { int r; r = nci_hci_send_cmd(ndev, NCI_HCI_ADMIN_GATE, NCI_HCI_ADM_CLEAR_ALL_PIPE, NULL, 0, NULL); if (r < 0) return r; nci_hci_reset_pipes(ndev->hci_dev); return r; } EXPORT_SYMBOL(nci_hci_clear_all_pipes); static void nci_hci_event_received(struct nci_dev *ndev, u8 pipe, u8 event, struct sk_buff *skb) { if (ndev->ops->hci_event_received) ndev->ops->hci_event_received(ndev, pipe, event, skb); } static void nci_hci_cmd_received(struct nci_dev *ndev, u8 pipe, u8 cmd, struct sk_buff *skb) { u8 gate = ndev->hci_dev->pipes[pipe].gate; u8 status = NCI_HCI_ANY_OK | ~NCI_HCI_FRAGMENT; u8 dest_gate, new_pipe; struct nci_hci_create_pipe_resp *create_info; struct nci_hci_delete_pipe_noti *delete_info; struct nci_hci_all_pipe_cleared_noti *cleared_info; pr_debug("from gate %x pipe %x cmd %x\n", gate, pipe, cmd); switch (cmd) { case NCI_HCI_ADM_NOTIFY_PIPE_CREATED: if (skb->len != 5) { status = NCI_HCI_ANY_E_NOK; goto exit; } create_info = (struct nci_hci_create_pipe_resp *)skb->data; dest_gate = create_info->dest_gate; new_pipe = create_info->pipe; if (new_pipe >= NCI_HCI_MAX_PIPES) { status = NCI_HCI_ANY_E_NOK; goto exit; } /* Save the new created pipe and bind with local gate, * the description for skb->data[3] is destination gate id * but since we received this cmd from host controller, we * are the destination and it is our local gate */ ndev->hci_dev->gate2pipe[dest_gate] = new_pipe; ndev->hci_dev->pipes[new_pipe].gate = dest_gate; ndev->hci_dev->pipes[new_pipe].host = create_info->src_host; break; case NCI_HCI_ANY_OPEN_PIPE: /* If the pipe is not created report an error */ if (gate == NCI_HCI_INVALID_GATE) { status = NCI_HCI_ANY_E_NOK; goto exit; } break; case NCI_HCI_ADM_NOTIFY_PIPE_DELETED: if (skb->len != 1) { status = NCI_HCI_ANY_E_NOK; goto exit; } delete_info = (struct nci_hci_delete_pipe_noti *)skb->data; if (delete_info->pipe >= NCI_HCI_MAX_PIPES) { status = NCI_HCI_ANY_E_NOK; goto exit; } ndev->hci_dev->pipes[delete_info->pipe].gate = NCI_HCI_INVALID_GATE; ndev->hci_dev->pipes[delete_info->pipe].host = NCI_HCI_INVALID_HOST; break; case NCI_HCI_ADM_NOTIFY_ALL_PIPE_CLEARED: if (skb->len != 1) { status = NCI_HCI_ANY_E_NOK; goto exit; } cleared_info = (struct nci_hci_all_pipe_cleared_noti *)skb->data; nci_hci_reset_pipes_per_host(ndev, cleared_info->host); break; default: pr_debug("Discarded unknown cmd %x to gate %x\n", cmd, gate); break; } if (ndev->ops->hci_cmd_received) ndev->ops->hci_cmd_received(ndev, pipe, cmd, skb); exit: nci_hci_send_data(ndev, pipe, status, NULL, 0); kfree_skb(skb); } static void nci_hci_resp_received(struct nci_dev *ndev, u8 pipe, struct sk_buff *skb) { struct nci_conn_info *conn_info; conn_info = ndev->hci_dev->conn_info; if (!conn_info) goto exit; conn_info->rx_skb = skb; exit: nci_req_complete(ndev, NCI_STATUS_OK); } /* Receive hcp message for pipe, with type and cmd. * skb contains optional message data only. */ static void nci_hci_hcp_message_rx(struct nci_dev *ndev, u8 pipe, u8 type, u8 instruction, struct sk_buff *skb) { switch (type) { case NCI_HCI_HCP_RESPONSE: nci_hci_resp_received(ndev, pipe, skb); break; case NCI_HCI_HCP_COMMAND: nci_hci_cmd_received(ndev, pipe, instruction, skb); break; case NCI_HCI_HCP_EVENT: nci_hci_event_received(ndev, pipe, instruction, skb); break; default: pr_err("UNKNOWN MSG Type %d, instruction=%d\n", type, instruction); kfree_skb(skb); break; } nci_req_complete(ndev, NCI_STATUS_OK); } static void nci_hci_msg_rx_work(struct work_struct *work) { struct nci_hci_dev *hdev = container_of(work, struct nci_hci_dev, msg_rx_work); struct sk_buff *skb; const struct nci_hcp_message *message; u8 pipe, type, instruction; for (; (skb = skb_dequeue(&hdev->msg_rx_queue)); kcov_remote_stop()) { kcov_remote_start_common(skb_get_kcov_handle(skb)); pipe = NCI_HCP_MSG_GET_PIPE(skb->data[0]); skb_pull(skb, NCI_HCI_HCP_PACKET_HEADER_LEN); message = (struct nci_hcp_message *)skb->data; type = NCI_HCP_MSG_GET_TYPE(message->header); instruction = NCI_HCP_MSG_GET_CMD(message->header); skb_pull(skb, NCI_HCI_HCP_MESSAGE_HEADER_LEN); nci_hci_hcp_message_rx(hdev->ndev, pipe, type, instruction, skb); } } void nci_hci_data_received_cb(void *context, struct sk_buff *skb, int err) { struct nci_dev *ndev = (struct nci_dev *)context; struct nci_hcp_packet *packet; u8 pipe, type; struct sk_buff *hcp_skb; struct sk_buff *frag_skb; int msg_len; if (err) { nci_req_complete(ndev, err); return; } packet = (struct nci_hcp_packet *)skb->data; if ((packet->header & ~NCI_HCI_FRAGMENT) == 0) { skb_queue_tail(&ndev->hci_dev->rx_hcp_frags, skb); return; } /* it's the last fragment. Does it need re-aggregation? */ if (skb_queue_len(&ndev->hci_dev->rx_hcp_frags)) { pipe = NCI_HCP_MSG_GET_PIPE(packet->header); skb_queue_tail(&ndev->hci_dev->rx_hcp_frags, skb); msg_len = 0; skb_queue_walk(&ndev->hci_dev->rx_hcp_frags, frag_skb) { msg_len += (frag_skb->len - NCI_HCI_HCP_PACKET_HEADER_LEN); } hcp_skb = nfc_alloc_recv_skb(NCI_HCI_HCP_PACKET_HEADER_LEN + msg_len, GFP_KERNEL); if (!hcp_skb) { nci_req_complete(ndev, -ENOMEM); return; } skb_put_u8(hcp_skb, pipe); skb_queue_walk(&ndev->hci_dev->rx_hcp_frags, frag_skb) { msg_len = frag_skb->len - NCI_HCI_HCP_PACKET_HEADER_LEN; skb_put_data(hcp_skb, frag_skb->data + NCI_HCI_HCP_PACKET_HEADER_LEN, msg_len); } skb_queue_purge(&ndev->hci_dev->rx_hcp_frags); } else { packet->header &= NCI_HCI_FRAGMENT; hcp_skb = skb; } /* if this is a response, dispatch immediately to * unblock waiting cmd context. Otherwise, enqueue to dispatch * in separate context where handler can also execute command. */ packet = (struct nci_hcp_packet *)hcp_skb->data; type = NCI_HCP_MSG_GET_TYPE(packet->message.header); if (type == NCI_HCI_HCP_RESPONSE) { pipe = NCI_HCP_MSG_GET_PIPE(packet->header); skb_pull(hcp_skb, NCI_HCI_HCP_PACKET_HEADER_LEN); nci_hci_hcp_message_rx(ndev, pipe, type, NCI_STATUS_OK, hcp_skb); } else { skb_queue_tail(&ndev->hci_dev->msg_rx_queue, hcp_skb); schedule_work(&ndev->hci_dev->msg_rx_work); } } int nci_hci_open_pipe(struct nci_dev *ndev, u8 pipe) { struct nci_data data; const struct nci_conn_info *conn_info; conn_info = ndev->hci_dev->conn_info; if (!conn_info) return -EPROTO; data.conn_id = conn_info->conn_id; data.pipe = pipe; data.cmd = NCI_HCP_HEADER(NCI_HCI_HCP_COMMAND, NCI_HCI_ANY_OPEN_PIPE); data.data = NULL; data.data_len = 0; return nci_request(ndev, nci_hci_send_data_req, &data, msecs_to_jiffies(NCI_DATA_TIMEOUT)); } EXPORT_SYMBOL(nci_hci_open_pipe); static u8 nci_hci_create_pipe(struct nci_dev *ndev, u8 dest_host, u8 dest_gate, int *result) { u8 pipe; struct sk_buff *skb; struct nci_hci_create_pipe_params params; const struct nci_hci_create_pipe_resp *resp; pr_debug("gate=%d\n", dest_gate); params.src_gate = NCI_HCI_ADMIN_GATE; params.dest_host = dest_host; params.dest_gate = dest_gate; *result = nci_hci_send_cmd(ndev, NCI_HCI_ADMIN_GATE, NCI_HCI_ADM_CREATE_PIPE, (u8 *)¶ms, sizeof(params), &skb); if (*result < 0) return NCI_HCI_INVALID_PIPE; resp = (struct nci_hci_create_pipe_resp *)skb->data; pipe = resp->pipe; kfree_skb(skb); pr_debug("pipe created=%d\n", pipe); if (pipe >= NCI_HCI_MAX_PIPES) pipe = NCI_HCI_INVALID_PIPE; return pipe; } static int nci_hci_delete_pipe(struct nci_dev *ndev, u8 pipe) { return nci_hci_send_cmd(ndev, NCI_HCI_ADMIN_GATE, NCI_HCI_ADM_DELETE_PIPE, &pipe, 1, NULL); } int nci_hci_set_param(struct nci_dev *ndev, u8 gate, u8 idx, const u8 *param, size_t param_len) { const struct nci_hcp_message *message; const struct nci_conn_info *conn_info; struct nci_data data; int r; u8 *tmp; u8 pipe = ndev->hci_dev->gate2pipe[gate]; pr_debug("idx=%d to gate %d\n", idx, gate); if (pipe == NCI_HCI_INVALID_PIPE) return -EADDRNOTAVAIL; conn_info = ndev->hci_dev->conn_info; if (!conn_info) return -EPROTO; tmp = kmalloc(1 + param_len, GFP_KERNEL); if (!tmp) return -ENOMEM; *tmp = idx; memcpy(tmp + 1, param, param_len); data.conn_id = conn_info->conn_id; data.pipe = pipe; data.cmd = NCI_HCP_HEADER(NCI_HCI_HCP_COMMAND, NCI_HCI_ANY_SET_PARAMETER); data.data = tmp; data.data_len = param_len + 1; r = nci_request(ndev, nci_hci_send_data_req, &data, msecs_to_jiffies(NCI_DATA_TIMEOUT)); if (r == NCI_STATUS_OK) { message = (struct nci_hcp_message *)conn_info->rx_skb->data; r = nci_hci_result_to_errno( NCI_HCP_MSG_GET_CMD(message->header)); skb_pull(conn_info->rx_skb, NCI_HCI_HCP_MESSAGE_HEADER_LEN); } kfree(tmp); return r; } EXPORT_SYMBOL(nci_hci_set_param); int nci_hci_get_param(struct nci_dev *ndev, u8 gate, u8 idx, struct sk_buff **skb) { const struct nci_hcp_message *message; const struct nci_conn_info *conn_info; struct nci_data data; int r; u8 pipe = ndev->hci_dev->gate2pipe[gate]; pr_debug("idx=%d to gate %d\n", idx, gate); if (pipe == NCI_HCI_INVALID_PIPE) return -EADDRNOTAVAIL; conn_info = ndev->hci_dev->conn_info; if (!conn_info) return -EPROTO; data.conn_id = conn_info->conn_id; data.pipe = pipe; data.cmd = NCI_HCP_HEADER(NCI_HCI_HCP_COMMAND, NCI_HCI_ANY_GET_PARAMETER); data.data = &idx; data.data_len = 1; r = nci_request(ndev, nci_hci_send_data_req, &data, msecs_to_jiffies(NCI_DATA_TIMEOUT)); if (r == NCI_STATUS_OK) { message = (struct nci_hcp_message *)conn_info->rx_skb->data; r = nci_hci_result_to_errno( NCI_HCP_MSG_GET_CMD(message->header)); skb_pull(conn_info->rx_skb, NCI_HCI_HCP_MESSAGE_HEADER_LEN); if (!r && skb) *skb = conn_info->rx_skb; } return r; } EXPORT_SYMBOL(nci_hci_get_param); int nci_hci_connect_gate(struct nci_dev *ndev, u8 dest_host, u8 dest_gate, u8 pipe) { bool pipe_created = false; int r; if (pipe == NCI_HCI_DO_NOT_OPEN_PIPE) return 0; if (ndev->hci_dev->gate2pipe[dest_gate] != NCI_HCI_INVALID_PIPE) return -EADDRINUSE; if (pipe != NCI_HCI_INVALID_PIPE) goto open_pipe; switch (dest_gate) { case NCI_HCI_LINK_MGMT_GATE: pipe = NCI_HCI_LINK_MGMT_PIPE; break; case NCI_HCI_ADMIN_GATE: pipe = NCI_HCI_ADMIN_PIPE; break; default: pipe = nci_hci_create_pipe(ndev, dest_host, dest_gate, &r); if (pipe == NCI_HCI_INVALID_PIPE) return r; pipe_created = true; break; } open_pipe: r = nci_hci_open_pipe(ndev, pipe); if (r < 0) { if (pipe_created) { if (nci_hci_delete_pipe(ndev, pipe) < 0) { /* TODO: Cannot clean by deleting pipe... * -> inconsistent state */ } } return r; } ndev->hci_dev->pipes[pipe].gate = dest_gate; ndev->hci_dev->pipes[pipe].host = dest_host; ndev->hci_dev->gate2pipe[dest_gate] = pipe; return 0; } EXPORT_SYMBOL(nci_hci_connect_gate); static int nci_hci_dev_connect_gates(struct nci_dev *ndev, u8 gate_count, const struct nci_hci_gate *gates) { int r; while (gate_count--) { r = nci_hci_connect_gate(ndev, gates->dest_host, gates->gate, gates->pipe); if (r < 0) return r; gates++; } return 0; } int nci_hci_dev_session_init(struct nci_dev *ndev) { struct nci_conn_info *conn_info; struct sk_buff *skb; int r; ndev->hci_dev->count_pipes = 0; ndev->hci_dev->expected_pipes = 0; conn_info = ndev->hci_dev->conn_info; if (!conn_info) return -EPROTO; conn_info->data_exchange_cb = nci_hci_data_received_cb; conn_info->data_exchange_cb_context = ndev; nci_hci_reset_pipes(ndev->hci_dev); if (ndev->hci_dev->init_data.gates[0].gate != NCI_HCI_ADMIN_GATE) return -EPROTO; r = nci_hci_connect_gate(ndev, ndev->hci_dev->init_data.gates[0].dest_host, ndev->hci_dev->init_data.gates[0].gate, ndev->hci_dev->init_data.gates[0].pipe); if (r < 0) return r; r = nci_hci_get_param(ndev, NCI_HCI_ADMIN_GATE, NCI_HCI_ADMIN_PARAM_SESSION_IDENTITY, &skb); if (r < 0) return r; if (skb->len && skb->len == strlen(ndev->hci_dev->init_data.session_id) && !memcmp(ndev->hci_dev->init_data.session_id, skb->data, skb->len) && ndev->ops->hci_load_session) { /* Restore gate<->pipe table from some proprietary location. */ r = ndev->ops->hci_load_session(ndev); } else { r = nci_hci_clear_all_pipes(ndev); if (r < 0) goto exit; r = nci_hci_dev_connect_gates(ndev, ndev->hci_dev->init_data.gate_count, ndev->hci_dev->init_data.gates); if (r < 0) goto exit; r = nci_hci_set_param(ndev, NCI_HCI_ADMIN_GATE, NCI_HCI_ADMIN_PARAM_SESSION_IDENTITY, ndev->hci_dev->init_data.session_id, strlen(ndev->hci_dev->init_data.session_id)); } exit: kfree_skb(skb); return r; } EXPORT_SYMBOL(nci_hci_dev_session_init); struct nci_hci_dev *nci_hci_allocate(struct nci_dev *ndev) { struct nci_hci_dev *hdev; hdev = kzalloc_obj(*hdev); if (!hdev) return NULL; skb_queue_head_init(&hdev->rx_hcp_frags); INIT_WORK(&hdev->msg_rx_work, nci_hci_msg_rx_work); skb_queue_head_init(&hdev->msg_rx_queue); hdev->ndev = ndev; return hdev; } void nci_hci_deallocate(struct nci_dev *ndev) { kfree(ndev->hci_dev); } |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 | /* SPDX-License-Identifier: GPL-2.0 */ /* * fs/f2fs/f2fs.h * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ */ #ifndef _LINUX_F2FS_H #define _LINUX_F2FS_H #include <linux/uio.h> #include <linux/types.h> #include <linux/mmzone.h> #include <linux/page-flags.h> #include <linux/slab.h> #include <linux/crc32.h> #include <linux/magic.h> #include <linux/kobject.h> #include <linux/sched.h> #include <linux/cred.h> #include <linux/sched/mm.h> #include <linux/vmalloc.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/quotaops.h> #include <linux/part_stat.h> #include <linux/rw_hint.h> #include <linux/fscrypt.h> #include <linux/fsverity.h> #ifdef CONFIG_F2FS_CHECK_FS #define f2fs_bug_on(sbi, condition) BUG_ON(condition) #else #define f2fs_bug_on(sbi, condition) \ do { \ if (WARN_ON(condition)) \ set_sbi_flag(sbi, SBI_NEED_FSCK); \ } while (0) #endif enum { FAULT_KMALLOC, FAULT_KVMALLOC, FAULT_PAGE_ALLOC, FAULT_PAGE_GET, FAULT_ALLOC_BIO, /* it's obsolete due to bio_alloc() will never fail */ FAULT_ALLOC_NID, FAULT_ORPHAN, FAULT_BLOCK, FAULT_DIR_DEPTH, FAULT_EVICT_INODE, FAULT_TRUNCATE, FAULT_READ_IO, FAULT_CHECKPOINT, FAULT_DISCARD, /* it's obsolete due to __blkdev_issue_discard() will never fail */ FAULT_WRITE_IO, FAULT_SLAB_ALLOC, FAULT_DQUOT_INIT, FAULT_LOCK_OP, FAULT_BLKADDR_VALIDITY, FAULT_BLKADDR_CONSISTENCE, FAULT_NO_SEGMENT, FAULT_INCONSISTENT_FOOTER, FAULT_ATOMIC_TIMEOUT, FAULT_VMALLOC, FAULT_LOCK_TIMEOUT, FAULT_SKIP_WRITE, FAULT_MAX, }; /* indicate which option to update */ enum fault_option { FAULT_RATE = 1, /* only update fault rate */ FAULT_TYPE = 2, /* only update fault type */ FAULT_TIMEOUT = 4, /* only update fault timeout type */ FAULT_ALL = 8, /* reset all fault injection options/stats */ }; #ifdef CONFIG_F2FS_FAULT_INJECTION struct f2fs_fault_info { atomic_t inject_ops; int inject_rate; unsigned int inject_type; /* Used to account total count of injection for each type */ unsigned int inject_count[FAULT_MAX]; unsigned int inject_lock_timeout; /* inject lock timeout */ }; extern const char *f2fs_fault_name[FAULT_MAX]; #define IS_FAULT_SET(fi, type) ((fi)->inject_type & BIT(type)) /* maximum retry count for injected failure */ #define DEFAULT_FAILURE_RETRY_COUNT 8 #else #define DEFAULT_FAILURE_RETRY_COUNT 1 #endif /* * For mount options */ enum f2fs_mount_opt { F2FS_MOUNT_DISABLE_ROLL_FORWARD, F2FS_MOUNT_DISCARD, F2FS_MOUNT_NOHEAP, F2FS_MOUNT_XATTR_USER, F2FS_MOUNT_POSIX_ACL, F2FS_MOUNT_DISABLE_EXT_IDENTIFY, F2FS_MOUNT_INLINE_XATTR, F2FS_MOUNT_INLINE_DATA, F2FS_MOUNT_INLINE_DENTRY, F2FS_MOUNT_FLUSH_MERGE, F2FS_MOUNT_NOBARRIER, F2FS_MOUNT_FASTBOOT, F2FS_MOUNT_READ_EXTENT_CACHE, F2FS_MOUNT_DATA_FLUSH, F2FS_MOUNT_FAULT_INJECTION, F2FS_MOUNT_USRQUOTA, F2FS_MOUNT_GRPQUOTA, F2FS_MOUNT_PRJQUOTA, F2FS_MOUNT_QUOTA, F2FS_MOUNT_INLINE_XATTR_SIZE, F2FS_MOUNT_RESERVE_ROOT, F2FS_MOUNT_DISABLE_CHECKPOINT, F2FS_MOUNT_NORECOVERY, F2FS_MOUNT_ATGC, F2FS_MOUNT_MERGE_CHECKPOINT, F2FS_MOUNT_GC_MERGE, F2FS_MOUNT_COMPRESS_CACHE, F2FS_MOUNT_AGE_EXTENT_CACHE, F2FS_MOUNT_NAT_BITS, F2FS_MOUNT_INLINECRYPT, /* * Some f2fs environments expect to be able to pass the "lazytime" option * string rather than using the MS_LAZYTIME flag, so this must remain. */ F2FS_MOUNT_LAZYTIME, F2FS_MOUNT_RESERVE_NODE, }; #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) \ (F2FS_OPTION(sbi).opt &= ~BIT(F2FS_MOUNT_##option)) #define set_opt(sbi, option) \ (F2FS_OPTION(sbi).opt |= BIT(F2FS_MOUNT_##option)) #define test_opt(sbi, option) \ (F2FS_OPTION(sbi).opt & BIT(F2FS_MOUNT_##option)) #define ver_after(a, b) (typecheck(unsigned long long, a) && \ typecheck(unsigned long long, b) && \ ((long long)((a) - (b)) > 0)) typedef u32 block_t; /* * should not change u32, since it is the on-disk block * address format, __le32. */ typedef u32 nid_t; #define COMPRESS_EXT_NUM 16 enum blkzone_allocation_policy { BLKZONE_ALLOC_PRIOR_SEQ, /* Prioritize writing to sequential zones */ BLKZONE_ALLOC_ONLY_SEQ, /* Only allow writing to sequential zones */ BLKZONE_ALLOC_PRIOR_CONV, /* Prioritize writing to conventional zones */ }; enum bggc_io_aware_policy { AWARE_ALL_IO, /* skip background GC if there is any kind of pending IO */ AWARE_READ_IO, /* skip background GC if there is pending read IO */ AWARE_NONE, /* don't aware IO for background GC */ }; enum device_allocation_policy { ALLOCATE_FORWARD_NOHINT, ALLOCATE_FORWARD_WITHIN_HINT, ALLOCATE_FORWARD_FROM_HINT, }; enum f2fs_lock_name { LOCK_NAME_NONE, LOCK_NAME_CP_RWSEM, LOCK_NAME_NODE_CHANGE, LOCK_NAME_NODE_WRITE, LOCK_NAME_GC_LOCK, LOCK_NAME_CP_GLOBAL, LOCK_NAME_IO_RWSEM, LOCK_NAME_MAX, }; enum f2fs_timeout_type { TIMEOUT_TYPE_NONE, TIMEOUT_TYPE_RUNNING, TIMEOUT_TYPE_IO_SLEEP, TIMEOUT_TYPE_NONIO_SLEEP, TIMEOUT_TYPE_RUNNABLE, TIMEOUT_TYPE_MAX, }; /* * An implementation of an rwsem that is explicitly unfair to readers. This * prevents priority inversion when a low-priority reader acquires the read lock * while sleeping on the write lock but the write lock is needed by * higher-priority clients. */ struct f2fs_rwsem { struct f2fs_sb_info *sbi; enum f2fs_lock_name name; struct rw_semaphore internal_rwsem; #ifdef CONFIG_F2FS_UNFAIR_RWSEM wait_queue_head_t read_waiters; #endif }; struct f2fs_mount_info { unsigned long long opt; block_t root_reserved_blocks; /* root reserved blocks */ block_t root_reserved_nodes; /* root reserved nodes */ kuid_t s_resuid; /* reserved blocks for uid */ kgid_t s_resgid; /* reserved blocks for gid */ int active_logs; /* # of active logs */ int inline_xattr_size; /* inline xattr size */ #ifdef CONFIG_F2FS_FAULT_INJECTION struct f2fs_fault_info fault_info; /* For fault injection */ #endif #ifdef CONFIG_QUOTA /* Names of quota files with journalled quota */ char *s_qf_names[MAXQUOTAS]; int s_jquota_fmt; /* Format of quota to use */ #endif /* For which write hints are passed down to block layer */ int alloc_mode; /* segment allocation policy */ int fsync_mode; /* fsync policy */ int fs_mode; /* fs mode: LFS or ADAPTIVE */ int bggc_mode; /* bggc mode: off, on or sync */ int memory_mode; /* memory mode */ int errors; /* errors parameter */ int discard_unit; /* * discard command's offset/size should * be aligned to this unit: block, * segment or section */ struct fscrypt_dummy_policy dummy_enc_policy; /* test dummy encryption */ block_t unusable_cap_perc; /* percentage for cap */ block_t unusable_cap; /* Amount of space allowed to be * unusable when disabling checkpoint */ /* For compression */ unsigned char compress_algorithm; /* algorithm type */ unsigned char compress_log_size; /* cluster log size */ unsigned char compress_level; /* compress level */ bool compress_chksum; /* compressed data chksum */ unsigned char compress_ext_cnt; /* extension count */ unsigned char nocompress_ext_cnt; /* nocompress extension count */ int compress_mode; /* compression mode */ unsigned char extensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */ unsigned char noextensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */ unsigned int lookup_mode; }; #define F2FS_FEATURE_ENCRYPT 0x00000001 #define F2FS_FEATURE_BLKZONED 0x00000002 #define F2FS_FEATURE_ATOMIC_WRITE 0x00000004 #define F2FS_FEATURE_EXTRA_ATTR 0x00000008 #define F2FS_FEATURE_PRJQUOTA 0x00000010 #define F2FS_FEATURE_INODE_CHKSUM 0x00000020 #define F2FS_FEATURE_FLEXIBLE_INLINE_XATTR 0x00000040 #define F2FS_FEATURE_QUOTA_INO 0x00000080 #define F2FS_FEATURE_INODE_CRTIME 0x00000100 #define F2FS_FEATURE_LOST_FOUND 0x00000200 #define F2FS_FEATURE_VERITY 0x00000400 #define F2FS_FEATURE_SB_CHKSUM 0x00000800 #define F2FS_FEATURE_CASEFOLD 0x00001000 #define F2FS_FEATURE_COMPRESSION 0x00002000 #define F2FS_FEATURE_RO 0x00004000 #define F2FS_FEATURE_DEVICE_ALIAS 0x00008000 #define F2FS_FEATURE_PACKED_SSA 0x00010000 #define __F2FS_HAS_FEATURE(raw_super, mask) \ ((raw_super->feature & cpu_to_le32(mask)) != 0) #define F2FS_HAS_FEATURE(sbi, mask) __F2FS_HAS_FEATURE(sbi->raw_super, mask) /* * Default values for user and/or group using reserved blocks */ #define F2FS_DEF_RESUID 0 #define F2FS_DEF_RESGID 0 /* * For checkpoint manager */ enum { NAT_BITMAP, SIT_BITMAP }; #define CP_UMOUNT 0x00000001 #define CP_FASTBOOT 0x00000002 #define CP_SYNC 0x00000004 #define CP_RECOVERY 0x00000008 #define CP_DISCARD 0x00000010 #define CP_TRIMMED 0x00000020 #define CP_PAUSE 0x00000040 #define CP_RESIZE 0x00000080 #define DEF_MAX_DISCARD_REQUEST 8 /* issue 8 discards per round */ #define DEF_MIN_DISCARD_ISSUE_TIME 50 /* 50 ms, if exists */ #define DEF_MID_DISCARD_ISSUE_TIME 500 /* 500 ms, if device busy */ #define DEF_MAX_DISCARD_ISSUE_TIME 60000 /* 60 s, if no candidates */ #define DEF_DISCARD_URGENT_UTIL 80 /* do more discard over 80% */ #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ #define DEF_DISABLE_INTERVAL 5 /* 5 secs */ #define DEF_DISABLE_QUICK_INTERVAL 1 /* 1 secs */ #define DEF_UMOUNT_DISCARD_TIMEOUT 5 /* 5 secs */ enum cp_time { CP_TIME_START, /* begin */ CP_TIME_LOCK, /* after cp_global_sem */ CP_TIME_OP_LOCK, /* after block_operation */ CP_TIME_MERGE_WRITE, /* after flush DATA/NODE/META */ CP_TIME_FLUSH_NAT, /* after flush nat */ CP_TIME_FLUSH_SIT, /* after flush sit */ CP_TIME_SYNC_META, /* after sync_meta_pages */ CP_TIME_SYNC_CP_META, /* after sync cp meta pages */ CP_TIME_WAIT_DIRTY_META,/* after wait on dirty meta */ CP_TIME_WAIT_CP_DATA, /* after wait on cp data */ CP_TIME_FLUSH_DEVICE, /* after flush device cache */ CP_TIME_WAIT_LAST_CP, /* after wait on last cp pack */ CP_TIME_END, /* after unblock_operation */ CP_TIME_MAX, }; /* time cost stats of checkpoint */ struct cp_stats { ktime_t times[CP_TIME_MAX]; }; struct cp_control { int reason; __u64 trim_start; __u64 trim_end; __u64 trim_minlen; struct cp_stats stats; }; enum f2fs_cp_phase { CP_PHASE_START_BLOCK_OPS, CP_PHASE_FINISH_BLOCK_OPS, CP_PHASE_FINISH_CHECKPOINT, }; /* * indicate meta/data type */ enum { META_CP, META_NAT, META_SIT, META_SSA, META_MAX, META_POR, DATA_GENERIC, /* check range only */ DATA_GENERIC_ENHANCE, /* strong check on range and segment bitmap */ DATA_GENERIC_ENHANCE_READ, /* * strong check on range and segment * bitmap but no warning due to race * condition of read on truncated area * by extent_cache */ DATA_GENERIC_ENHANCE_UPDATE, /* * strong check on range and segment * bitmap for update case */ META_GENERIC, }; /* for the list of ino */ enum { ORPHAN_INO, /* for orphan ino list */ APPEND_INO, /* for append ino list */ UPDATE_INO, /* for update ino list */ TRANS_DIR_INO, /* for transactions dir ino list */ XATTR_DIR_INO, /* for xattr updated dir ino list */ FLUSH_INO, /* for multiple device flushing */ MAX_INO_ENTRY, /* max. list */ }; struct ino_entry { struct list_head list; /* list head */ nid_t ino; /* inode number */ unsigned int dirty_device; /* dirty device bitmap */ }; /* for the list of inodes to be GCed */ struct inode_entry { struct list_head list; /* list head */ struct inode *inode; /* vfs inode pointer */ }; struct fsync_node_entry { struct list_head list; /* list head */ struct folio *folio; /* warm node folio pointer */ unsigned int seq_id; /* sequence id */ }; struct ckpt_req { struct completion wait; /* completion for checkpoint done */ struct llist_node llnode; /* llist_node to be linked in wait queue */ int ret; /* return code of checkpoint */ union { ktime_t queue_time; /* request queued time */ ktime_t delta_time; /* time in queue */ }; }; struct ckpt_req_control { struct task_struct *f2fs_issue_ckpt; /* checkpoint task */ int ckpt_thread_ioprio; /* checkpoint merge thread ioprio */ wait_queue_head_t ckpt_wait_queue; /* waiting queue for wake-up */ atomic_t issued_ckpt; /* # of actually issued ckpts */ atomic_t total_ckpt; /* # of total ckpts */ atomic_t queued_ckpt; /* # of queued ckpts */ struct llist_head issue_list; /* list for command issue */ spinlock_t stat_lock; /* lock for below checkpoint time stats */ unsigned int cur_time; /* cur wait time in msec for currently issued checkpoint */ unsigned int peak_time; /* peak wait time in msec until now */ }; /* a time threshold that checkpoint was blocked for, unit: ms */ #define CP_LONG_LATENCY_THRESHOLD 5000 /* for the bitmap indicate blocks to be discarded */ struct discard_entry { struct list_head list; /* list head */ block_t start_blkaddr; /* start blockaddr of current segment */ unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */ }; /* minimum discard granularity, unit: block count */ #define MIN_DISCARD_GRANULARITY 1 /* default discard granularity of inner discard thread, unit: block count */ #define DEFAULT_DISCARD_GRANULARITY 16 /* default maximum discard granularity of ordered discard, unit: block count */ #define DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY 16 /* default interval of periodical discard submission */ #define DEFAULT_DISCARD_INTERVAL (msecs_to_jiffies(20)) /* max discard pend list number */ #define MAX_PLIST_NUM 512 #define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? \ (MAX_PLIST_NUM - 1) : ((blk_num) - 1)) enum { D_PREP, /* initial */ D_PARTIAL, /* partially submitted */ D_SUBMIT, /* all submitted */ D_DONE, /* finished */ }; struct discard_info { block_t lstart; /* logical start address */ block_t len; /* length */ block_t start; /* actual start address in dev */ }; struct discard_cmd { struct rb_node rb_node; /* rb node located in rb-tree */ struct discard_info di; /* discard info */ struct list_head list; /* command list */ struct completion wait; /* completion */ struct block_device *bdev; /* bdev */ unsigned short ref; /* reference count */ unsigned char state; /* state */ unsigned char queued; /* queued discard */ int error; /* bio error */ spinlock_t lock; /* for state/bio_ref updating */ unsigned short bio_ref; /* bio reference count */ }; enum { DPOLICY_BG, DPOLICY_FORCE, DPOLICY_FSTRIM, DPOLICY_UMOUNT, MAX_DPOLICY, }; enum { DPOLICY_IO_AWARE_DISABLE, /* force to not be aware of IO */ DPOLICY_IO_AWARE_ENABLE, /* force to be aware of IO */ DPOLICY_IO_AWARE_MAX, }; struct discard_policy { int type; /* type of discard */ unsigned int min_interval; /* used for candidates exist */ unsigned int mid_interval; /* used for device busy */ unsigned int max_interval; /* used for candidates not exist */ unsigned int max_requests; /* # of discards issued per round */ unsigned int io_aware_gran; /* minimum granularity discard not be aware of I/O */ bool io_aware; /* issue discard in idle time */ bool sync; /* submit discard with REQ_SYNC flag */ bool ordered; /* issue discard by lba order */ bool timeout; /* discard timeout for put_super */ unsigned int granularity; /* discard granularity */ }; struct discard_cmd_control { struct task_struct *f2fs_issue_discard; /* discard thread */ struct list_head entry_list; /* 4KB discard entry list */ struct list_head pend_list[MAX_PLIST_NUM];/* store pending entries */ struct list_head wait_list; /* store on-flushing entries */ struct list_head fstrim_list; /* in-flight discard from fstrim */ wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ struct mutex cmd_lock; unsigned int nr_discards; /* # of discards in the list */ unsigned int max_discards; /* max. discards to be issued */ unsigned int max_discard_request; /* max. discard request per round */ unsigned int min_discard_issue_time; /* min. interval between discard issue */ unsigned int mid_discard_issue_time; /* mid. interval between discard issue */ unsigned int max_discard_issue_time; /* max. interval between discard issue */ unsigned int discard_io_aware_gran; /* minimum discard granularity not be aware of I/O */ unsigned int discard_urgent_util; /* utilization which issue discard proactively */ unsigned int discard_granularity; /* discard granularity */ unsigned int max_ordered_discard; /* maximum discard granularity issued by lba order */ unsigned int discard_io_aware; /* io_aware policy */ unsigned int undiscard_blks; /* # of undiscard blocks */ unsigned int next_pos; /* next discard position */ atomic_t issued_discard; /* # of issued discard */ atomic_t queued_discard; /* # of queued discard */ atomic_t discard_cmd_cnt; /* # of cached cmd count */ struct rb_root_cached root; /* root of discard rb-tree */ bool rbtree_check; /* config for consistence check */ bool discard_wake; /* to wake up discard thread */ }; /* for the list of fsync inodes, used only during recovery */ struct fsync_inode_entry { struct list_head list; /* list head */ struct inode *inode; /* vfs inode pointer */ block_t blkaddr; /* block address locating the last fsync */ block_t last_dentry; /* block address locating the last dentry */ }; #define nats_in_cursum(jnl) (le16_to_cpu((jnl)->n_nats)) #define sits_in_cursum(jnl) (le16_to_cpu((jnl)->n_sits)) #define nat_in_journal(jnl, i) \ (((struct nat_journal_entry *)(jnl)->nat_j.entries)[i].ne) #define nid_in_journal(jnl, i) \ (((struct nat_journal_entry *)(jnl)->nat_j.entries)[i].nid) #define sit_in_journal(jnl, i) \ (((struct sit_journal_entry *)(jnl)->sit_j.entries)[i].se) #define segno_in_journal(jnl, i) \ (((struct sit_journal_entry *)(jnl)->sit_j.entries)[i].segno) #define sum_entries(sum) ((struct f2fs_summary *)(sum)) #define sum_journal(sbi, sum) \ ((struct f2fs_journal *)((char *)(sum) + \ ((sbi)->entries_in_sum * sizeof(struct f2fs_summary)))) #define sum_footer(sbi, sum) \ ((struct summary_footer *)((char *)(sum) + (sbi)->sum_blocksize - \ sizeof(struct summary_footer))) #define MAX_NAT_JENTRIES(sbi, jnl) ((sbi)->nat_journal_entries - nats_in_cursum(jnl)) #define MAX_SIT_JENTRIES(sbi, jnl) ((sbi)->sit_journal_entries - sits_in_cursum(jnl)) static inline int update_nats_in_cursum(struct f2fs_journal *journal, int i) { int before = nats_in_cursum(journal); journal->n_nats = cpu_to_le16(before + i); return before; } static inline int update_sits_in_cursum(struct f2fs_journal *journal, int i) { int before = sits_in_cursum(journal); journal->n_sits = cpu_to_le16(before + i); return before; } /* for inline stuff */ #define DEF_INLINE_RESERVED_SIZE 1 static inline int get_extra_isize(struct inode *inode); static inline int get_inline_xattr_addrs(struct inode *inode); #define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ (CUR_ADDRS_PER_INODE(inode) - \ get_inline_xattr_addrs(inode) - \ DEF_INLINE_RESERVED_SIZE)) /* for inline dir */ #define NR_INLINE_DENTRY(inode) (MAX_INLINE_DATA(inode) * BITS_PER_BYTE / \ ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ BITS_PER_BYTE + 1)) #define INLINE_DENTRY_BITMAP_SIZE(inode) \ DIV_ROUND_UP(NR_INLINE_DENTRY(inode), BITS_PER_BYTE) #define INLINE_RESERVED_SIZE(inode) (MAX_INLINE_DATA(inode) - \ ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ NR_INLINE_DENTRY(inode) + \ INLINE_DENTRY_BITMAP_SIZE(inode))) /* * For INODE and NODE manager */ /* for directory operations */ struct f2fs_filename { /* * The filename the user specified. This is NULL for some * filesystem-internal operations, e.g. converting an inline directory * to a non-inline one, or roll-forward recovering an encrypted dentry. */ const struct qstr *usr_fname; /* * The on-disk filename. For encrypted directories, this is encrypted. * This may be NULL for lookups in an encrypted dir without the key. */ struct fscrypt_str disk_name; /* The dirhash of this filename */ f2fs_hash_t hash; #ifdef CONFIG_FS_ENCRYPTION /* * For lookups in encrypted directories: either the buffer backing * disk_name, or a buffer that holds the decoded no-key name. */ struct fscrypt_str crypto_buf; #endif #if IS_ENABLED(CONFIG_UNICODE) /* * For casefolded directories: the casefolded name, but it's left NULL * if the original name is not valid Unicode, if the original name is * "." or "..", if the directory is both casefolded and encrypted and * its encryption key is unavailable, or if the filesystem is doing an * internal operation where usr_fname is also NULL. In all these cases * we fall back to treating the name as an opaque byte sequence. */ struct qstr cf_name; #endif }; struct f2fs_dentry_ptr { struct inode *inode; void *bitmap; struct f2fs_dir_entry *dentry; __u8 (*filename)[F2FS_SLOT_LEN]; int max; int nr_bitmap; }; static inline void make_dentry_ptr_block(struct inode *inode, struct f2fs_dentry_ptr *d, struct f2fs_dentry_block *t) { d->inode = inode; d->max = NR_DENTRY_IN_BLOCK; d->nr_bitmap = SIZE_OF_DENTRY_BITMAP; d->bitmap = t->dentry_bitmap; d->dentry = t->dentry; d->filename = t->filename; } static inline void make_dentry_ptr_inline(struct inode *inode, struct f2fs_dentry_ptr *d, void *t) { int entry_cnt = NR_INLINE_DENTRY(inode); int bitmap_size = INLINE_DENTRY_BITMAP_SIZE(inode); int reserved_size = INLINE_RESERVED_SIZE(inode); d->inode = inode; d->max = entry_cnt; d->nr_bitmap = bitmap_size; d->bitmap = t; d->dentry = t + bitmap_size + reserved_size; d->filename = t + bitmap_size + reserved_size + SIZE_OF_DIR_ENTRY * entry_cnt; } /* * XATTR_NODE_OFFSET stores xattrs to one node block per file keeping -1 * as its node offset to distinguish from index node blocks. * But some bits are used to mark the node block. */ #define XATTR_NODE_OFFSET ((((unsigned int)-1) << OFFSET_BIT_SHIFT) \ >> OFFSET_BIT_SHIFT) enum { ALLOC_NODE, /* allocate a new node page if needed */ LOOKUP_NODE, /* look up a node without readahead */ LOOKUP_NODE_RA, /* * look up a node with readahead called * by get_data_block. */ }; #define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO or flush count */ #define MAX_FLUSH_RETRY_COUNT 3 /* maximum flush retry count in f2fs_enable_checkpoint() */ /* IO/non-IO congestion wait timeout value, default: 1 jiffies */ #define DEFAULT_SCHEDULE_TIMEOUT 1 /* timeout value injected, default: 1000ms */ #define DEFAULT_FAULT_TIMEOUT (msecs_to_jiffies(1000)) /* maximum retry quota flush count */ #define DEFAULT_RETRY_QUOTA_FLUSH_COUNT 8 /* maximum retry of EIO'ed page */ #define MAX_RETRY_PAGE_EIO 100 #define F2FS_LINK_MAX 0xffffffff /* maximum link count per file */ #define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */ /* dirty segments threshold for triggering CP */ #define DEFAULT_DIRTY_THRESHOLD 4 #define RECOVERY_MAX_RA_BLOCKS BIO_MAX_VECS #define RECOVERY_MIN_RA_BLOCKS 1 #define F2FS_ONSTACK_PAGES 16 /* nr of onstack pages */ /* for in-memory extent cache entry */ #define F2FS_MIN_EXTENT_LEN 64 /* minimum extent length */ /* number of extent info in extent cache we try to shrink */ #define READ_EXTENT_CACHE_SHRINK_NUMBER 128 /* number of age extent info in extent cache we try to shrink */ #define AGE_EXTENT_CACHE_SHRINK_NUMBER 128 #define LAST_AGE_WEIGHT 30 #define SAME_AGE_REGION 1024 /* * Define data block with age less than 1GB as hot data * define data block with age less than 10GB but more than 1GB as warm data */ #define DEF_HOT_DATA_AGE_THRESHOLD 262144 #define DEF_WARM_DATA_AGE_THRESHOLD 2621440 /* default max read extent count per inode */ #define DEF_MAX_READ_EXTENT_COUNT 10240 /* extent cache type */ enum extent_type { EX_READ, EX_BLOCK_AGE, NR_EXTENT_CACHES, }; /* * Reserved value to mark invalid age extents, hence valid block range * from 0 to ULLONG_MAX-1 */ #define F2FS_EXTENT_AGE_INVALID ULLONG_MAX struct extent_info { unsigned int fofs; /* start offset in a file */ unsigned int len; /* length of the extent */ union { /* read extent_cache */ struct { /* start block address of the extent */ block_t blk; #ifdef CONFIG_F2FS_FS_COMPRESSION /* physical extent length of compressed blocks */ unsigned int c_len; #endif }; /* block age extent_cache */ struct { /* block age of the extent */ unsigned long long age; /* last total blocks allocated */ unsigned long long last_blocks; }; }; }; struct extent_node { struct rb_node rb_node; /* rb node located in rb-tree */ struct extent_info ei; /* extent info */ struct list_head list; /* node in global extent list of sbi */ struct extent_tree *et; /* extent tree pointer */ }; struct extent_tree { nid_t ino; /* inode number */ enum extent_type type; /* keep the extent tree type */ struct rb_root_cached root; /* root of extent info rb-tree */ struct extent_node *cached_en; /* recently accessed extent node */ struct list_head list; /* to be used by sbi->zombie_list */ rwlock_t lock; /* protect extent info rb-tree */ atomic_t node_cnt; /* # of extent node in rb-tree*/ bool largest_updated; /* largest extent updated */ struct extent_info largest; /* largest cached extent for EX_READ */ }; struct extent_tree_info { struct radix_tree_root extent_tree_root;/* cache extent cache entries */ struct mutex extent_tree_lock; /* locking extent radix tree */ struct list_head extent_list; /* lru list for shrinker */ spinlock_t extent_lock; /* locking extent lru list */ atomic_t total_ext_tree; /* extent tree count */ struct list_head zombie_list; /* extent zombie tree list */ atomic_t total_zombie_tree; /* extent zombie tree count */ atomic_t total_ext_node; /* extent info count */ }; /* * State of block returned by f2fs_map_blocks. */ #define F2FS_MAP_NEW (1U << 0) #define F2FS_MAP_MAPPED (1U << 1) #define F2FS_MAP_DELALLOC (1U << 2) #define F2FS_MAP_FLAGS (F2FS_MAP_NEW | F2FS_MAP_MAPPED |\ F2FS_MAP_DELALLOC) struct f2fs_map_blocks { struct block_device *m_bdev; /* for multi-device dio */ block_t m_pblk; block_t m_lblk; unsigned int m_len; unsigned int m_flags; unsigned long m_last_pblk; /* last allocated block, only used for DIO in LFS mode */ pgoff_t *m_next_pgofs; /* point next possible non-hole pgofs */ pgoff_t *m_next_extent; /* point to next possible extent */ int m_seg_type; bool m_may_create; /* indicate it is from write path */ bool m_multidev_dio; /* indicate it allows multi-device dio */ }; /* for flag in get_data_block */ enum { F2FS_GET_BLOCK_DEFAULT, F2FS_GET_BLOCK_FIEMAP, F2FS_GET_BLOCK_BMAP, F2FS_GET_BLOCK_DIO, F2FS_GET_BLOCK_PRE_DIO, F2FS_GET_BLOCK_PRE_AIO, F2FS_GET_BLOCK_PRECACHE, }; /* * i_advise uses FADVISE_XXX_BIT. We can add additional hints later. */ #define FADVISE_COLD_BIT 0x01 #define FADVISE_LOST_PINO_BIT 0x02 #define FADVISE_ENCRYPT_BIT 0x04 #define FADVISE_ENC_NAME_BIT 0x08 #define FADVISE_KEEP_SIZE_BIT 0x10 #define FADVISE_HOT_BIT 0x20 #define FADVISE_VERITY_BIT 0x40 #define FADVISE_TRUNC_BIT 0x80 #define FADVISE_MODIFIABLE_BITS (FADVISE_COLD_BIT | FADVISE_HOT_BIT) #define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) #define file_set_cold(inode) set_file(inode, FADVISE_COLD_BIT) #define file_clear_cold(inode) clear_file(inode, FADVISE_COLD_BIT) #define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) #define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT) #define file_got_pino(inode) clear_file(inode, FADVISE_LOST_PINO_BIT) #define file_is_encrypt(inode) is_file(inode, FADVISE_ENCRYPT_BIT) #define file_set_encrypt(inode) set_file(inode, FADVISE_ENCRYPT_BIT) #define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT) #define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT) #define file_keep_isize(inode) is_file(inode, FADVISE_KEEP_SIZE_BIT) #define file_set_keep_isize(inode) set_file(inode, FADVISE_KEEP_SIZE_BIT) #define file_is_hot(inode) is_file(inode, FADVISE_HOT_BIT) #define file_set_hot(inode) set_file(inode, FADVISE_HOT_BIT) #define file_clear_hot(inode) clear_file(inode, FADVISE_HOT_BIT) #define file_is_verity(inode) is_file(inode, FADVISE_VERITY_BIT) #define file_set_verity(inode) set_file(inode, FADVISE_VERITY_BIT) #define file_should_truncate(inode) is_file(inode, FADVISE_TRUNC_BIT) #define file_need_truncate(inode) set_file(inode, FADVISE_TRUNC_BIT) #define file_dont_truncate(inode) clear_file(inode, FADVISE_TRUNC_BIT) #define DEF_DIR_LEVEL 0 /* used for f2fs_inode_info->flags */ enum { FI_NEW_INODE, /* indicate newly allocated inode */ FI_DIRTY_INODE, /* indicate inode is dirty or not */ FI_AUTO_RECOVER, /* indicate inode is recoverable */ FI_DIRTY_DIR, /* indicate directory has dirty pages */ FI_INC_LINK, /* need to increment i_nlink */ FI_ACL_MODE, /* indicate acl mode */ FI_NO_ALLOC, /* should not allocate any blocks */ FI_FREE_NID, /* free allocated nide */ FI_NO_EXTENT, /* not to use the extent cache */ FI_INLINE_XATTR, /* used for inline xattr */ FI_INLINE_DATA, /* used for inline data*/ FI_INLINE_DENTRY, /* used for inline dentry */ FI_APPEND_WRITE, /* inode has appended data */ FI_UPDATE_WRITE, /* inode has in-place-update data */ FI_NEED_IPU, /* used for ipu per file */ FI_ATOMIC_FILE, /* indicate atomic file */ FI_DATA_EXIST, /* indicate data exists */ FI_SKIP_WRITES, /* should skip data page writeback */ FI_OPU_WRITE, /* used for opu per file */ FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ FI_PREALLOCATED_ALL, /* all blocks for write were preallocated */ FI_HOT_DATA, /* indicate file is hot */ FI_EXTRA_ATTR, /* indicate file has extra attribute */ FI_PROJ_INHERIT, /* indicate file inherits projectid */ FI_PIN_FILE, /* indicate file should not be gced */ FI_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ FI_COMPRESSED_FILE, /* indicate file's data can be compressed */ FI_COMPRESS_CORRUPT, /* indicate compressed cluster is corrupted */ FI_MMAP_FILE, /* indicate file was mmapped */ FI_ENABLE_COMPRESS, /* enable compression in "user" compression mode */ FI_COMPRESS_RELEASED, /* compressed blocks were released */ FI_ALIGNED_WRITE, /* enable aligned write */ FI_COW_FILE, /* indicate COW file */ FI_ATOMIC_COMMITTED, /* indicate atomic commit completed except disk sync */ FI_ATOMIC_DIRTIED, /* indicate atomic file is dirtied */ FI_ATOMIC_REPLACE, /* indicate atomic replace */ FI_OPENED_FILE, /* indicate file has been opened */ FI_DONATE_FINISHED, /* indicate page donation of file has been finished */ FI_MAX, /* max flag, never be used */ }; struct f2fs_inode_info { struct inode vfs_inode; /* serve a vfs inode */ unsigned long i_flags; /* keep an inode flags for ioctl */ unsigned char i_advise; /* use to give file attribute hints */ unsigned char i_dir_level; /* use for dentry level for large dir */ union { unsigned int i_current_depth; /* only for directory depth */ unsigned short i_gc_failures; /* for gc failure statistic */ }; unsigned int i_pino; /* parent inode number */ umode_t i_acl_mode; /* keep file acl mode temporarily */ /* Use below internally in f2fs*/ unsigned long flags[BITS_TO_LONGS(FI_MAX)]; /* use to pass per-file flags */ unsigned int ioprio_hint; /* hint for IO priority */ struct f2fs_rwsem i_sem; /* protect fi info */ atomic_t dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ struct task_struct *task; /* lookup and create consistency */ struct task_struct *cp_task; /* separate cp/wb IO stats*/ struct task_struct *wb_task; /* indicate inode is in context of writeback */ nid_t i_xattr_nid; /* node id that contains xattrs */ loff_t last_disk_size; /* lastly written file size */ spinlock_t i_size_lock; /* protect last_disk_size */ #ifdef CONFIG_QUOTA struct dquot __rcu *i_dquot[MAXQUOTAS]; /* quota space reservation, managed internally by quota code */ qsize_t i_reserved_quota; #endif struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ /* linked in global inode list for cache donation */ struct list_head gdonate_list; pgoff_t donate_start, donate_end; /* inclusive */ atomic_t open_count; /* # of open files */ struct task_struct *atomic_write_task; /* store atomic write task */ struct extent_tree *extent_tree[NR_EXTENT_CACHES]; /* cached extent_tree entry */ union { struct inode *cow_inode; /* copy-on-write inode for atomic write */ struct inode *atomic_inode; /* point to atomic_inode, available only for cow_inode */ }; /* avoid racing between foreground op and gc */ struct f2fs_rwsem i_gc_rwsem[2]; struct f2fs_rwsem i_xattr_sem; /* avoid racing between reading and changing EAs */ int i_extra_isize; /* size of extra space located in i_addr */ kprojid_t i_projid; /* id for project quota */ int i_inline_xattr_size; /* inline xattr size */ struct timespec64 i_crtime; /* inode creation time */ struct timespec64 i_disk_time[3];/* inode disk times */ /* for file compress */ atomic_t i_compr_blocks; /* # of compressed blocks */ unsigned char i_compress_algorithm; /* algorithm type */ unsigned char i_log_cluster_size; /* log of cluster size */ unsigned char i_compress_level; /* compress level (lz4hc,zstd) */ unsigned char i_compress_flag; /* compress flag */ unsigned int i_cluster_size; /* cluster size */ atomic_t writeback; /* count # of writeback thread */ unsigned int atomic_write_cnt; loff_t original_i_size; /* original i_size before atomic write */ #ifdef CONFIG_FS_ENCRYPTION struct fscrypt_inode_info *i_crypt_info; /* filesystem encryption info */ #endif }; static inline void get_read_extent_info(struct extent_info *ext, struct f2fs_extent *i_ext) { ext->fofs = le32_to_cpu(i_ext->fofs); ext->blk = le32_to_cpu(i_ext->blk); ext->len = le32_to_cpu(i_ext->len); } static inline void set_raw_read_extent(struct extent_info *ext, struct f2fs_extent *i_ext) { i_ext->fofs = cpu_to_le32(ext->fofs); i_ext->blk = cpu_to_le32(ext->blk); i_ext->len = cpu_to_le32(ext->len); } static inline bool __is_discard_mergeable(struct discard_info *back, struct discard_info *front, unsigned int max_len) { return (back->lstart + back->len == front->lstart) && (back->len + front->len <= max_len); } static inline bool __is_discard_back_mergeable(struct discard_info *cur, struct discard_info *back, unsigned int max_len) { return __is_discard_mergeable(back, cur, max_len); } static inline bool __is_discard_front_mergeable(struct discard_info *cur, struct discard_info *front, unsigned int max_len) { return __is_discard_mergeable(cur, front, max_len); } /* * For free nid management */ enum nid_state { FREE_NID, /* newly added to free nid list */ PREALLOC_NID, /* it is preallocated */ MAX_NID_STATE, }; enum nat_state { TOTAL_NAT, DIRTY_NAT, RECLAIMABLE_NAT, MAX_NAT_STATE, }; struct f2fs_nm_info { block_t nat_blkaddr; /* base disk address of NAT */ nid_t max_nid; /* maximum possible node ids */ nid_t available_nids; /* # of available node ids */ nid_t next_scan_nid; /* the next nid to be scanned */ nid_t max_rf_node_blocks; /* max # of nodes for recovery */ unsigned int ram_thresh; /* control the memory footprint */ unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */ unsigned int dirty_nats_ratio; /* control dirty nats ratio threshold */ /* NAT cache management */ struct radix_tree_root nat_root;/* root of the nat entry cache */ struct radix_tree_root nat_set_root;/* root of the nat set cache */ struct f2fs_rwsem nat_tree_lock; /* protect nat entry tree */ struct list_head nat_entries; /* cached nat entry list (clean) */ spinlock_t nat_list_lock; /* protect clean nat entry list */ unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */ unsigned int nat_blocks; /* # of nat blocks */ /* free node ids management */ struct radix_tree_root free_nid_root;/* root of the free_nid cache */ struct list_head free_nid_list; /* list for free nids excluding preallocated nids */ unsigned int nid_cnt[MAX_NID_STATE]; /* the number of free node id */ spinlock_t nid_list_lock; /* protect nid lists ops */ struct mutex build_lock; /* lock for build free nids */ unsigned char **free_nid_bitmap; unsigned char *nat_block_bitmap; unsigned short *free_nid_count; /* free nid count of NAT block */ /* for checkpoint */ char *nat_bitmap; /* NAT bitmap pointer */ unsigned int nat_bits_blocks; /* # of nat bits blocks */ unsigned char *nat_bits; /* NAT bits blocks */ unsigned char *full_nat_bits; /* full NAT pages */ unsigned char *empty_nat_bits; /* empty NAT pages */ #ifdef CONFIG_F2FS_CHECK_FS char *nat_bitmap_mir; /* NAT bitmap mirror */ #endif int bitmap_size; /* bitmap size */ }; /* * this structure is used as one of function parameters. * all the information are dedicated to a given direct node block determined * by the data offset in a file. */ struct dnode_of_data { struct inode *inode; /* vfs inode pointer */ struct folio *inode_folio; /* its inode folio, NULL is possible */ struct folio *node_folio; /* cached direct node folio */ nid_t nid; /* node id of the direct node block */ unsigned int ofs_in_node; /* data offset in the node page */ bool inode_folio_locked; /* inode folio is locked or not */ bool node_changed; /* is node block changed */ char cur_level; /* level of hole node page */ char max_level; /* level of current page located */ block_t data_blkaddr; /* block address of the node block */ }; static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode, struct folio *ifolio, struct folio *nfolio, nid_t nid) { memset(dn, 0, sizeof(*dn)); dn->inode = inode; dn->inode_folio = ifolio; dn->node_folio = nfolio; dn->nid = nid; } /* * For SIT manager * * By default, there are 6 active log areas across the whole main area. * When considering hot and cold data separation to reduce cleaning overhead, * we split 3 for data logs and 3 for node logs as hot, warm, and cold types, * respectively. * In the current design, you should not change the numbers intentionally. * Instead, as a mount option such as active_logs=x, you can use 2, 4, and 6 * logs individually according to the underlying devices. (default: 6) * Just in case, on-disk layout covers maximum 16 logs that consist of 8 for * data and 8 for node logs. */ #define NR_CURSEG_DATA_TYPE (3) #define NR_CURSEG_NODE_TYPE (3) #define NR_CURSEG_INMEM_TYPE (2) #define NR_CURSEG_RO_TYPE (2) #define NR_CURSEG_PERSIST_TYPE (NR_CURSEG_DATA_TYPE + NR_CURSEG_NODE_TYPE) #define NR_CURSEG_TYPE (NR_CURSEG_INMEM_TYPE + NR_CURSEG_PERSIST_TYPE) enum log_type { CURSEG_HOT_DATA = 0, /* directory entry blocks */ CURSEG_WARM_DATA, /* data blocks */ CURSEG_COLD_DATA, /* multimedia or GCed data blocks */ CURSEG_HOT_NODE, /* direct node blocks of directory files */ CURSEG_WARM_NODE, /* direct node blocks of normal files */ CURSEG_COLD_NODE, /* indirect node blocks */ NR_PERSISTENT_LOG, /* number of persistent log */ CURSEG_COLD_DATA_PINNED = NR_PERSISTENT_LOG, /* pinned file that needs consecutive block address */ CURSEG_ALL_DATA_ATGC, /* SSR alloctor in hot/warm/cold data area */ NO_CHECK_TYPE, /* number of persistent & inmem log */ }; struct flush_cmd { struct completion wait; struct llist_node llnode; nid_t ino; int ret; }; struct flush_cmd_control { struct task_struct *f2fs_issue_flush; /* flush thread */ wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ atomic_t issued_flush; /* # of issued flushes */ atomic_t queued_flush; /* # of queued flushes */ struct llist_head issue_list; /* list for command issue */ struct llist_node *dispatch_list; /* list for command dispatch */ }; struct f2fs_sm_info { struct sit_info *sit_info; /* whole segment information */ struct free_segmap_info *free_info; /* free segment information */ struct dirty_seglist_info *dirty_info; /* dirty segment information */ struct curseg_info *curseg_array; /* active segment information */ struct f2fs_rwsem curseg_lock; /* for preventing curseg change */ block_t seg0_blkaddr; /* block address of 0'th segment */ block_t main_blkaddr; /* start block address of main area */ block_t ssa_blkaddr; /* start block address of SSA area */ unsigned int segment_count; /* total # of segments */ unsigned int main_segments; /* # of segments in main area */ unsigned int reserved_segments; /* # of reserved segments */ unsigned int ovp_segments; /* # of overprovision segments */ /* a threshold to reclaim prefree segments */ unsigned int rec_prefree_segments; struct list_head sit_entry_set; /* sit entry set list */ unsigned int ipu_policy; /* in-place-update policy */ unsigned int min_ipu_util; /* in-place-update threshold */ unsigned int min_fsync_blocks; /* threshold for fsync */ unsigned int min_seq_blocks; /* threshold for sequential blocks */ unsigned int min_hot_blocks; /* threshold for hot block allocation */ unsigned int min_ssr_sections; /* threshold to trigger SSR allocation */ /* for flush command control */ struct flush_cmd_control *fcc_info; /* for discard command control */ struct discard_cmd_control *dcc_info; }; /* * For superblock */ /* * COUNT_TYPE for monitoring * * f2fs monitors the number of several block types such as on-writeback, * dirty dentry blocks, dirty node blocks, and dirty meta blocks. */ #define WB_DATA_TYPE(folio, f) \ (f || f2fs_is_cp_guaranteed(folio) ? F2FS_WB_CP_DATA : F2FS_WB_DATA) enum count_type { F2FS_DIRTY_DENTS, F2FS_DIRTY_DATA, F2FS_DIRTY_QDATA, F2FS_DIRTY_NODES, F2FS_DIRTY_META, F2FS_DIRTY_IMETA, F2FS_WB_CP_DATA, F2FS_WB_DATA, F2FS_RD_DATA, F2FS_RD_NODE, F2FS_RD_META, F2FS_DIO_WRITE, F2FS_DIO_READ, F2FS_SKIPPED_WRITE, /* skip or fail during f2fs_enable_checkpoint() */ NR_COUNT_TYPE, }; /* * The below are the page types of bios used in submit_bio(). * The available types are: * DATA User data pages. It operates as async mode. * NODE Node pages. It operates as async mode. * META FS metadata pages such as SIT, NAT, CP. * NR_PAGE_TYPE The number of page types. * META_FLUSH Make sure the previous pages are written * with waiting the bio's completion * ... Only can be used with META. */ #define PAGE_TYPE_OF_BIO(type) ((type) > META ? META : (type)) #define PAGE_TYPE_ON_MAIN(type) ((type) == DATA || (type) == NODE) enum page_type { DATA = 0, NODE = 1, /* should not change this */ META, NR_PAGE_TYPE, META_FLUSH, IPU, /* the below types are used by tracepoints only. */ OPU, }; enum temp_type { HOT = 0, /* must be zero for meta bio */ WARM, COLD, NR_TEMP_TYPE, }; enum need_lock_type { LOCK_REQ = 0, LOCK_DONE, LOCK_RETRY, }; enum cp_reason_type { CP_NO_NEEDED, CP_NON_REGULAR, CP_COMPRESSED, CP_HARDLINK, CP_SB_NEED_CP, CP_WRONG_PINO, CP_NO_SPC_ROLL, CP_NODE_NEED_CP, CP_FASTBOOT_MODE, CP_SPEC_LOG_NUM, CP_RECOVER_DIR, CP_XATTR_DIR, }; enum iostat_type { /* WRITE IO */ APP_DIRECT_IO, /* app direct write IOs */ APP_BUFFERED_IO, /* app buffered write IOs */ APP_WRITE_IO, /* app write IOs */ APP_MAPPED_IO, /* app mapped IOs */ APP_BUFFERED_CDATA_IO, /* app buffered write IOs on compressed file */ APP_MAPPED_CDATA_IO, /* app mapped write IOs on compressed file */ FS_DATA_IO, /* data IOs from kworker/fsync/reclaimer */ FS_CDATA_IO, /* data IOs from kworker/fsync/reclaimer on compressed file */ FS_NODE_IO, /* node IOs from kworker/fsync/reclaimer */ FS_META_IO, /* meta IOs from kworker/reclaimer */ FS_GC_DATA_IO, /* data IOs from forground gc */ FS_GC_NODE_IO, /* node IOs from forground gc */ FS_CP_DATA_IO, /* data IOs from checkpoint */ FS_CP_NODE_IO, /* node IOs from checkpoint */ FS_CP_META_IO, /* meta IOs from checkpoint */ /* READ IO */ APP_DIRECT_READ_IO, /* app direct read IOs */ APP_BUFFERED_READ_IO, /* app buffered read IOs */ APP_READ_IO, /* app read IOs */ APP_MAPPED_READ_IO, /* app mapped read IOs */ APP_BUFFERED_CDATA_READ_IO, /* app buffered read IOs on compressed file */ APP_MAPPED_CDATA_READ_IO, /* app mapped read IOs on compressed file */ FS_DATA_READ_IO, /* data read IOs */ FS_GDATA_READ_IO, /* data read IOs from background gc */ FS_CDATA_READ_IO, /* compressed data read IOs */ FS_NODE_READ_IO, /* node read IOs */ FS_META_READ_IO, /* meta read IOs */ /* other */ FS_DISCARD_IO, /* discard */ FS_FLUSH_IO, /* flush */ FS_ZONE_RESET_IO, /* zone reset */ NR_IO_TYPE, }; struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ nid_t ino; /* inode number */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ enum temp_type temp; /* contains HOT/WARM/COLD */ enum req_op op; /* contains REQ_OP_ */ blk_opf_t op_flags; /* req_flag_bits */ block_t new_blkaddr; /* new block address to be written */ block_t old_blkaddr; /* old block address before Cow */ union { struct page *page; /* page to be written */ struct folio *folio; }; struct page *encrypted_page; /* encrypted page */ struct page *compressed_page; /* compressed page */ struct list_head list; /* serialize IOs */ unsigned int compr_blocks; /* # of compressed block addresses */ unsigned int need_lock:8; /* indicate we need to lock cp_rwsem */ unsigned int version:8; /* version of the node */ unsigned int submitted:1; /* indicate IO submission */ unsigned int in_list:1; /* indicate fio is in io_list */ unsigned int is_por:1; /* indicate IO is from recovery or not */ unsigned int encrypted:1; /* indicate file is encrypted */ unsigned int meta_gc:1; /* require meta inode GC */ enum iostat_type io_type; /* io type */ struct writeback_control *io_wbc; /* writeback control */ struct bio **bio; /* bio for ipu */ sector_t *last_block; /* last block number in bio */ }; struct bio_entry { struct bio *bio; struct list_head list; }; #define is_read_io(rw) ((rw) == READ) struct f2fs_bio_info { struct f2fs_sb_info *sbi; /* f2fs superblock */ struct bio *bio; /* bios to merge */ sector_t last_block_in_bio; /* last block number */ struct f2fs_io_info fio; /* store buffered io info. */ #ifdef CONFIG_BLK_DEV_ZONED struct completion zone_wait; /* condition value for the previous open zone to close */ struct bio *zone_pending_bio; /* pending bio for the previous zone */ void *bi_private; /* previous bi_private for pending bio */ #endif struct f2fs_rwsem io_rwsem; /* blocking op for bio */ spinlock_t io_lock; /* serialize DATA/NODE IOs */ struct list_head io_list; /* track fios */ struct list_head bio_list; /* bio entry list head */ struct f2fs_rwsem bio_list_lock; /* lock to protect bio entry list */ }; #define FDEV(i) (sbi->devs[i]) #define RDEV(i) (raw_super->devs[i]) struct f2fs_dev_info { struct file *bdev_file; struct block_device *bdev; char path[MAX_PATH_LEN + 1]; unsigned int total_segments; block_t start_blk; block_t end_blk; #ifdef CONFIG_BLK_DEV_ZONED unsigned int nr_blkz; /* Total number of zones */ unsigned long *blkz_seq; /* Bitmap indicating sequential zones */ #endif }; enum inode_type { DIR_INODE, /* for dirty dir inode */ FILE_INODE, /* for dirty regular/symlink inode */ DIRTY_META, /* for all dirtied inode metadata */ DONATE_INODE, /* for all inode to donate pages */ NR_INODE_TYPE, }; /* for inner inode cache management */ struct inode_management { struct radix_tree_root ino_root; /* ino entry array */ spinlock_t ino_lock; /* for ino entry lock */ struct list_head ino_list; /* inode list head */ unsigned long ino_num; /* number of entries */ }; /* for GC_AT */ struct atgc_management { bool atgc_enabled; /* ATGC is enabled or not */ struct rb_root_cached root; /* root of victim rb-tree */ struct list_head victim_list; /* linked with all victim entries */ unsigned int victim_count; /* victim count in rb-tree */ unsigned int candidate_ratio; /* candidate ratio */ unsigned int max_candidate_count; /* max candidate count */ unsigned int age_weight; /* age weight, vblock_weight = 100 - age_weight */ unsigned long long age_threshold; /* age threshold */ }; struct f2fs_time_stat { unsigned long long total_time; /* total wall clock time */ #ifdef CONFIG_64BIT unsigned long long running_time; /* running time */ #endif #if defined(CONFIG_SCHED_INFO) && defined(CONFIG_SCHEDSTATS) unsigned long long runnable_time; /* runnable(including preempted) time */ #endif #ifdef CONFIG_TASK_DELAY_ACCT unsigned long long io_sleep_time; /* IO sleep time */ #endif }; struct f2fs_lock_context { struct f2fs_time_stat ts; int orig_nice; int new_nice; bool lock_trace; bool need_restore; }; struct f2fs_gc_control { unsigned int victim_segno; /* target victim segment number */ int init_gc_type; /* FG_GC or BG_GC */ bool no_bg_gc; /* check the space and stop bg_gc */ bool should_migrate_blocks; /* should migrate blocks */ bool err_gc_skipped; /* return EAGAIN if GC skipped */ bool one_time; /* require one time GC in one migration unit */ unsigned int nr_free_secs; /* # of free sections to do GC */ struct f2fs_lock_context lc; /* lock context for gc_lock */ }; /* * For s_flag in struct f2fs_sb_info * Modification on enum should be synchronized with s_flag array */ enum { SBI_IS_DIRTY, /* dirty flag for checkpoint */ SBI_IS_CLOSE, /* specify unmounting */ SBI_NEED_FSCK, /* need fsck.f2fs to fix */ SBI_POR_DOING, /* recovery is doing or not */ SBI_NEED_SB_WRITE, /* need to recover superblock */ SBI_NEED_CP, /* need to checkpoint */ SBI_IS_SHUTDOWN, /* shutdown by ioctl */ SBI_IS_RECOVERED, /* recovered orphan/data */ SBI_CP_DISABLED, /* CP was disabled last mount */ SBI_CP_DISABLED_QUICK, /* CP was disabled quickly */ SBI_QUOTA_NEED_FLUSH, /* need to flush quota info in CP */ SBI_QUOTA_SKIP_FLUSH, /* skip flushing quota in current CP */ SBI_QUOTA_NEED_REPAIR, /* quota file may be corrupted */ SBI_IS_RESIZEFS, /* resizefs is in process */ SBI_IS_FREEZING, /* freezefs is in process */ SBI_IS_WRITABLE, /* remove ro mountoption transiently */ SBI_ENABLE_CHECKPOINT, /* indicate it's during f2fs_enable_checkpoint() */ MAX_SBI_FLAG, }; enum { CP_TIME, REQ_TIME, DISCARD_TIME, GC_TIME, DISABLE_TIME, UMOUNT_DISCARD_TIMEOUT, MAX_TIME, }; /* Note that you need to keep synchronization with this gc_mode_names array */ enum { GC_NORMAL, GC_IDLE_CB, GC_IDLE_GREEDY, GC_IDLE_AT, GC_URGENT_HIGH, GC_URGENT_LOW, GC_URGENT_MID, MAX_GC_MODE, }; enum { BGGC_MODE_ON, /* background gc is on */ BGGC_MODE_OFF, /* background gc is off */ BGGC_MODE_SYNC, /* * background gc is on, migrating blocks * like foreground gc */ }; enum { FS_MODE_ADAPTIVE, /* use both lfs/ssr allocation */ FS_MODE_LFS, /* use lfs allocation only */ FS_MODE_FRAGMENT_SEG, /* segment fragmentation mode */ FS_MODE_FRAGMENT_BLK, /* block fragmentation mode */ }; enum { ALLOC_MODE_DEFAULT, /* stay default */ ALLOC_MODE_REUSE, /* reuse segments as much as possible */ }; enum fsync_mode { FSYNC_MODE_POSIX, /* fsync follows posix semantics */ FSYNC_MODE_STRICT, /* fsync behaves in line with ext4 */ FSYNC_MODE_NOBARRIER, /* fsync behaves nobarrier based on posix */ }; enum { COMPR_MODE_FS, /* * automatically compress compression * enabled files */ COMPR_MODE_USER, /* * automatical compression is disabled. * user can control the file compression * using ioctls */ }; enum { DISCARD_UNIT_BLOCK, /* basic discard unit is block */ DISCARD_UNIT_SEGMENT, /* basic discard unit is segment */ DISCARD_UNIT_SECTION, /* basic discard unit is section */ }; enum { MEMORY_MODE_NORMAL, /* memory mode for normal devices */ MEMORY_MODE_LOW, /* memory mode for low memory devices */ }; enum errors_option { MOUNT_ERRORS_READONLY, /* remount fs ro on errors */ MOUNT_ERRORS_CONTINUE, /* continue on errors */ MOUNT_ERRORS_PANIC, /* panic on errors */ }; enum { BACKGROUND, FOREGROUND, MAX_CALL_TYPE, TOTAL_CALL = FOREGROUND, }; enum f2fs_lookup_mode { LOOKUP_PERF, LOOKUP_COMPAT, LOOKUP_AUTO, }; /* For node type in __get_node_folio() */ enum node_type { NODE_TYPE_REGULAR, NODE_TYPE_INODE, NODE_TYPE_XATTR, NODE_TYPE_NON_INODE, }; /* a threshold of maximum elapsed time in critical region to print tracepoint */ #define MAX_LOCK_ELAPSED_TIME 500 #define F2FS_DEFAULT_TASK_PRIORITY (DEFAULT_PRIO) #define F2FS_CRITICAL_TASK_PRIORITY NICE_TO_PRIO(0) static inline int f2fs_test_bit(unsigned int nr, char *addr); static inline void f2fs_set_bit(unsigned int nr, char *addr); static inline void f2fs_clear_bit(unsigned int nr, char *addr); /* * Layout of f2fs page.private: * * Layout A: lowest bit should be 1 * | bit0 = 1 | bit1 | bit2 | ... | bit MAX | private data .... | * bit 0 PAGE_PRIVATE_NOT_POINTER * bit 1 PAGE_PRIVATE_ONGOING_MIGRATION * bit 2 PAGE_PRIVATE_INLINE_INODE * bit 3 PAGE_PRIVATE_REF_RESOURCE * bit 4 PAGE_PRIVATE_ATOMIC_WRITE * bit 5- f2fs private data * * Layout B: lowest bit should be 0 * page.private is a wrapped pointer. */ enum { PAGE_PRIVATE_NOT_POINTER, /* private contains non-pointer data */ PAGE_PRIVATE_ONGOING_MIGRATION, /* data page which is on-going migrating */ PAGE_PRIVATE_INLINE_INODE, /* inode page contains inline data */ PAGE_PRIVATE_REF_RESOURCE, /* dirty page has referenced resources */ PAGE_PRIVATE_ATOMIC_WRITE, /* data page from atomic write path */ PAGE_PRIVATE_MAX }; /* For compression */ enum compress_algorithm_type { COMPRESS_LZO, COMPRESS_LZ4, COMPRESS_ZSTD, COMPRESS_LZORLE, COMPRESS_MAX, }; enum compress_flag { COMPRESS_CHKSUM, COMPRESS_MAX_FLAG, }; #define COMPRESS_WATERMARK 20 #define COMPRESS_PERCENT 20 #define COMPRESS_DATA_RESERVED_SIZE 4 struct compress_data { __le32 clen; /* compressed data size */ __le32 chksum; /* compressed data checksum */ __le32 reserved[COMPRESS_DATA_RESERVED_SIZE]; /* reserved */ u8 cdata[]; /* compressed data */ }; #define COMPRESS_HEADER_SIZE (sizeof(struct compress_data)) #define F2FS_COMPRESSED_PAGE_MAGIC 0xF5F2C000 #define F2FS_ZSTD_DEFAULT_CLEVEL 1 #define COMPRESS_LEVEL_OFFSET 8 /* compress context */ struct compress_ctx { struct inode *inode; /* inode the context belong to */ pgoff_t cluster_idx; /* cluster index number */ unsigned int cluster_size; /* page count in cluster */ unsigned int log_cluster_size; /* log of cluster size */ struct page **rpages; /* pages store raw data in cluster */ unsigned int nr_rpages; /* total page number in rpages */ struct page **cpages; /* pages store compressed data in cluster */ unsigned int nr_cpages; /* total page number in cpages */ unsigned int valid_nr_cpages; /* valid page number in cpages */ void *rbuf; /* virtual mapped address on rpages */ struct compress_data *cbuf; /* virtual mapped address on cpages */ size_t rlen; /* valid data length in rbuf */ size_t clen; /* valid data length in cbuf */ void *private; /* payload buffer for specified compression algorithm */ void *private2; /* extra payload buffer */ struct fsverity_info *vi; /* verity info if needed */ }; /* compress context for write IO path */ struct compress_io_ctx { u32 magic; /* magic number to indicate page is compressed */ struct inode *inode; /* inode the context belong to */ struct page **rpages; /* pages store raw data in cluster */ unsigned int nr_rpages; /* total page number in rpages */ atomic_t pending_pages; /* in-flight compressed page count */ }; /* Context for decompressing one cluster on the read IO path */ struct decompress_io_ctx { u32 magic; /* magic number to indicate page is compressed */ struct inode *inode; /* inode the context belong to */ struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ pgoff_t cluster_idx; /* cluster index number */ unsigned int cluster_size; /* page count in cluster */ unsigned int log_cluster_size; /* log of cluster size */ struct page **rpages; /* pages store raw data in cluster */ unsigned int nr_rpages; /* total page number in rpages */ struct page **cpages; /* pages store compressed data in cluster */ unsigned int nr_cpages; /* total page number in cpages */ struct page **tpages; /* temp pages to pad holes in cluster */ void *rbuf; /* virtual mapped address on rpages */ struct compress_data *cbuf; /* virtual mapped address on cpages */ size_t rlen; /* valid data length in rbuf */ size_t clen; /* valid data length in cbuf */ /* * The number of compressed pages remaining to be read in this cluster. * This is initially nr_cpages. It is decremented by 1 each time a page * has been read (or failed to be read). When it reaches 0, the cluster * is decompressed (or an error is reported). * * If an error occurs before all the pages have been submitted for I/O, * then this will never reach 0. In this case the I/O submitter is * responsible for calling f2fs_decompress_end_io() instead. */ atomic_t remaining_pages; /* * Number of references to this decompress_io_ctx. * * One reference is held for I/O completion. This reference is dropped * after the pagecache pages are updated and unlocked -- either after * decompression (and verity if enabled), or after an error. * * In addition, each compressed page holds a reference while it is in a * bio. These references are necessary prevent compressed pages from * being freed while they are still in a bio. */ refcount_t refcnt; bool failed; /* IO error occurred before decompression? */ struct fsverity_info *vi; /* fs-verity context if needed */ unsigned char compress_algorithm; /* backup algorithm type */ void *private; /* payload buffer for specified decompression algorithm */ void *private2; /* extra payload buffer */ struct work_struct verity_work; /* work to verify the decompressed pages */ struct work_struct free_work; /* work for late free this structure itself */ }; #define NULL_CLUSTER ((unsigned int)(~0)) #define MIN_COMPRESS_LOG_SIZE 2 #define MAX_COMPRESS_LOG_SIZE 8 #define MAX_COMPRESS_WINDOW_SIZE(log_size) ((PAGE_SIZE) << (log_size)) struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ struct f2fs_super_block *raw_super; /* raw super block pointer */ struct f2fs_rwsem sb_lock; /* lock for raw super block */ int valid_super_block; /* valid super block no */ unsigned long s_flag; /* flags for sbi */ struct mutex writepages; /* mutex for writepages() */ #ifdef CONFIG_BLK_DEV_ZONED unsigned int blocks_per_blkz; /* F2FS blocks per zone */ unsigned int unusable_blocks_per_sec; /* unusable blocks per section */ unsigned int max_open_zones; /* max open zone resources of the zoned device */ /* For adjust the priority writing position of data in zone UFS */ unsigned int blkzone_alloc_policy; #endif /* for node-related operations */ struct f2fs_nm_info *nm_info; /* node manager */ struct inode *node_inode; /* cache node blocks */ /* for segment-related operations */ struct f2fs_sm_info *sm_info; /* segment manager */ /* for bio operations */ struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; /* for write bios */ /* keep migration IO order for LFS mode */ struct f2fs_rwsem io_order_lock; pgoff_t page_eio_ofs[NR_PAGE_TYPE]; /* EIO page offset */ int page_eio_cnt[NR_PAGE_TYPE]; /* EIO count */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ int cur_cp_pack; /* remain current cp pack */ spinlock_t cp_lock; /* for flag in ckpt */ struct inode *meta_inode; /* cache meta blocks */ struct f2fs_rwsem cp_global_sem; /* checkpoint procedure lock */ struct f2fs_rwsem cp_rwsem; /* blocking FS operations */ struct f2fs_rwsem node_write; /* locking node writes */ struct f2fs_rwsem node_change; /* locking node change */ wait_queue_head_t cp_wait; unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ long interval_time[MAX_TIME]; /* to store thresholds */ struct ckpt_req_control cprc_info; /* for checkpoint request control */ struct cp_stats cp_stats; /* for time stat of checkpoint */ struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ spinlock_t fsync_node_lock; /* for node entry lock */ struct list_head fsync_node_list; /* node list head */ unsigned int fsync_seg_id; /* sequence id */ unsigned int fsync_node_num; /* number of node entries */ /* for orphan inode, use 0'th array */ unsigned int max_orphans; /* max orphan inodes */ /* for inode management */ struct list_head inode_list[NR_INODE_TYPE]; /* dirty inode list */ spinlock_t inode_lock[NR_INODE_TYPE]; /* for dirty inode list lock */ struct mutex flush_lock; /* for flush exclusion */ /* for extent tree cache */ struct extent_tree_info extent_tree[NR_EXTENT_CACHES]; atomic64_t allocated_data_blocks; /* for block age extent_cache */ unsigned int max_read_extent_count; /* max read extent count per inode */ /* The threshold used for hot and warm data seperation*/ unsigned int hot_data_age_threshold; unsigned int warm_data_age_threshold; unsigned int last_age_weight; /* control donate caches */ unsigned int donate_files; /* basic filesystem units */ unsigned int log_sectors_per_block; /* log2 sectors per block */ unsigned int log_blocksize; /* log2 block size */ unsigned int blocksize; /* block size */ unsigned int root_ino_num; /* root inode number*/ unsigned int node_ino_num; /* node inode number*/ unsigned int meta_ino_num; /* meta inode number*/ unsigned int log_blocks_per_seg; /* log2 blocks per segment */ unsigned int blocks_per_seg; /* blocks per segment */ unsigned int segs_per_sec; /* segments per section */ unsigned int secs_per_zone; /* sections per zone */ unsigned int total_sections; /* total section count */ unsigned int total_node_count; /* total node block count */ unsigned int total_valid_node_count; /* valid node block count */ int dir_level; /* directory level */ bool readdir_ra; /* readahead inode in readdir */ unsigned int max_io_bytes; /* max io bytes to merge IOs */ /* variable summary block units */ unsigned int sum_blocksize; /* sum block size */ unsigned int sums_per_block; /* sum block count per block */ unsigned int entries_in_sum; /* entry count in sum block */ unsigned int sum_entry_size; /* total entry size in sum block */ unsigned int sum_journal_size; /* journal size in sum block */ unsigned int nat_journal_entries; /* nat journal entry count in the journal */ unsigned int sit_journal_entries; /* sit journal entry count in the journal */ block_t user_block_count; /* # of user blocks */ block_t total_valid_block_count; /* # of valid blocks */ block_t discard_blks; /* discard command candidats */ block_t last_valid_block_count; /* for recovery */ block_t reserved_blocks; /* configurable reserved blocks */ block_t current_reserved_blocks; /* current reserved blocks */ /* Additional tracking for no checkpoint mode */ block_t unusable_block_count; /* # of blocks saved by last cp */ unsigned int nquota_files; /* # of quota sysfile */ struct f2fs_rwsem quota_sem; /* blocking cp for flags */ struct task_struct *umount_lock_holder; /* s_umount lock holder */ /* # of pages, see count_type */ atomic_t nr_pages[NR_COUNT_TYPE]; /* # of allocated blocks */ struct percpu_counter alloc_valid_block_count; /* # of node block writes as roll forward recovery */ struct percpu_counter rf_node_block_count; /* writeback control */ atomic_t wb_sync_req[META]; /* count # of WB_SYNC threads */ /* valid inode count */ struct percpu_counter total_valid_inode_count; struct f2fs_mount_info mount_opt; /* mount options */ /* for cleaning operations */ struct f2fs_rwsem gc_lock; /* * semaphore for GC, avoid * race between GC and GC or CP */ struct f2fs_gc_kthread *gc_thread; /* GC thread */ struct atgc_management am; /* atgc management */ unsigned int cur_victim_sec; /* current victim section num */ unsigned int gc_mode; /* current GC state */ unsigned int next_victim_seg[2]; /* next segment in victim section */ spinlock_t gc_remaining_trials_lock; /* remaining trial count for GC_URGENT_* and GC_IDLE_* */ unsigned int gc_remaining_trials; /* for skip statistic */ unsigned long long skipped_gc_rwsem; /* FG_GC only */ /* free sections reserved for pinned file */ unsigned int reserved_pin_section; /* threshold for gc trials on pinned files */ unsigned short gc_pin_file_threshold; struct f2fs_rwsem pin_sem; /* maximum # of trials to find a victim segment for SSR and GC */ unsigned int max_victim_search; /* migration granularity of garbage collection, unit: segment */ unsigned int migration_granularity; /* migration window granularity of garbage collection, unit: segment */ unsigned int migration_window_granularity; /* * for stat information. * one is for the LFS mode, and the other is for the SSR mode. */ #ifdef CONFIG_F2FS_STAT_FS struct f2fs_stat_info *stat_info; /* FS status information */ atomic_t meta_count[META_MAX]; /* # of meta blocks */ unsigned int segment_count[2]; /* # of allocated segments */ unsigned int block_count[2]; /* # of allocated blocks */ atomic_t inplace_count; /* # of inplace update */ /* # of lookup extent cache */ atomic64_t total_hit_ext[NR_EXTENT_CACHES]; /* # of hit rbtree extent node */ atomic64_t read_hit_rbtree[NR_EXTENT_CACHES]; /* # of hit cached extent node */ atomic64_t read_hit_cached[NR_EXTENT_CACHES]; /* # of hit largest extent node in read extent cache */ atomic64_t read_hit_largest; atomic_t inline_xattr; /* # of inline_xattr inodes */ atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ atomic_t compr_inode; /* # of compressed inodes */ atomic64_t compr_blocks; /* # of compressed blocks */ atomic_t swapfile_inode; /* # of swapfile inodes */ atomic_t atomic_files; /* # of opened atomic file */ atomic_t max_aw_cnt; /* max # of atomic writes */ unsigned int io_skip_bggc; /* skip background gc for in-flight IO */ unsigned int other_skip_bggc; /* skip background gc for other reasons */ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ atomic_t cp_call_count[MAX_CALL_TYPE]; /* # of cp call */ #endif spinlock_t stat_lock; /* lock for stat operations */ /* to attach REQ_META|REQ_FUA flags */ unsigned int data_io_flag; unsigned int node_io_flag; /* For sysfs support */ struct kobject s_kobj; /* /sys/fs/f2fs/<devname> */ struct completion s_kobj_unregister; struct kobject s_stat_kobj; /* /sys/fs/f2fs/<devname>/stat */ struct completion s_stat_kobj_unregister; struct kobject s_feature_list_kobj; /* /sys/fs/f2fs/<devname>/feature_list */ struct completion s_feature_list_kobj_unregister; /* For shrinker support */ struct list_head s_list; struct mutex umount_mutex; unsigned int shrinker_run_no; /* For multi devices */ int s_ndevs; /* number of devices */ struct f2fs_dev_info *devs; /* for device list */ unsigned int dirty_device; /* for checkpoint data flush */ spinlock_t dev_lock; /* protect dirty_device */ bool aligned_blksize; /* all devices has the same logical blksize */ unsigned int first_seq_zone_segno; /* first segno in sequential zone */ unsigned int bggc_io_aware; /* For adjust the BG_GC priority when pending IO */ unsigned int allocate_section_hint; /* the boundary position between devices */ unsigned int allocate_section_policy; /* determine the section writing priority */ /* For write statistics */ u64 sectors_written_start; u64 kbytes_written; /* Precomputed FS UUID checksum for seeding other checksums */ __u32 s_chksum_seed; struct workqueue_struct *post_read_wq; /* post read workqueue */ /* * If we are in irq context, let's update error information into * on-disk superblock in the work. */ struct work_struct s_error_work; unsigned char errors[MAX_F2FS_ERRORS]; /* error flags */ unsigned char stop_reason[MAX_STOP_REASON]; /* stop reason */ spinlock_t error_lock; /* protect errors/stop_reason array */ bool error_dirty; /* errors of sb is dirty */ /* For reclaimed segs statistics per each GC mode */ unsigned int gc_segment_mode; /* GC state for reclaimed segments */ unsigned int gc_reclaimed_segs[MAX_GC_MODE]; /* Reclaimed segs for each mode */ unsigned int seq_file_ra_mul; /* multiplier for ra_pages of seq. files in fadvise */ int max_fragment_chunk; /* max chunk size for block fragmentation mode */ int max_fragment_hole; /* max hole size for block fragmentation mode */ /* For atomic write statistics */ atomic64_t current_atomic_write; s64 peak_atomic_write; u64 committed_atomic_block; u64 revoked_atomic_block; /* carve out reserved_blocks from total blocks */ bool carve_out; /* max elapsed time threshold in critical region that lock covered */ unsigned long long max_lock_elapsed_time; /* enable/disable to adjust task priority in critical region covered by lock */ unsigned int adjust_lock_priority; /* adjust priority for task which is in critical region covered by lock */ unsigned int lock_duration_priority; /* priority for critical task, e.g. f2fs_ckpt, f2fs_gc threads */ long critical_task_priority; #ifdef CONFIG_F2FS_FS_COMPRESSION struct kmem_cache *page_array_slab; /* page array entry */ unsigned int page_array_slab_size; /* default page array slab size */ /* For runtime compression statistics */ u64 compr_written_block; u64 compr_saved_block; u32 compr_new_inode; /* For compressed block cache */ struct inode *compress_inode; /* cache compressed blocks */ unsigned int compress_percent; /* cache page percentage */ unsigned int compress_watermark; /* cache page watermark */ atomic_t compress_page_hit; /* cache hit count */ #endif #ifdef CONFIG_F2FS_IOSTAT /* For app/fs IO statistics */ spinlock_t iostat_lock; unsigned long long iostat_count[NR_IO_TYPE]; unsigned long long iostat_bytes[NR_IO_TYPE]; unsigned long long prev_iostat_bytes[NR_IO_TYPE]; unsigned long long iostat_read_folio_count[NR_PAGE_ORDERS]; unsigned long long prev_iostat_read_folio_count[NR_PAGE_ORDERS]; bool iostat_enable; unsigned long iostat_next_period; unsigned int iostat_period_ms; /* For io latency related statistics info in one iostat period */ spinlock_t iostat_lat_lock; struct iostat_lat_info *iostat_io_lat; #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lock_class_key cp_global_sem_key; #endif }; /* Definitions to access f2fs_sb_info */ #define SEGS_TO_BLKS(sbi, segs) \ ((segs) << (sbi)->log_blocks_per_seg) #define BLKS_TO_SEGS(sbi, blks) \ ((blks) >> (sbi)->log_blocks_per_seg) #define BLKS_PER_SEG(sbi) ((sbi)->blocks_per_seg) #define BLKS_PER_SEC(sbi) (SEGS_TO_BLKS(sbi, (sbi)->segs_per_sec)) #define SEGS_PER_SEC(sbi) ((sbi)->segs_per_sec) __printf(3, 4) void f2fs_printk(struct f2fs_sb_info *sbi, bool limit_rate, const char *fmt, ...); #define f2fs_err(sbi, fmt, ...) \ f2fs_printk(sbi, false, KERN_ERR fmt, ##__VA_ARGS__) #define f2fs_warn(sbi, fmt, ...) \ f2fs_printk(sbi, false, KERN_WARNING fmt, ##__VA_ARGS__) #define f2fs_notice(sbi, fmt, ...) \ f2fs_printk(sbi, false, KERN_NOTICE fmt, ##__VA_ARGS__) #define f2fs_info(sbi, fmt, ...) \ f2fs_printk(sbi, false, KERN_INFO fmt, ##__VA_ARGS__) #define f2fs_debug(sbi, fmt, ...) \ f2fs_printk(sbi, false, KERN_DEBUG fmt, ##__VA_ARGS__) #define f2fs_err_ratelimited(sbi, fmt, ...) \ f2fs_printk(sbi, true, KERN_ERR fmt, ##__VA_ARGS__) #define f2fs_warn_ratelimited(sbi, fmt, ...) \ f2fs_printk(sbi, true, KERN_WARNING fmt, ##__VA_ARGS__) #define f2fs_info_ratelimited(sbi, fmt, ...) \ f2fs_printk(sbi, true, KERN_INFO fmt, ##__VA_ARGS__) #ifdef CONFIG_F2FS_FAULT_INJECTION #define time_to_inject(sbi, type) __time_to_inject(sbi, type, __func__, \ __builtin_return_address(0)) static inline bool __time_to_inject(struct f2fs_sb_info *sbi, int type, const char *func, const char *parent_func) { struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info; if (!ffi->inject_rate) return false; if (!IS_FAULT_SET(ffi, type)) return false; atomic_inc(&ffi->inject_ops); if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) { atomic_set(&ffi->inject_ops, 0); ffi->inject_count[type]++; f2fs_info_ratelimited(sbi, "inject %s in %s of %pS", f2fs_fault_name[type], func, parent_func); return true; } return false; } #else static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) { return false; } #endif /* * Test if the mounted volume is a multi-device volume. * - For a single regular disk volume, sbi->s_ndevs is 0. * - For a single zoned disk volume, sbi->s_ndevs is 1. * - For a multi-device volume, sbi->s_ndevs is always 2 or more. */ static inline bool f2fs_is_multi_device(struct f2fs_sb_info *sbi) { return sbi->s_ndevs > 1; } static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) { unsigned long now = jiffies; sbi->last_time[type] = now; /* DISCARD_TIME and GC_TIME are based on REQ_TIME */ if (type == REQ_TIME) { sbi->last_time[DISCARD_TIME] = now; sbi->last_time[GC_TIME] = now; } } static inline bool f2fs_time_over(struct f2fs_sb_info *sbi, int type) { unsigned long interval = sbi->interval_time[type] * HZ; return time_after(jiffies, sbi->last_time[type] + interval); } static inline unsigned int f2fs_time_to_wait(struct f2fs_sb_info *sbi, int type) { unsigned long interval = sbi->interval_time[type] * HZ; unsigned int wait_ms = 0; long delta; delta = (sbi->last_time[type] + interval) - jiffies; if (delta > 0) wait_ms = jiffies_to_msecs(delta); return wait_ms; } /* * Inline functions */ static inline u32 __f2fs_crc32(u32 crc, const void *address, unsigned int length) { return crc32(crc, address, length); } static inline u32 f2fs_crc32(const void *address, unsigned int length) { return __f2fs_crc32(F2FS_SUPER_MAGIC, address, length); } static inline u32 f2fs_chksum(u32 crc, const void *address, unsigned int length) { return __f2fs_crc32(crc, address, length); } static inline struct f2fs_inode_info *F2FS_I(struct inode *inode) { return container_of(inode, struct f2fs_inode_info, vfs_inode); } static inline struct f2fs_sb_info *F2FS_SB(struct super_block *sb) { return sb->s_fs_info; } static inline struct f2fs_sb_info *F2FS_I_SB(struct inode *inode) { return F2FS_SB(inode->i_sb); } static inline struct f2fs_sb_info *F2FS_M_SB(struct address_space *mapping) { return F2FS_I_SB(mapping->host); } static inline struct f2fs_sb_info *F2FS_F_SB(const struct folio *folio) { return F2FS_M_SB(folio->mapping); } static inline struct f2fs_super_block *F2FS_RAW_SUPER(struct f2fs_sb_info *sbi) { return (struct f2fs_super_block *)(sbi->raw_super); } static inline struct f2fs_super_block *F2FS_SUPER_BLOCK(struct folio *folio, pgoff_t index) { pgoff_t idx_in_folio = index % folio_nr_pages(folio); return (struct f2fs_super_block *) (page_address(folio_page(folio, idx_in_folio)) + F2FS_SUPER_OFFSET); } static inline struct f2fs_checkpoint *F2FS_CKPT(struct f2fs_sb_info *sbi) { return (struct f2fs_checkpoint *)(sbi->ckpt); } static inline struct f2fs_node *F2FS_NODE(const struct folio *folio) { return (struct f2fs_node *)folio_address(folio); } static inline struct f2fs_inode *F2FS_INODE(const struct folio *folio) { return &((struct f2fs_node *)folio_address(folio))->i; } static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi) { return (struct f2fs_nm_info *)(sbi->nm_info); } static inline struct f2fs_sm_info *SM_I(struct f2fs_sb_info *sbi) { return (struct f2fs_sm_info *)(sbi->sm_info); } static inline struct sit_info *SIT_I(struct f2fs_sb_info *sbi) { return (struct sit_info *)(SM_I(sbi)->sit_info); } static inline struct free_segmap_info *FREE_I(struct f2fs_sb_info *sbi) { return (struct free_segmap_info *)(SM_I(sbi)->free_info); } static inline struct dirty_seglist_info *DIRTY_I(struct f2fs_sb_info *sbi) { return (struct dirty_seglist_info *)(SM_I(sbi)->dirty_info); } static inline struct address_space *META_MAPPING(struct f2fs_sb_info *sbi) { return sbi->meta_inode->i_mapping; } static inline struct address_space *NODE_MAPPING(struct f2fs_sb_info *sbi) { return sbi->node_inode->i_mapping; } static inline bool is_meta_folio(struct folio *folio) { return folio->mapping == META_MAPPING(F2FS_F_SB(folio)); } static inline bool is_node_folio(struct folio *folio) { return folio->mapping == NODE_MAPPING(F2FS_F_SB(folio)); } static inline bool is_sbi_flag_set(struct f2fs_sb_info *sbi, unsigned int type) { return test_bit(type, &sbi->s_flag); } static inline void set_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type) { set_bit(type, &sbi->s_flag); } static inline void clear_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type) { clear_bit(type, &sbi->s_flag); } static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) { return le64_to_cpu(cp->checkpoint_ver); } static inline unsigned long f2fs_qf_ino(struct super_block *sb, int type) { if (type < F2FS_MAX_QUOTAS) return le32_to_cpu(F2FS_SB(sb)->raw_super->qf_ino[type]); return 0; } static inline __u64 cur_cp_crc(struct f2fs_checkpoint *cp) { size_t crc_offset = le32_to_cpu(cp->checksum_offset); return le32_to_cpu(*((__le32 *)((unsigned char *)cp + crc_offset))); } static inline bool __is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) { unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); return ckpt_flags & f; } static inline bool is_set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { return __is_set_ckpt_flags(F2FS_CKPT(sbi), f); } static inline void __set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) { unsigned int ckpt_flags; ckpt_flags = le32_to_cpu(cp->ckpt_flags); ckpt_flags |= f; cp->ckpt_flags = cpu_to_le32(ckpt_flags); } static inline void set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { unsigned long flags; spin_lock_irqsave(&sbi->cp_lock, flags); __set_ckpt_flags(F2FS_CKPT(sbi), f); spin_unlock_irqrestore(&sbi->cp_lock, flags); } static inline void __clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) { unsigned int ckpt_flags; ckpt_flags = le32_to_cpu(cp->ckpt_flags); ckpt_flags &= (~f); cp->ckpt_flags = cpu_to_le32(ckpt_flags); } static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { unsigned long flags; spin_lock_irqsave(&sbi->cp_lock, flags); __clear_ckpt_flags(F2FS_CKPT(sbi), f); spin_unlock_irqrestore(&sbi->cp_lock, flags); } #define init_f2fs_rwsem(sem) __init_f2fs_rwsem(sem, NULL, LOCK_NAME_NONE) #define init_f2fs_rwsem_trace __init_f2fs_rwsem #define __init_f2fs_rwsem(sem, sbi, name) \ do { \ static struct lock_class_key __key; \ \ do_init_f2fs_rwsem((sem), #sem, &__key, sbi, name); \ } while (0) static inline void do_init_f2fs_rwsem(struct f2fs_rwsem *sem, const char *sem_name, struct lock_class_key *key, struct f2fs_sb_info *sbi, enum f2fs_lock_name name) { sem->sbi = sbi; sem->name = name; __init_rwsem(&sem->internal_rwsem, sem_name, key); #ifdef CONFIG_F2FS_UNFAIR_RWSEM init_waitqueue_head(&sem->read_waiters); #endif } static inline int f2fs_rwsem_is_locked(struct f2fs_rwsem *sem) { return rwsem_is_locked(&sem->internal_rwsem); } static inline int f2fs_rwsem_is_contended(struct f2fs_rwsem *sem) { return rwsem_is_contended(&sem->internal_rwsem); } static inline void f2fs_down_read(struct f2fs_rwsem *sem) { #ifdef CONFIG_F2FS_UNFAIR_RWSEM wait_event(sem->read_waiters, down_read_trylock(&sem->internal_rwsem)); #else down_read(&sem->internal_rwsem); #endif } static inline int f2fs_down_read_trylock(struct f2fs_rwsem *sem) { return down_read_trylock(&sem->internal_rwsem); } static inline void f2fs_up_read(struct f2fs_rwsem *sem) { up_read(&sem->internal_rwsem); } static inline void f2fs_down_write(struct f2fs_rwsem *sem) { down_write(&sem->internal_rwsem); } #ifdef CONFIG_DEBUG_LOCK_ALLOC static inline void f2fs_down_read_nested(struct f2fs_rwsem *sem, int subclass) { down_read_nested(&sem->internal_rwsem, subclass); } static inline void f2fs_down_write_nested(struct f2fs_rwsem *sem, int subclass) { down_write_nested(&sem->internal_rwsem, subclass); } #else #define f2fs_down_read_nested(sem, subclass) f2fs_down_read(sem) #define f2fs_down_write_nested(sem, subclass) f2fs_down_write(sem) #endif static inline int f2fs_down_write_trylock(struct f2fs_rwsem *sem) { return down_write_trylock(&sem->internal_rwsem); } static inline void f2fs_up_write(struct f2fs_rwsem *sem) { up_write(&sem->internal_rwsem); #ifdef CONFIG_F2FS_UNFAIR_RWSEM wake_up_all(&sem->read_waiters); #endif } void f2fs_down_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc); int f2fs_down_read_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc); void f2fs_up_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc); void f2fs_down_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc); int f2fs_down_write_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc); void f2fs_up_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc); static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock) { unsigned long flags; unsigned char *nat_bits; /* * In order to re-enable nat_bits we need to call fsck.f2fs by * set_sbi_flag(sbi, SBI_NEED_FSCK). But it may give huge cost, * so let's rely on regular fsck or unclean shutdown. */ if (lock) spin_lock_irqsave(&sbi->cp_lock, flags); __clear_ckpt_flags(F2FS_CKPT(sbi), CP_NAT_BITS_FLAG); nat_bits = NM_I(sbi)->nat_bits; NM_I(sbi)->nat_bits = NULL; if (lock) spin_unlock_irqrestore(&sbi->cp_lock, flags); kvfree(nat_bits); } static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi, struct cp_control *cpc) { bool set = is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG); return (cpc) ? (cpc->reason & CP_UMOUNT) && set : set; } static inline int __get_cp_reason(struct f2fs_sb_info *sbi) { int reason = CP_SYNC; if (test_opt(sbi, FASTBOOT)) reason = CP_FASTBOOT; if (is_sbi_flag_set(sbi, SBI_IS_CLOSE)) reason = CP_UMOUNT; return reason; } static inline bool __remain_node_summaries(int reason) { return (reason & (CP_UMOUNT | CP_FASTBOOT)); } static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi) { return (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG) || is_set_ckpt_flags(sbi, CP_FASTBOOT_FLAG)); } /* * Check whether the inode has blocks or not */ static inline int F2FS_HAS_BLOCKS(struct inode *inode) { block_t xattr_block = F2FS_I(inode)->i_xattr_nid ? 1 : 0; return (inode->i_blocks >> F2FS_LOG_SECTORS_PER_BLOCK) > xattr_block; } static inline bool f2fs_has_xattr_block(unsigned int ofs) { return ofs == XATTR_NODE_OFFSET; } static inline bool __allow_reserved_root(struct f2fs_sb_info *sbi, struct inode *inode, bool cap) { if (!inode) return true; if (IS_NOQUOTA(inode)) return true; if (uid_eq(F2FS_OPTION(sbi).s_resuid, current_fsuid())) return true; if (!gid_eq(F2FS_OPTION(sbi).s_resgid, GLOBAL_ROOT_GID) && in_group_p(F2FS_OPTION(sbi).s_resgid)) return true; if (cap && capable(CAP_SYS_RESOURCE)) return true; return false; } static inline unsigned int get_available_block_count(struct f2fs_sb_info *sbi, struct inode *inode, bool cap) { block_t avail_user_block_count; avail_user_block_count = sbi->user_block_count - sbi->current_reserved_blocks; if (test_opt(sbi, RESERVE_ROOT) && !__allow_reserved_root(sbi, inode, cap)) avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { if (avail_user_block_count > sbi->unusable_block_count) avail_user_block_count -= sbi->unusable_block_count; else avail_user_block_count = 0; } return avail_user_block_count; } static inline void f2fs_i_blocks_write(struct inode *, block_t, bool, bool); static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, blkcnt_t *count, bool partial) { long long diff = 0, release = 0; block_t avail_user_block_count; int ret; ret = dquot_reserve_block(inode, *count); if (ret) return ret; if (time_to_inject(sbi, FAULT_BLOCK)) { release = *count; goto release_quota; } /* * let's increase this in prior to actual block count change in order * for f2fs_sync_file to avoid data races when deciding checkpoint. */ percpu_counter_add(&sbi->alloc_valid_block_count, (*count)); spin_lock(&sbi->stat_lock); avail_user_block_count = get_available_block_count(sbi, inode, true); diff = (long long)sbi->total_valid_block_count + *count - avail_user_block_count; if (unlikely(diff > 0)) { if (!partial) { spin_unlock(&sbi->stat_lock); release = *count; goto enospc; } if (diff > *count) diff = *count; *count -= diff; release = diff; if (!*count) { spin_unlock(&sbi->stat_lock); goto enospc; } } sbi->total_valid_block_count += (block_t)(*count); spin_unlock(&sbi->stat_lock); if (unlikely(release)) { percpu_counter_sub(&sbi->alloc_valid_block_count, release); dquot_release_reservation_block(inode, release); } f2fs_i_blocks_write(inode, *count, true, true); return 0; enospc: percpu_counter_sub(&sbi->alloc_valid_block_count, release); release_quota: dquot_release_reservation_block(inode, release); return -ENOSPC; } #define PAGE_PRIVATE_GET_FUNC(name, flagname) \ static inline bool folio_test_f2fs_##name(const struct folio *folio) \ { \ unsigned long priv = (unsigned long)folio->private; \ unsigned long v = (1UL << PAGE_PRIVATE_NOT_POINTER) | \ (1UL << PAGE_PRIVATE_##flagname); \ return (priv & v) == v; \ } \ static inline bool page_private_##name(struct page *page) \ { \ return PagePrivate(page) && \ test_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)) && \ test_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ } #define PAGE_PRIVATE_SET_FUNC(name, flagname) \ static inline void folio_set_f2fs_##name(struct folio *folio) \ { \ unsigned long v = (1UL << PAGE_PRIVATE_NOT_POINTER) | \ (1UL << PAGE_PRIVATE_##flagname); \ if (!folio->private) \ folio_attach_private(folio, (void *)v); \ else { \ v |= (unsigned long)folio->private; \ folio->private = (void *)v; \ } \ } \ static inline void set_page_private_##name(struct page *page) \ { \ if (!PagePrivate(page)) \ attach_page_private(page, (void *)0); \ set_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)); \ set_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ } #define PAGE_PRIVATE_CLEAR_FUNC(name, flagname) \ static inline void folio_clear_f2fs_##name(struct folio *folio) \ { \ unsigned long v = (unsigned long)folio->private; \ \ v &= ~(1UL << PAGE_PRIVATE_##flagname); \ if (v == (1UL << PAGE_PRIVATE_NOT_POINTER)) \ folio_detach_private(folio); \ else \ folio->private = (void *)v; \ } \ static inline void clear_page_private_##name(struct page *page) \ { \ clear_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ if (page_private(page) == BIT(PAGE_PRIVATE_NOT_POINTER)) \ detach_page_private(page); \ } PAGE_PRIVATE_GET_FUNC(nonpointer, NOT_POINTER); PAGE_PRIVATE_GET_FUNC(inline, INLINE_INODE); PAGE_PRIVATE_GET_FUNC(gcing, ONGOING_MIGRATION); PAGE_PRIVATE_GET_FUNC(atomic, ATOMIC_WRITE); PAGE_PRIVATE_SET_FUNC(reference, REF_RESOURCE); PAGE_PRIVATE_SET_FUNC(inline, INLINE_INODE); PAGE_PRIVATE_SET_FUNC(gcing, ONGOING_MIGRATION); PAGE_PRIVATE_SET_FUNC(atomic, ATOMIC_WRITE); PAGE_PRIVATE_CLEAR_FUNC(reference, REF_RESOURCE); PAGE_PRIVATE_CLEAR_FUNC(inline, INLINE_INODE); PAGE_PRIVATE_CLEAR_FUNC(gcing, ONGOING_MIGRATION); PAGE_PRIVATE_CLEAR_FUNC(atomic, ATOMIC_WRITE); static inline unsigned long folio_get_f2fs_data(struct folio *folio) { unsigned long data = (unsigned long)folio->private; if (!test_bit(PAGE_PRIVATE_NOT_POINTER, &data)) return 0; return data >> PAGE_PRIVATE_MAX; } static inline void folio_set_f2fs_data(struct folio *folio, unsigned long data) { data = (1UL << PAGE_PRIVATE_NOT_POINTER) | (data << PAGE_PRIVATE_MAX); if (!folio_test_private(folio)) folio_attach_private(folio, (void *)data); else folio->private = (void *)((unsigned long)folio->private | data); } static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, block_t count) { blkcnt_t sectors = count << F2FS_LOG_SECTORS_PER_BLOCK; spin_lock(&sbi->stat_lock); if (unlikely(sbi->total_valid_block_count < count)) { f2fs_warn(sbi, "Inconsistent total_valid_block_count:%u, ino:%llu, count:%u", sbi->total_valid_block_count, inode->i_ino, count); sbi->total_valid_block_count = 0; set_sbi_flag(sbi, SBI_NEED_FSCK); } else { sbi->total_valid_block_count -= count; } if (sbi->reserved_blocks && sbi->current_reserved_blocks < sbi->reserved_blocks) sbi->current_reserved_blocks = min(sbi->reserved_blocks, sbi->current_reserved_blocks + count); spin_unlock(&sbi->stat_lock); if (unlikely(inode->i_blocks < sectors)) { f2fs_warn(sbi, "Inconsistent i_blocks, ino:%llu, iblocks:%llu, sectors:%llu", inode->i_ino, (unsigned long long)inode->i_blocks, (unsigned long long)sectors); set_sbi_flag(sbi, SBI_NEED_FSCK); return; } f2fs_i_blocks_write(inode, count, false, true); } static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) { atomic_inc(&sbi->nr_pages[count_type]); if (count_type == F2FS_DIRTY_DENTS || count_type == F2FS_DIRTY_NODES || count_type == F2FS_DIRTY_META || count_type == F2FS_DIRTY_QDATA || count_type == F2FS_DIRTY_IMETA) set_sbi_flag(sbi, SBI_IS_DIRTY); } static inline void inode_inc_dirty_pages(struct inode *inode) { atomic_inc(&F2FS_I(inode)->dirty_pages); inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); if (IS_NOQUOTA(inode)) inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_QDATA); } static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) { atomic_dec(&sbi->nr_pages[count_type]); } static inline void inode_dec_dirty_pages(struct inode *inode) { if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) return; atomic_dec(&F2FS_I(inode)->dirty_pages); dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); if (IS_NOQUOTA(inode)) dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_QDATA); } static inline void inc_atomic_write_cnt(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); u64 current_write; fi->atomic_write_cnt++; atomic64_inc(&sbi->current_atomic_write); current_write = atomic64_read(&sbi->current_atomic_write); if (current_write > sbi->peak_atomic_write) sbi->peak_atomic_write = current_write; } static inline void release_atomic_write_cnt(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); atomic64_sub(fi->atomic_write_cnt, &sbi->current_atomic_write); fi->atomic_write_cnt = 0; } static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type) { return atomic_read(&sbi->nr_pages[count_type]); } static inline int get_dirty_pages(struct inode *inode) { return atomic_read(&F2FS_I(inode)->dirty_pages); } static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) { return div_u64(get_pages(sbi, block_type) + BLKS_PER_SEC(sbi) - 1, BLKS_PER_SEC(sbi)); } static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi) { return sbi->total_valid_block_count; } static inline block_t discard_blocks(struct f2fs_sb_info *sbi) { return sbi->discard_blks; } static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); /* return NAT or SIT bitmap */ if (flag == NAT_BITMAP) return le32_to_cpu(ckpt->nat_ver_bitmap_bytesize); else if (flag == SIT_BITMAP) return le32_to_cpu(ckpt->sit_ver_bitmap_bytesize); return 0; } static inline block_t __cp_payload(struct f2fs_sb_info *sbi) { return le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload); } static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); void *tmp_ptr = &ckpt->sit_nat_version_bitmap; int offset; if (is_set_ckpt_flags(sbi, CP_LARGE_NAT_BITMAP_FLAG)) { offset = (flag == SIT_BITMAP) ? le32_to_cpu(ckpt->nat_ver_bitmap_bytesize) : 0; /* * if large_nat_bitmap feature is enabled, leave checksum * protection for all nat/sit bitmaps. */ return tmp_ptr + offset + sizeof(__le32); } if (__cp_payload(sbi) > 0) { if (flag == NAT_BITMAP) return tmp_ptr; else return (unsigned char *)ckpt + F2FS_BLKSIZE; } else { offset = (flag == NAT_BITMAP) ? le32_to_cpu(ckpt->sit_ver_bitmap_bytesize) : 0; return tmp_ptr + offset; } } static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi) { block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); if (sbi->cur_cp_pack == 2) start_addr += BLKS_PER_SEG(sbi); return start_addr; } static inline block_t __start_cp_next_addr(struct f2fs_sb_info *sbi) { block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); if (sbi->cur_cp_pack == 1) start_addr += BLKS_PER_SEG(sbi); return start_addr; } static inline void __set_cp_next_pack(struct f2fs_sb_info *sbi) { sbi->cur_cp_pack = (sbi->cur_cp_pack == 1) ? 2 : 1; } static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) { return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); } static inline bool __has_cursum_space(struct f2fs_sb_info *sbi, struct f2fs_journal *journal, unsigned int size, int type) { if (type == NAT_JOURNAL) return size <= MAX_NAT_JENTRIES(sbi, journal); return size <= MAX_SIT_JENTRIES(sbi, journal); } extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync); static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, struct inode *inode, bool is_inode) { block_t valid_block_count; unsigned int valid_node_count, avail_user_node_count; unsigned int avail_user_block_count; int err; if (is_inode) { if (inode) { err = dquot_alloc_inode(inode); if (err) return err; } } else { err = dquot_reserve_block(inode, 1); if (err) return err; } if (time_to_inject(sbi, FAULT_BLOCK)) goto enospc; spin_lock(&sbi->stat_lock); valid_block_count = sbi->total_valid_block_count + 1; avail_user_block_count = get_available_block_count(sbi, inode, test_opt(sbi, RESERVE_NODE)); if (unlikely(valid_block_count > avail_user_block_count)) { spin_unlock(&sbi->stat_lock); goto enospc; } avail_user_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; if (test_opt(sbi, RESERVE_NODE) && !__allow_reserved_root(sbi, inode, true)) avail_user_node_count -= F2FS_OPTION(sbi).root_reserved_nodes; valid_node_count = sbi->total_valid_node_count + 1; if (unlikely(valid_node_count > avail_user_node_count)) { spin_unlock(&sbi->stat_lock); goto enospc; } sbi->total_valid_node_count++; sbi->total_valid_block_count++; spin_unlock(&sbi->stat_lock); if (inode) { if (is_inode) f2fs_mark_inode_dirty_sync(inode, true); else f2fs_i_blocks_write(inode, 1, true, true); } percpu_counter_inc(&sbi->alloc_valid_block_count); return 0; enospc: if (is_inode) { if (inode) dquot_free_inode(inode); } else { dquot_release_reservation_block(inode, 1); } return -ENOSPC; } static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, struct inode *inode, bool is_inode) { spin_lock(&sbi->stat_lock); if (unlikely(!sbi->total_valid_block_count || !sbi->total_valid_node_count)) { f2fs_warn(sbi, "dec_valid_node_count: inconsistent block counts, total_valid_block:%u, total_valid_node:%u", sbi->total_valid_block_count, sbi->total_valid_node_count); set_sbi_flag(sbi, SBI_NEED_FSCK); } else { sbi->total_valid_block_count--; sbi->total_valid_node_count--; } if (sbi->reserved_blocks && sbi->current_reserved_blocks < sbi->reserved_blocks) sbi->current_reserved_blocks++; spin_unlock(&sbi->stat_lock); if (is_inode) { dquot_free_inode(inode); } else { if (unlikely(inode->i_blocks == 0)) { f2fs_warn(sbi, "dec_valid_node_count: inconsistent i_blocks, ino:%llu, iblocks:%llu", inode->i_ino, (unsigned long long)inode->i_blocks); set_sbi_flag(sbi, SBI_NEED_FSCK); return; } f2fs_i_blocks_write(inode, 1, false, true); } } static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi) { return sbi->total_valid_node_count; } static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi) { percpu_counter_inc(&sbi->total_valid_inode_count); } static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi) { percpu_counter_dec(&sbi->total_valid_inode_count); } static inline s64 valid_inode_count(struct f2fs_sb_info *sbi) { return percpu_counter_sum_positive(&sbi->total_valid_inode_count); } static inline struct folio *f2fs_grab_cache_folio(struct address_space *mapping, pgoff_t index, bool for_write) { struct folio *folio; unsigned int flags; if (IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION)) { fgf_t fgf_flags; if (!for_write) fgf_flags = FGP_LOCK | FGP_ACCESSED; else fgf_flags = FGP_LOCK; folio = __filemap_get_folio(mapping, index, fgf_flags, 0); if (!IS_ERR(folio)) return folio; if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) return ERR_PTR(-ENOMEM); } if (!for_write) return filemap_grab_folio(mapping, index); flags = memalloc_nofs_save(); folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); memalloc_nofs_restore(flags); return folio; } static inline struct folio *f2fs_filemap_get_folio( struct address_space *mapping, pgoff_t index, fgf_t fgp_flags, gfp_t gfp_mask) { if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET)) return ERR_PTR(-ENOMEM); return __filemap_get_folio(mapping, index, fgp_flags, gfp_mask); } static inline void f2fs_folio_put(struct folio *folio, bool unlock) { if (IS_ERR_OR_NULL(folio)) return; if (unlock) { f2fs_bug_on(F2FS_F_SB(folio), !folio_test_locked(folio)); folio_unlock(folio); } folio_put(folio); } static inline void f2fs_put_page(struct page *page, bool unlock) { if (!page) return; f2fs_folio_put(page_folio(page), unlock); } static inline void f2fs_put_dnode(struct dnode_of_data *dn) { if (dn->node_folio) f2fs_folio_put(dn->node_folio, true); if (dn->inode_folio && dn->node_folio != dn->inode_folio) f2fs_folio_put(dn->inode_folio, false); dn->node_folio = NULL; dn->inode_folio = NULL; } static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name, size_t size) { return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, NULL); } static inline void *f2fs_kmem_cache_alloc_nofail(struct kmem_cache *cachep, gfp_t flags) { void *entry; entry = kmem_cache_alloc(cachep, flags); if (!entry) entry = kmem_cache_alloc(cachep, flags | __GFP_NOFAIL); return entry; } static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags, bool nofail, struct f2fs_sb_info *sbi) { if (nofail) return f2fs_kmem_cache_alloc_nofail(cachep, flags); if (time_to_inject(sbi, FAULT_SLAB_ALLOC)) return NULL; return kmem_cache_alloc(cachep, flags); } static inline bool is_inflight_io(struct f2fs_sb_info *sbi, int type) { if (get_pages(sbi, F2FS_RD_DATA) || get_pages(sbi, F2FS_RD_NODE) || get_pages(sbi, F2FS_RD_META) || get_pages(sbi, F2FS_WB_DATA) || get_pages(sbi, F2FS_WB_CP_DATA) || get_pages(sbi, F2FS_DIO_READ) || get_pages(sbi, F2FS_DIO_WRITE)) return true; if (type != DISCARD_TIME && SM_I(sbi) && SM_I(sbi)->dcc_info && atomic_read(&SM_I(sbi)->dcc_info->queued_discard)) return true; if (SM_I(sbi) && SM_I(sbi)->fcc_info && atomic_read(&SM_I(sbi)->fcc_info->queued_flush)) return true; return false; } static inline bool is_inflight_read_io(struct f2fs_sb_info *sbi) { return get_pages(sbi, F2FS_RD_DATA) || get_pages(sbi, F2FS_DIO_READ); } static inline bool is_idle(struct f2fs_sb_info *sbi, int type) { bool zoned_gc = (type == GC_TIME && F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_BLKZONED)); if (sbi->gc_mode == GC_URGENT_HIGH) return true; if (sbi->bggc_io_aware == AWARE_READ_IO && is_inflight_read_io(sbi)) return false; if (sbi->bggc_io_aware == AWARE_ALL_IO && is_inflight_io(sbi, type)) return false; if (sbi->gc_mode == GC_URGENT_MID) return true; if (sbi->gc_mode == GC_URGENT_LOW && (type == DISCARD_TIME || type == GC_TIME)) return true; if (zoned_gc) return true; return f2fs_time_over(sbi, type); } static inline void f2fs_radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *item) { while (radix_tree_insert(root, index, item)) cond_resched(); } #define RAW_IS_INODE(p) ((p)->footer.nid == (p)->footer.ino) static inline bool IS_INODE(const struct folio *folio) { struct f2fs_node *p = F2FS_NODE(folio); return RAW_IS_INODE(p); } static inline int offset_in_addr(struct f2fs_inode *i) { return (i->i_inline & F2FS_EXTRA_ATTR) ? (le16_to_cpu(i->i_extra_isize) / sizeof(__le32)) : 0; } static inline __le32 *blkaddr_in_node(struct f2fs_node *node) { return RAW_IS_INODE(node) ? node->i.i_addr : node->dn.addr; } static inline int f2fs_has_extra_attr(struct inode *inode); static inline unsigned int get_dnode_base(struct inode *inode, struct folio *node_folio) { if (!IS_INODE(node_folio)) return 0; return inode ? get_extra_isize(inode) : offset_in_addr(&F2FS_NODE(node_folio)->i); } static inline __le32 *get_dnode_addr(struct inode *inode, struct folio *node_folio) { return blkaddr_in_node(F2FS_NODE(node_folio)) + get_dnode_base(inode, node_folio); } static inline block_t data_blkaddr(struct inode *inode, struct folio *node_folio, unsigned int offset) { return le32_to_cpu(*(get_dnode_addr(inode, node_folio) + offset)); } static inline block_t f2fs_data_blkaddr(struct dnode_of_data *dn) { return data_blkaddr(dn->inode, dn->node_folio, dn->ofs_in_node); } static inline int f2fs_test_bit(unsigned int nr, char *addr) { int mask; addr += (nr >> 3); mask = BIT(7 - (nr & 0x07)); return mask & *addr; } static inline void f2fs_set_bit(unsigned int nr, char *addr) { int mask; addr += (nr >> 3); mask = BIT(7 - (nr & 0x07)); *addr |= mask; } static inline void f2fs_clear_bit(unsigned int nr, char *addr) { int mask; addr += (nr >> 3); mask = BIT(7 - (nr & 0x07)); *addr &= ~mask; } static inline int f2fs_test_and_set_bit(unsigned int nr, char *addr) { int mask; int ret; addr += (nr >> 3); mask = BIT(7 - (nr & 0x07)); ret = mask & *addr; *addr |= mask; return ret; } static inline int f2fs_test_and_clear_bit(unsigned int nr, char *addr) { int mask; int ret; addr += (nr >> 3); mask = BIT(7 - (nr & 0x07)); ret = mask & *addr; *addr &= ~mask; return ret; } static inline void f2fs_change_bit(unsigned int nr, char *addr) { int mask; addr += (nr >> 3); mask = BIT(7 - (nr & 0x07)); *addr ^= mask; } /* * On-disk inode flags (f2fs_inode::i_flags) */ #define F2FS_COMPR_FL 0x00000004 /* Compress file */ #define F2FS_SYNC_FL 0x00000008 /* Synchronous updates */ #define F2FS_IMMUTABLE_FL 0x00000010 /* Immutable file */ #define F2FS_APPEND_FL 0x00000020 /* writes to file may only append */ #define F2FS_NODUMP_FL 0x00000040 /* do not dump file */ #define F2FS_NOATIME_FL 0x00000080 /* do not update atime */ #define F2FS_NOCOMP_FL 0x00000400 /* Don't compress */ #define F2FS_INDEX_FL 0x00001000 /* hash-indexed directory */ #define F2FS_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ #define F2FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define F2FS_CASEFOLD_FL 0x40000000 /* Casefolded file */ #define F2FS_DEVICE_ALIAS_FL 0x80000000 /* File for aliasing a device */ #define F2FS_QUOTA_DEFAULT_FL (F2FS_NOATIME_FL | F2FS_IMMUTABLE_FL) /* Flags that should be inherited by new inodes from their parent. */ #define F2FS_FL_INHERITED (F2FS_SYNC_FL | F2FS_NODUMP_FL | F2FS_NOATIME_FL | \ F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \ F2FS_CASEFOLD_FL) /* Flags that are appropriate for regular files (all but dir-specific ones). */ #define F2FS_REG_FLMASK (~(F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \ F2FS_CASEFOLD_FL)) /* Flags that are appropriate for non-directories/regular files. */ #define F2FS_OTHER_FLMASK (F2FS_NODUMP_FL | F2FS_NOATIME_FL) #define IS_DEVICE_ALIASING(inode) (F2FS_I(inode)->i_flags & F2FS_DEVICE_ALIAS_FL) static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) { if (S_ISDIR(mode)) return flags; else if (S_ISREG(mode)) return flags & F2FS_REG_FLMASK; else return flags & F2FS_OTHER_FLMASK; } static inline void __mark_inode_dirty_flag(struct inode *inode, int flag, bool set) { switch (flag) { case FI_INLINE_XATTR: case FI_INLINE_DATA: case FI_INLINE_DENTRY: case FI_NEW_INODE: if (set) return; fallthrough; case FI_DATA_EXIST: case FI_PIN_FILE: case FI_COMPRESS_RELEASED: f2fs_mark_inode_dirty_sync(inode, true); } } static inline void set_inode_flag(struct inode *inode, int flag) { set_bit(flag, F2FS_I(inode)->flags); __mark_inode_dirty_flag(inode, flag, true); } static inline int is_inode_flag_set(struct inode *inode, int flag) { return test_bit(flag, F2FS_I(inode)->flags); } static inline void clear_inode_flag(struct inode *inode, int flag) { clear_bit(flag, F2FS_I(inode)->flags); __mark_inode_dirty_flag(inode, flag, false); } static inline bool f2fs_verity_in_progress(struct inode *inode) { return IS_ENABLED(CONFIG_FS_VERITY) && is_inode_flag_set(inode, FI_VERITY_IN_PROGRESS); } static inline void set_acl_inode(struct inode *inode, umode_t mode) { F2FS_I(inode)->i_acl_mode = mode; set_inode_flag(inode, FI_ACL_MODE); f2fs_mark_inode_dirty_sync(inode, false); } static inline void f2fs_i_links_write(struct inode *inode, bool inc) { if (inc) inc_nlink(inode); else drop_nlink(inode); f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_blocks_write(struct inode *inode, block_t diff, bool add, bool claim) { bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER); /* add = 1, claim = 1 should be dquot_reserve_block in pair */ if (add) { if (claim) dquot_claim_block(inode, diff); else dquot_alloc_block_nofail(inode, diff); } else { dquot_free_block(inode, diff); } f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); } static inline bool f2fs_is_atomic_file(struct inode *inode); static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) { bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER); if (i_size_read(inode) == i_size) return; i_size_write(inode, i_size); if (f2fs_is_atomic_file(inode)) return; f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); } static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth) { F2FS_I(inode)->i_current_depth = depth; f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_gc_failures_write(struct inode *inode, unsigned int count) { F2FS_I(inode)->i_gc_failures = count; f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_xnid_write(struct inode *inode, nid_t xnid) { F2FS_I(inode)->i_xattr_nid = xnid; f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_pino_write(struct inode *inode, nid_t pino) { F2FS_I(inode)->i_pino = pino; f2fs_mark_inode_dirty_sync(inode, true); } static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) { struct f2fs_inode_info *fi = F2FS_I(inode); if (ri->i_inline & F2FS_INLINE_XATTR) set_bit(FI_INLINE_XATTR, fi->flags); if (ri->i_inline & F2FS_INLINE_DATA) set_bit(FI_INLINE_DATA, fi->flags); if (ri->i_inline & F2FS_INLINE_DENTRY) set_bit(FI_INLINE_DENTRY, fi->flags); if (ri->i_inline & F2FS_DATA_EXIST) set_bit(FI_DATA_EXIST, fi->flags); if (ri->i_inline & F2FS_EXTRA_ATTR) set_bit(FI_EXTRA_ATTR, fi->flags); if (ri->i_inline & F2FS_PIN_FILE) set_bit(FI_PIN_FILE, fi->flags); if (ri->i_inline & F2FS_COMPRESS_RELEASED) set_bit(FI_COMPRESS_RELEASED, fi->flags); } static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) { ri->i_inline = 0; if (is_inode_flag_set(inode, FI_INLINE_XATTR)) ri->i_inline |= F2FS_INLINE_XATTR; if (is_inode_flag_set(inode, FI_INLINE_DATA)) ri->i_inline |= F2FS_INLINE_DATA; if (is_inode_flag_set(inode, FI_INLINE_DENTRY)) ri->i_inline |= F2FS_INLINE_DENTRY; if (is_inode_flag_set(inode, FI_DATA_EXIST)) ri->i_inline |= F2FS_DATA_EXIST; if (is_inode_flag_set(inode, FI_EXTRA_ATTR)) ri->i_inline |= F2FS_EXTRA_ATTR; if (is_inode_flag_set(inode, FI_PIN_FILE)) ri->i_inline |= F2FS_PIN_FILE; if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) ri->i_inline |= F2FS_COMPRESS_RELEASED; } static inline int f2fs_has_extra_attr(struct inode *inode) { return is_inode_flag_set(inode, FI_EXTRA_ATTR); } static inline int f2fs_has_inline_xattr(struct inode *inode) { return is_inode_flag_set(inode, FI_INLINE_XATTR); } static inline int f2fs_compressed_file(struct inode *inode) { return S_ISREG(inode->i_mode) && is_inode_flag_set(inode, FI_COMPRESSED_FILE); } static inline bool f2fs_need_compress_data(struct inode *inode) { int compress_mode = F2FS_OPTION(F2FS_I_SB(inode)).compress_mode; if (!f2fs_compressed_file(inode)) return false; if (compress_mode == COMPR_MODE_FS) return true; else if (compress_mode == COMPR_MODE_USER && is_inode_flag_set(inode, FI_ENABLE_COMPRESS)) return true; return false; } static inline unsigned int addrs_per_page(struct inode *inode, bool is_inode) { unsigned int addrs = is_inode ? (CUR_ADDRS_PER_INODE(inode) - get_inline_xattr_addrs(inode)) : DEF_ADDRS_PER_BLOCK; if (f2fs_compressed_file(inode)) return ALIGN_DOWN(addrs, F2FS_I(inode)->i_cluster_size); return addrs; } static inline void *inline_xattr_addr(struct inode *inode, const struct folio *folio) { struct f2fs_inode *ri = F2FS_INODE(folio); return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE - get_inline_xattr_addrs(inode)]); } static inline int inline_xattr_size(struct inode *inode) { if (f2fs_has_inline_xattr(inode)) return get_inline_xattr_addrs(inode) * sizeof(__le32); return 0; } /* * Notice: check inline_data flag without inode page lock is unsafe. * It could change at any time by f2fs_convert_inline_folio(). */ static inline int f2fs_has_inline_data(struct inode *inode) { return is_inode_flag_set(inode, FI_INLINE_DATA); } static inline int f2fs_exist_data(struct inode *inode) { return is_inode_flag_set(inode, FI_DATA_EXIST); } static inline int f2fs_is_mmap_file(struct inode *inode) { return is_inode_flag_set(inode, FI_MMAP_FILE); } static inline bool f2fs_is_pinned_file(struct inode *inode) { return is_inode_flag_set(inode, FI_PIN_FILE); } static inline bool f2fs_is_atomic_file(struct inode *inode) { return is_inode_flag_set(inode, FI_ATOMIC_FILE); } static inline bool f2fs_is_cow_file(struct inode *inode) { return is_inode_flag_set(inode, FI_COW_FILE); } static inline void *inline_data_addr(struct inode *inode, struct folio *folio) { __le32 *addr = get_dnode_addr(inode, folio); return (void *)(addr + DEF_INLINE_RESERVED_SIZE); } static inline int f2fs_has_inline_dentry(struct inode *inode) { return is_inode_flag_set(inode, FI_INLINE_DENTRY); } static inline int is_file(struct inode *inode, int type) { return F2FS_I(inode)->i_advise & type; } static inline void set_file(struct inode *inode, int type) { if (is_file(inode, type)) return; F2FS_I(inode)->i_advise |= type; f2fs_mark_inode_dirty_sync(inode, true); } static inline void clear_file(struct inode *inode, int type) { if (!is_file(inode, type)) return; F2FS_I(inode)->i_advise &= ~type; f2fs_mark_inode_dirty_sync(inode, true); } static inline bool f2fs_is_time_consistent(struct inode *inode) { struct timespec64 ts = inode_get_atime(inode); if (!timespec64_equal(F2FS_I(inode)->i_disk_time, &ts)) return false; ts = inode_get_ctime(inode); if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &ts)) return false; ts = inode_get_mtime(inode); if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &ts)) return false; return true; } static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) { bool ret; if (dsync) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); spin_lock(&sbi->inode_lock[DIRTY_META]); ret = list_empty(&F2FS_I(inode)->gdirty_list); spin_unlock(&sbi->inode_lock[DIRTY_META]); return ret; } if (!is_inode_flag_set(inode, FI_AUTO_RECOVER) || file_keep_isize(inode) || i_size_read(inode) & ~PAGE_MASK) return false; if (!f2fs_is_time_consistent(inode)) return false; spin_lock(&F2FS_I(inode)->i_size_lock); ret = F2FS_I(inode)->last_disk_size == i_size_read(inode); spin_unlock(&F2FS_I(inode)->i_size_lock); return ret; } static inline bool f2fs_readonly(struct super_block *sb) { return sb_rdonly(sb); } static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi) { return is_set_ckpt_flags(sbi, CP_ERROR_FLAG); } static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { if (time_to_inject(sbi, FAULT_KMALLOC)) return NULL; return kmalloc(size, flags); } static inline void *f2fs_getname(struct f2fs_sb_info *sbi) { if (time_to_inject(sbi, FAULT_KMALLOC)) return NULL; return __getname(); } static inline void f2fs_putname(char *buf) { __putname(buf); } static inline void *f2fs_kzalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { return f2fs_kmalloc(sbi, size, flags | __GFP_ZERO); } static inline void *f2fs_kvmalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { if (time_to_inject(sbi, FAULT_KVMALLOC)) return NULL; return kvmalloc(size, flags); } static inline void *f2fs_kvzalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { return f2fs_kvmalloc(sbi, size, flags | __GFP_ZERO); } static inline void *f2fs_vmalloc(struct f2fs_sb_info *sbi, size_t size) { if (time_to_inject(sbi, FAULT_VMALLOC)) return NULL; return vmalloc(size); } static inline int get_extra_isize(struct inode *inode) { return F2FS_I(inode)->i_extra_isize / sizeof(__le32); } static inline int get_inline_xattr_addrs(struct inode *inode) { return F2FS_I(inode)->i_inline_xattr_size; } #define f2fs_get_inode_mode(i) \ ((is_inode_flag_set(i, FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) #define F2FS_MIN_EXTRA_ATTR_SIZE (sizeof(__le32)) #define F2FS_TOTAL_EXTRA_ATTR_SIZE \ (offsetof(struct f2fs_inode, i_extra_end) - \ offsetof(struct f2fs_inode, i_extra_isize)) \ #define F2FS_OLD_ATTRIBUTE_SIZE (offsetof(struct f2fs_inode, i_addr)) #define F2FS_FITS_IN_INODE(f2fs_inode, extra_isize, field) \ ((offsetof(typeof(*(f2fs_inode)), field) + \ sizeof((f2fs_inode)->field)) \ <= (F2FS_OLD_ATTRIBUTE_SIZE + (extra_isize))) \ #define __is_large_section(sbi) (SEGS_PER_SEC(sbi) > 1) #define __is_meta_io(fio) (PAGE_TYPE_OF_BIO((fio)->type) == META) bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); static inline void verify_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { if (!f2fs_is_valid_blkaddr(sbi, blkaddr, type)) f2fs_err(sbi, "invalid blkaddr: %u, type: %d, run fsck to fix.", blkaddr, type); } static inline bool __is_valid_data_blkaddr(block_t blkaddr) { if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR || blkaddr == COMPRESS_ADDR) return false; return true; } /* * file.c */ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock); int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock); int f2fs_truncate(struct inode *inode); int f2fs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end); void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count); int f2fs_do_shutdown(struct f2fs_sb_info *sbi, unsigned int flag, bool readonly, bool need_lock); int f2fs_precache_extents(struct inode *inode); int f2fs_fileattr_get(struct dentry *dentry, struct file_kattr *fa); int f2fs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct file_kattr *fa); long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int f2fs_transfer_project_quota(struct inode *inode, kprojid_t kprojid); int f2fs_pin_file_control(struct inode *inode, bool inc); /* * inode.c */ void f2fs_set_inode_flags(struct inode *inode); bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct folio *folio); void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct folio *folio); struct inode *f2fs_iget(struct super_block *sb, unsigned long ino); struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino); int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink); void f2fs_update_inode(struct inode *inode, struct folio *node_folio); void f2fs_update_inode_page(struct inode *inode); int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc); void f2fs_remove_donate_inode(struct inode *inode); void f2fs_evict_inode(struct inode *inode); void f2fs_handle_failed_inode(struct inode *inode, struct f2fs_lock_context *lc); /* * namei.c */ int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, bool hot, bool set); struct dentry *f2fs_get_parent(struct dentry *child); int f2fs_get_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct inode **new_inode); /* * dir.c */ #if IS_ENABLED(CONFIG_UNICODE) int f2fs_init_casefolded_name(const struct inode *dir, struct f2fs_filename *fname); void f2fs_free_casefolded_name(struct f2fs_filename *fname); #else static inline int f2fs_init_casefolded_name(const struct inode *dir, struct f2fs_filename *fname) { return 0; } static inline void f2fs_free_casefolded_name(struct f2fs_filename *fname) { } #endif /* CONFIG_UNICODE */ int f2fs_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct f2fs_filename *fname); int f2fs_prepare_lookup(struct inode *dir, struct dentry *dentry, struct f2fs_filename *fname); void f2fs_free_filename(struct f2fs_filename *fname); struct f2fs_dir_entry *f2fs_find_target_dentry(const struct f2fs_dentry_ptr *d, const struct f2fs_filename *fname, int *max_slots, bool use_hash); int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, unsigned int start_pos, struct fscrypt_str *fstr); void f2fs_do_make_empty_dir(struct inode *inode, struct inode *parent, struct f2fs_dentry_ptr *d); struct folio *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, const struct f2fs_filename *fname, struct folio *dfolio); void f2fs_update_parent_metadata(struct inode *dir, struct inode *inode, unsigned int current_depth); int f2fs_room_for_filename(const void *bitmap, int slots, int max_slots); void f2fs_drop_nlink(struct inode *dir, struct inode *inode); struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, const struct f2fs_filename *fname, struct folio **res_folio); struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, const struct qstr *child, struct folio **res_folio); struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct folio **f); ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, struct folio **folio); void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, struct folio *folio, struct inode *inode); bool f2fs_has_enough_room(struct inode *dir, struct folio *ifolio, const struct f2fs_filename *fname); void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, const struct fscrypt_str *name, f2fs_hash_t name_hash, unsigned int bit_pos); int f2fs_add_regular_entry(struct inode *dir, const struct f2fs_filename *fname, struct inode *inode, nid_t ino, umode_t mode); int f2fs_add_dentry(struct inode *dir, const struct f2fs_filename *fname, struct inode *inode, nid_t ino, umode_t mode); int f2fs_do_add_link(struct inode *dir, const struct qstr *name, struct inode *inode, nid_t ino, umode_t mode); void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct folio *folio, struct inode *dir, struct inode *inode); int f2fs_do_tmpfile(struct inode *inode, struct inode *dir, struct f2fs_filename *fname); bool f2fs_empty_dir(struct inode *dir); static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) { if (fscrypt_is_nokey_name(dentry)) return -ENOKEY; return f2fs_do_add_link(d_inode(dentry->d_parent), &dentry->d_name, inode, inode->i_ino, inode->i_mode); } /* * super.c */ int f2fs_inode_dirtied(struct inode *inode, bool sync); void f2fs_inode_synced(struct inode *inode); int f2fs_dquot_initialize(struct inode *inode); int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly); int f2fs_do_quota_sync(struct super_block *sb, int type); loff_t max_file_blocks(struct inode *inode); void f2fs_quota_off_umount(struct super_block *sb); void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag); void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi); /* * hash.c */ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname); /* * node.c */ struct node_info; enum node_type; int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type); bool f2fs_in_warm_node_list(struct folio *folio); void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi); void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct folio *folio); void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi); bool f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino); int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni, bool checkpoint_context); pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs); int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode); int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from); int f2fs_truncate_xattr_node(struct inode *inode); int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, unsigned int seq_id); int f2fs_remove_inode_page(struct inode *inode); struct folio *f2fs_new_inode_folio(struct inode *inode); struct folio *f2fs_new_node_folio(struct dnode_of_data *dn, unsigned int ofs); void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); struct folio *f2fs_get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid, enum node_type node_type); int f2fs_sanity_check_node_footer(struct f2fs_sb_info *sbi, struct folio *folio, pgoff_t nid, enum node_type ntype, bool in_irq); struct folio *f2fs_get_inode_folio(struct f2fs_sb_info *sbi, pgoff_t ino); struct folio *f2fs_get_xnode_folio(struct f2fs_sb_info *sbi, pgoff_t xnid); int f2fs_write_single_node_folio(struct folio *node_folio, int sync_mode, bool mark_dirty, enum iostat_type io_type); int f2fs_move_node_folio(struct folio *node_folio, int gc_type); void f2fs_flush_inline_data(struct f2fs_sb_info *sbi); int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic, unsigned int *seq_id); int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, bool do_balance, enum iostat_type io_type); int f2fs_build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount); bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid); void f2fs_alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); void f2fs_alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid); int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink); int f2fs_recover_inline_xattr(struct inode *inode, struct folio *folio); int f2fs_recover_xattr_data(struct inode *inode, struct folio *folio); int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct folio *folio); int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum); int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); int f2fs_build_node_manager(struct f2fs_sb_info *sbi); void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi); int __init f2fs_create_node_manager_caches(void); void f2fs_destroy_node_manager_caches(void); /* * segment.c */ bool f2fs_need_SSR(struct f2fs_sb_info *sbi); int f2fs_commit_atomic_write(struct inode *inode); void f2fs_abort_atomic_write(struct inode *inode, bool clean); void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg); int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino); int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi); int f2fs_flush_device_cache(struct f2fs_sb_info *sbi); void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr, unsigned int len); bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); int f2fs_start_discard_thread(struct f2fs_sb_info *sbi); void f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi); void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi); bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi, bool need_check); void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); void f2fs_dirty_to_prefree(struct f2fs_sb_info *sbi); block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi); int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable); void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi); int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno); int f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi); int f2fs_reinit_atgc_curseg(struct f2fs_sb_info *sbi); void f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi); void f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi); int f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, unsigned int start, unsigned int end); int f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force); int f2fs_allocate_pinning_section(struct f2fs_sb_info *sbi); int f2fs_allocate_new_segments(struct f2fs_sb_info *sbi); int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range); bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc); struct folio *f2fs_get_sum_folio(struct f2fs_sb_info *sbi, unsigned int segno); void f2fs_update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr); void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct folio *folio, enum iostat_type io_type); void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio); void f2fs_outplace_write_data(struct dnode_of_data *dn, struct f2fs_io_info *fio); int f2fs_inplace_write_data(struct f2fs_io_info *fio); void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, block_t old_blkaddr, block_t new_blkaddr, bool recover_curseg, bool recover_newaddr, bool from_gc); void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, block_t old_addr, block_t new_addr, unsigned char version, bool recover_curseg, bool recover_newaddr); enum temp_type f2fs_get_segment_temp(struct f2fs_sb_info *sbi, enum log_type seg_type); int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct folio *folio, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, struct f2fs_io_info *fio); void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino, block_t blkaddr, unsigned int blkcnt); void f2fs_folio_wait_writeback(struct folio *folio, enum page_type type, bool ordered, bool locked); #define f2fs_wait_on_page_writeback(page, type, ordered, locked) \ f2fs_folio_wait_writeback(page_folio(page), type, ordered, locked) void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr); void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr, block_t len); void f2fs_write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk); void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk); int f2fs_lookup_journal_in_cursum(struct f2fs_sb_info *sbi, struct f2fs_journal *journal, int type, unsigned int val, int alloc); void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); int f2fs_check_and_fix_write_pointer(struct f2fs_sb_info *sbi); int f2fs_build_segment_manager(struct f2fs_sb_info *sbi); void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi); int __init f2fs_create_segment_manager_caches(void); void f2fs_destroy_segment_manager_caches(void); int f2fs_rw_hint_to_seg_type(struct f2fs_sb_info *sbi, enum rw_hint hint); enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, enum page_type type, enum temp_type temp); unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi); unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi, unsigned int segno); unsigned long long f2fs_get_section_mtime(struct f2fs_sb_info *sbi, unsigned int segno); static inline struct inode *fio_inode(struct f2fs_io_info *fio) { return fio->folio->mapping->host; } #define DEF_FRAGMENT_SIZE 4 #define MIN_FRAGMENT_SIZE 1 #define MAX_FRAGMENT_SIZE 512 static inline bool f2fs_need_rand_seg(struct f2fs_sb_info *sbi) { return F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_SEG || F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK; } /* * checkpoint.c */ void f2fs_lock_op(struct f2fs_sb_info *sbi, struct f2fs_lock_context *lc); int f2fs_trylock_op(struct f2fs_sb_info *sbi, struct f2fs_lock_context *lc); void f2fs_unlock_op(struct f2fs_sb_info *sbi, struct f2fs_lock_context *lc); void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io, unsigned char reason); void f2fs_flush_ckpt_thread(struct f2fs_sb_info *sbi); struct folio *f2fs_grab_meta_folio(struct f2fs_sb_info *sbi, pgoff_t index); struct folio *f2fs_get_meta_folio(struct f2fs_sb_info *sbi, pgoff_t index); struct folio *f2fs_get_meta_folio_retry(struct f2fs_sb_info *sbi, pgoff_t index); struct folio *f2fs_get_tmp_folio(struct f2fs_sb_info *sbi, pgoff_t index); bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); bool f2fs_is_valid_blkaddr_raw(struct f2fs_sb_info *sbi, block_t blkaddr, int type); int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type, bool sync); void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index, unsigned int ra_blocks); long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, long nr_to_write, enum iostat_type io_type); void f2fs_add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); void f2fs_remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); void f2fs_release_ino_entry(struct f2fs_sb_info *sbi, bool all); bool f2fs_exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode); void f2fs_set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, unsigned int devidx, int type); bool f2fs_is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, unsigned int devidx, int type); int f2fs_acquire_orphan_inode(struct f2fs_sb_info *sbi); void f2fs_release_orphan_inode(struct f2fs_sb_info *sbi); void f2fs_add_orphan_inode(struct inode *inode); void f2fs_remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino); int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi); int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi); void f2fs_update_dirty_folio(struct inode *inode, struct folio *folio); void f2fs_remove_dirty_inode(struct inode *inode); int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type, bool from_cp); void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type); u64 f2fs_get_sectors_written(struct f2fs_sb_info *sbi); int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc); void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi); int __init f2fs_create_checkpoint_caches(void); void f2fs_destroy_checkpoint_caches(void); int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi); int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi); void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi); void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi); /* * data.c */ int __init f2fs_init_bioset(void); void f2fs_destroy_bioset(void); bool f2fs_is_cp_guaranteed(const struct folio *folio); int f2fs_init_bio_entry_cache(void); void f2fs_destroy_bio_entry_cache(void); void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio, enum page_type type); int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi); void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, struct folio *folio, nid_t ino, enum page_type type); void f2fs_submit_merged_write_folio(struct f2fs_sb_info *sbi, struct folio *folio, enum page_type type); void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, struct bio **bio, struct folio *folio); void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi); int f2fs_submit_page_bio(struct f2fs_io_info *fio); int f2fs_merge_page_bio(struct f2fs_io_info *fio); void f2fs_submit_page_write(struct f2fs_io_info *fio); struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, block_t blk_addr, sector_t *sector); int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr); void f2fs_set_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr); void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr); int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count); int f2fs_reserve_new_block(struct dnode_of_data *dn); int f2fs_get_block_locked(struct dnode_of_data *dn, pgoff_t index); int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); struct folio *f2fs_get_read_data_folio(struct inode *inode, pgoff_t index, blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs); struct folio *f2fs_find_data_folio(struct inode *inode, pgoff_t index, pgoff_t *next_pgofs); struct folio *f2fs_get_lock_data_folio(struct inode *inode, pgoff_t index, bool for_write); struct folio *f2fs_get_new_data_folio(struct inode *inode, struct folio *ifolio, pgoff_t index, bool new_i_size); int f2fs_do_write_data_page(struct f2fs_io_info *fio); int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int flag); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); int f2fs_encrypt_one_page(struct f2fs_io_info *fio); bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio); bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio); int f2fs_write_single_data_page(struct folio *folio, int *submitted, struct bio **bio, sector_t *last_block, struct writeback_control *wbc, enum iostat_type io_type, int compr_blocks, bool allow_balance); void f2fs_write_failed(struct inode *inode, loff_t to); void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length); bool f2fs_release_folio(struct folio *folio, gfp_t wait); bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len); void f2fs_clear_page_cache_dirty_tag(struct folio *folio); int f2fs_init_post_read_processing(void); void f2fs_destroy_post_read_processing(void); int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi); void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi); extern const struct iomap_ops f2fs_iomap_ops; /* * gc.c */ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi); void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi); block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode); int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control); void f2fs_build_gc_manager(struct f2fs_sb_info *sbi); int f2fs_gc_range(struct f2fs_sb_info *sbi, unsigned int start_seg, unsigned int end_seg, bool dry_run, unsigned int dry_run_sections); int f2fs_resize_fs(struct file *filp, __u64 block_count); int __init f2fs_create_garbage_collection_cache(void); void f2fs_destroy_garbage_collection_cache(void); /* victim selection function for cleaning and SSR */ int f2fs_get_victim(struct f2fs_sb_info *sbi, unsigned int *result, int gc_type, int type, char alloc_mode, unsigned long long age, bool one_time); /* * recovery.c */ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only); bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi); int __init f2fs_create_recovery_cache(void); void f2fs_destroy_recovery_cache(void); /* * debug.c */ #ifdef CONFIG_F2FS_STAT_FS enum { DEVSTAT_INUSE, DEVSTAT_DIRTY, DEVSTAT_FULL, DEVSTAT_FREE, DEVSTAT_PREFREE, DEVSTAT_MAX, }; struct f2fs_dev_stats { unsigned int devstats[2][DEVSTAT_MAX]; /* 0: segs, 1: secs */ }; struct f2fs_stat_info { struct list_head stat_list; struct f2fs_sb_info *sbi; int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs; int main_area_segs, main_area_sections, main_area_zones; unsigned long long hit_cached[NR_EXTENT_CACHES]; unsigned long long hit_rbtree[NR_EXTENT_CACHES]; unsigned long long total_ext[NR_EXTENT_CACHES]; unsigned long long hit_total[NR_EXTENT_CACHES]; int ext_tree[NR_EXTENT_CACHES]; int zombie_tree[NR_EXTENT_CACHES]; int ext_node[NR_EXTENT_CACHES]; /* to count memory footprint */ unsigned long long ext_mem[NR_EXTENT_CACHES]; /* for read extent cache */ unsigned long long hit_largest; /* for block age extent cache */ unsigned long long allocated_data_blocks; int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta; int ndirty_data, ndirty_qdata; unsigned int ndirty_dirs, ndirty_files, ndirty_all; unsigned int nquota_files, ndonate_files; int nats, dirty_nats, sits, dirty_sits; int free_nids, avail_nids, alloc_nids; int total_count, utilization; int nr_wb_cp_data, nr_wb_data; int nr_rd_data, nr_rd_node, nr_rd_meta; int nr_dio_read, nr_dio_write; unsigned int io_skip_bggc, other_skip_bggc; int nr_flushing, nr_flushed, flush_list_empty; int nr_discarding, nr_discarded; int nr_discard_cmd; unsigned int undiscard_blks; int nr_issued_ckpt, nr_total_ckpt, nr_queued_ckpt; unsigned int cur_ckpt_time, peak_ckpt_time; int inline_xattr, inline_inode, inline_dir, append, update, orphans; int compr_inode, swapfile_inode; unsigned long long compr_blocks; int aw_cnt, max_aw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; int rsvd_segs, overp_segs; int dirty_count, node_pages, meta_pages, compress_pages; int compress_page_hit; int prefree_count, free_segs, free_secs; int cp_call_count[MAX_CALL_TYPE], cp_count; int gc_call_count[MAX_CALL_TYPE]; int gc_segs[2][2]; int gc_secs[2][2]; int tot_blks, data_blks, node_blks; int bg_data_blks, bg_node_blks; unsigned int defrag_blks; int blkoff[NR_CURSEG_TYPE]; int curseg[NR_CURSEG_TYPE]; int cursec[NR_CURSEG_TYPE]; int curzone[NR_CURSEG_TYPE]; unsigned int dirty_seg[NR_CURSEG_TYPE]; unsigned int full_seg[NR_CURSEG_TYPE]; unsigned int valid_blks[NR_CURSEG_TYPE]; unsigned int meta_count[META_MAX]; unsigned int segment_count[2]; unsigned int block_count[2]; unsigned int inplace_count; unsigned long long base_mem, cache_mem, page_mem; struct f2fs_dev_stats *dev_stats; }; static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) { return (struct f2fs_stat_info *)sbi->stat_info; } #define stat_inc_cp_call_count(sbi, foreground) \ atomic_inc(&sbi->cp_call_count[(foreground)]) #define stat_inc_cp_count(sbi) (F2FS_STAT(sbi)->cp_count++) #define stat_io_skip_bggc_count(sbi) ((sbi)->io_skip_bggc++) #define stat_other_skip_bggc_count(sbi) ((sbi)->other_skip_bggc++) #define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++) #define stat_dec_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]--) #define stat_inc_total_hit(sbi, type) (atomic64_inc(&(sbi)->total_hit_ext[type])) #define stat_inc_rbtree_node_hit(sbi, type) (atomic64_inc(&(sbi)->read_hit_rbtree[type])) #define stat_inc_largest_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_largest)) #define stat_inc_cached_node_hit(sbi, type) (atomic64_inc(&(sbi)->read_hit_cached[type])) #define stat_inc_inline_xattr(inode) \ do { \ if (f2fs_has_inline_xattr(inode)) \ (atomic_inc(&F2FS_I_SB(inode)->inline_xattr)); \ } while (0) #define stat_dec_inline_xattr(inode) \ do { \ if (f2fs_has_inline_xattr(inode)) \ (atomic_dec(&F2FS_I_SB(inode)->inline_xattr)); \ } while (0) #define stat_inc_inline_inode(inode) \ do { \ if (f2fs_has_inline_data(inode)) \ (atomic_inc(&F2FS_I_SB(inode)->inline_inode)); \ } while (0) #define stat_dec_inline_inode(inode) \ do { \ if (f2fs_has_inline_data(inode)) \ (atomic_dec(&F2FS_I_SB(inode)->inline_inode)); \ } while (0) #define stat_inc_inline_dir(inode) \ do { \ if (f2fs_has_inline_dentry(inode)) \ (atomic_inc(&F2FS_I_SB(inode)->inline_dir)); \ } while (0) #define stat_dec_inline_dir(inode) \ do { \ if (f2fs_has_inline_dentry(inode)) \ (atomic_dec(&F2FS_I_SB(inode)->inline_dir)); \ } while (0) #define stat_inc_compr_inode(inode) \ do { \ if (f2fs_compressed_file(inode)) \ (atomic_inc(&F2FS_I_SB(inode)->compr_inode)); \ } while (0) #define stat_dec_compr_inode(inode) \ do { \ if (f2fs_compressed_file(inode)) \ (atomic_dec(&F2FS_I_SB(inode)->compr_inode)); \ } while (0) #define stat_add_compr_blocks(inode, blocks) \ (atomic64_add(blocks, &F2FS_I_SB(inode)->compr_blocks)) #define stat_sub_compr_blocks(inode, blocks) \ (atomic64_sub(blocks, &F2FS_I_SB(inode)->compr_blocks)) #define stat_inc_swapfile_inode(inode) \ (atomic_inc(&F2FS_I_SB(inode)->swapfile_inode)) #define stat_dec_swapfile_inode(inode) \ (atomic_dec(&F2FS_I_SB(inode)->swapfile_inode)) #define stat_inc_atomic_inode(inode) \ (atomic_inc(&F2FS_I_SB(inode)->atomic_files)) #define stat_dec_atomic_inode(inode) \ (atomic_dec(&F2FS_I_SB(inode)->atomic_files)) #define stat_inc_meta_count(sbi, blkaddr) \ do { \ if (blkaddr < SIT_I(sbi)->sit_base_addr) \ atomic_inc(&(sbi)->meta_count[META_CP]); \ else if (blkaddr < NM_I(sbi)->nat_blkaddr) \ atomic_inc(&(sbi)->meta_count[META_SIT]); \ else if (blkaddr < SM_I(sbi)->ssa_blkaddr) \ atomic_inc(&(sbi)->meta_count[META_NAT]); \ else if (blkaddr < SM_I(sbi)->main_blkaddr) \ atomic_inc(&(sbi)->meta_count[META_SSA]); \ } while (0) #define stat_inc_seg_type(sbi, curseg) \ ((sbi)->segment_count[(curseg)->alloc_type]++) #define stat_inc_block_count(sbi, curseg) \ ((sbi)->block_count[(curseg)->alloc_type]++) #define stat_inc_inplace_blocks(sbi) \ (atomic_inc(&(sbi)->inplace_count)) #define stat_update_max_atomic_write(inode) \ do { \ int cur = atomic_read(&F2FS_I_SB(inode)->atomic_files); \ int max = atomic_read(&F2FS_I_SB(inode)->max_aw_cnt); \ if (cur > max) \ atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ } while (0) #define stat_inc_gc_call_count(sbi, foreground) \ (F2FS_STAT(sbi)->gc_call_count[(foreground)]++) #define stat_inc_gc_sec_count(sbi, type, gc_type) \ (F2FS_STAT(sbi)->gc_secs[(type)][(gc_type)]++) #define stat_inc_gc_seg_count(sbi, type, gc_type) \ (F2FS_STAT(sbi)->gc_segs[(type)][(gc_type)]++) #define stat_inc_tot_blk_count(si, blks) \ ((si)->tot_blks += (blks)) #define stat_inc_data_blk_count(sbi, blks, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ stat_inc_tot_blk_count(si, blks); \ si->data_blks += (blks); \ si->bg_data_blks += ((gc_type) == BG_GC) ? (blks) : 0; \ } while (0) #define stat_inc_node_blk_count(sbi, blks, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ stat_inc_tot_blk_count(si, blks); \ si->node_blks += (blks); \ si->bg_node_blks += ((gc_type) == BG_GC) ? (blks) : 0; \ } while (0) #define stat_inc_defrag_blk_count(sbi, blks) \ (F2FS_STAT(sbi)->defrag_blks += (blks)) int f2fs_build_stats(struct f2fs_sb_info *sbi); void f2fs_destroy_stats(struct f2fs_sb_info *sbi); void __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #else #define stat_inc_cp_call_count(sbi, foreground) do { } while (0) #define stat_inc_cp_count(sbi) do { } while (0) #define stat_io_skip_bggc_count(sbi) do { } while (0) #define stat_other_skip_bggc_count(sbi) do { } while (0) #define stat_inc_dirty_inode(sbi, type) do { } while (0) #define stat_dec_dirty_inode(sbi, type) do { } while (0) #define stat_inc_total_hit(sbi, type) do { } while (0) #define stat_inc_rbtree_node_hit(sbi, type) do { } while (0) #define stat_inc_largest_node_hit(sbi) do { } while (0) #define stat_inc_cached_node_hit(sbi, type) do { } while (0) #define stat_inc_inline_xattr(inode) do { } while (0) #define stat_dec_inline_xattr(inode) do { } while (0) #define stat_inc_inline_inode(inode) do { } while (0) #define stat_dec_inline_inode(inode) do { } while (0) #define stat_inc_inline_dir(inode) do { } while (0) #define stat_dec_inline_dir(inode) do { } while (0) #define stat_inc_compr_inode(inode) do { } while (0) #define stat_dec_compr_inode(inode) do { } while (0) #define stat_add_compr_blocks(inode, blocks) do { } while (0) #define stat_sub_compr_blocks(inode, blocks) do { } while (0) #define stat_inc_swapfile_inode(inode) do { } while (0) #define stat_dec_swapfile_inode(inode) do { } while (0) #define stat_inc_atomic_inode(inode) do { } while (0) #define stat_dec_atomic_inode(inode) do { } while (0) #define stat_update_max_atomic_write(inode) do { } while (0) #define stat_inc_meta_count(sbi, blkaddr) do { } while (0) #define stat_inc_seg_type(sbi, curseg) do { } while (0) #define stat_inc_block_count(sbi, curseg) do { } while (0) #define stat_inc_inplace_blocks(sbi) do { } while (0) #define stat_inc_gc_call_count(sbi, foreground) do { } while (0) #define stat_inc_gc_sec_count(sbi, type, gc_type) do { } while (0) #define stat_inc_gc_seg_count(sbi, type, gc_type) do { } while (0) #define stat_inc_tot_blk_count(si, blks) do { } while (0) #define stat_inc_data_blk_count(sbi, blks, gc_type) do { } while (0) #define stat_inc_node_blk_count(sbi, blks, gc_type) do { } while (0) #define stat_inc_defrag_blk_count(sbi, blks) do { } while (0) static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } static inline void __init f2fs_create_root_stats(void) { } static inline void f2fs_destroy_root_stats(void) { } static inline void f2fs_update_sit_info(struct f2fs_sb_info *sbi) {} #endif extern const struct file_operations f2fs_dir_operations; extern const struct file_operations f2fs_file_operations; extern const struct inode_operations f2fs_file_inode_operations; extern const struct address_space_operations f2fs_dblock_aops; extern const struct address_space_operations f2fs_node_aops; extern const struct address_space_operations f2fs_meta_aops; extern const struct inode_operations f2fs_dir_inode_operations; extern const struct inode_operations f2fs_symlink_inode_operations; extern const struct inode_operations f2fs_encrypted_symlink_inode_operations; extern const struct inode_operations f2fs_special_inode_operations; extern struct kmem_cache *f2fs_inode_entry_slab; /* * inline.c */ bool f2fs_may_inline_data(struct inode *inode); bool f2fs_sanity_check_inline_data(struct inode *inode, struct folio *ifolio); bool f2fs_may_inline_dentry(struct inode *inode); void f2fs_do_read_inline_data(struct folio *folio, struct folio *ifolio); void f2fs_truncate_inline_inode(struct inode *inode, struct folio *ifolio, u64 from); int f2fs_read_inline_data(struct inode *inode, struct folio *folio); int f2fs_convert_inline_folio(struct dnode_of_data *dn, struct folio *folio); int f2fs_convert_inline_inode(struct inode *inode); int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry); int f2fs_write_inline_data(struct inode *inode, struct folio *folio); int f2fs_recover_inline_data(struct inode *inode, struct folio *nfolio); struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir, const struct f2fs_filename *fname, struct folio **res_folio, bool use_hash); int f2fs_make_empty_inline_dir(struct inode *inode, struct inode *parent, struct folio *ifolio); int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname, struct inode *inode, nid_t ino, umode_t mode); void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct folio *folio, struct inode *dir, struct inode *inode); bool f2fs_empty_inline_dir(struct inode *dir); int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, struct fscrypt_str *fstr); int f2fs_inline_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); /* * shrinker.c */ unsigned long f2fs_shrink_count(struct shrinker *shrink, struct shrink_control *sc); unsigned long f2fs_shrink_scan(struct shrinker *shrink, struct shrink_control *sc); unsigned int f2fs_donate_files(void); void f2fs_reclaim_caches(unsigned int reclaim_caches_kb); void f2fs_join_shrinker(struct f2fs_sb_info *sbi); void f2fs_leave_shrinker(struct f2fs_sb_info *sbi); /* * extent_cache.c */ bool sanity_check_extent_cache(struct inode *inode, struct folio *ifolio); void f2fs_init_extent_tree(struct inode *inode); void f2fs_drop_extent_tree(struct inode *inode); void f2fs_destroy_extent_node(struct inode *inode); void f2fs_destroy_extent_tree(struct inode *inode); void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi); int __init f2fs_create_extent_cache(void); void f2fs_destroy_extent_cache(void); /* read extent cache ops */ void f2fs_init_read_extent_tree(struct inode *inode, struct folio *ifolio); bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs, struct extent_info *ei); bool f2fs_lookup_read_extent_cache_block(struct inode *inode, pgoff_t index, block_t *blkaddr); void f2fs_update_read_extent_cache(struct dnode_of_data *dn); void f2fs_update_read_extent_cache_range(struct dnode_of_data *dn, pgoff_t fofs, block_t blkaddr, unsigned int len); unsigned int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); /* block age extent cache ops */ void f2fs_init_age_extent_tree(struct inode *inode); bool f2fs_lookup_age_extent_cache(struct inode *inode, pgoff_t pgofs, struct extent_info *ei); void f2fs_update_age_extent_cache(struct dnode_of_data *dn); void f2fs_update_age_extent_cache_range(struct dnode_of_data *dn, pgoff_t fofs, unsigned int len); unsigned int f2fs_shrink_age_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); /* * sysfs.c */ #define MIN_RA_MUL 2 #define MAX_RA_MUL 256 int __init f2fs_init_sysfs(void); void f2fs_exit_sysfs(void); int f2fs_register_sysfs(struct f2fs_sb_info *sbi); void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi); /* verity.c */ extern const struct fsverity_operations f2fs_verityops; /* * crypto support */ static inline bool f2fs_encrypted_file(struct inode *inode) { return IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode); } static inline void f2fs_set_encrypted_inode(struct inode *inode) { #ifdef CONFIG_FS_ENCRYPTION file_set_encrypt(inode); f2fs_set_inode_flags(inode); #endif } /* * Returns true if the reads of the inode's data need to undergo some * postprocessing step, like decryption or authenticity verification. */ static inline bool f2fs_post_read_required(struct inode *inode) { return f2fs_encrypted_file(inode) || fsverity_active(inode) || f2fs_compressed_file(inode); } static inline bool f2fs_used_in_atomic_write(struct inode *inode) { return f2fs_is_atomic_file(inode) || f2fs_is_cow_file(inode); } static inline bool f2fs_meta_inode_gc_required(struct inode *inode) { return f2fs_post_read_required(inode) || f2fs_used_in_atomic_write(inode); } /* * compress.c */ #ifdef CONFIG_F2FS_FS_COMPRESSION enum cluster_check_type { CLUSTER_IS_COMPR, /* check only if compressed cluster */ CLUSTER_COMPR_BLKS, /* return # of compressed blocks in a cluster */ CLUSTER_RAW_BLKS /* return # of raw blocks in a cluster */ }; bool f2fs_is_compressed_page(struct folio *folio); struct folio *f2fs_compress_control_folio(struct folio *folio); int f2fs_prepare_compress_overwrite(struct inode *inode, struct page **pagep, pgoff_t index, void **fsdata); bool f2fs_compress_write_end(struct inode *inode, void *fsdata, pgoff_t index, unsigned copied); int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock); void f2fs_compress_write_end_io(struct bio *bio, struct folio *folio); bool f2fs_is_compress_backend_ready(struct inode *inode); bool f2fs_is_compress_level_valid(int alg, int lvl); int __init f2fs_init_compress_mempool(void); void f2fs_destroy_compress_mempool(void); void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task); void f2fs_end_read_compressed_page(struct folio *folio, bool failed, block_t blkaddr, bool in_task); bool f2fs_cluster_is_empty(struct compress_ctx *cc); bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index); bool f2fs_all_cluster_page_ready(struct compress_ctx *cc, struct page **pages, int index, int nr_pages, bool uptodate); bool f2fs_sanity_check_cluster(struct dnode_of_data *dn); void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct folio *folio); int f2fs_write_multi_pages(struct compress_ctx *cc, int *submitted, struct writeback_control *wbc, enum iostat_type io_type); int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index); bool f2fs_is_sparse_cluster(struct inode *inode, pgoff_t index); void f2fs_update_read_extent_tree_range_compressed(struct inode *inode, pgoff_t fofs, block_t blkaddr, unsigned int llen, unsigned int c_len); int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, unsigned nr_pages, sector_t *last_block_in_bio, struct readahead_control *rac, bool for_write); struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc); void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, bool in_task); void f2fs_put_folio_dic(struct folio *folio, bool in_task); unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn, unsigned int ofs_in_node); int f2fs_init_compress_ctx(struct compress_ctx *cc); void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse); void f2fs_init_compress_info(struct f2fs_sb_info *sbi); int f2fs_init_compress_inode(struct f2fs_sb_info *sbi); void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi); int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi); void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi); int __init f2fs_init_compress_cache(void); void f2fs_destroy_compress_cache(void); struct address_space *COMPRESS_MAPPING(struct f2fs_sb_info *sbi); void f2fs_invalidate_compress_pages_range(struct f2fs_sb_info *sbi, block_t blkaddr, unsigned int len); bool f2fs_load_compressed_folio(struct f2fs_sb_info *sbi, struct folio *folio, block_t blkaddr); void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino); #define inc_compr_inode_stat(inode) \ do { \ struct f2fs_sb_info *sbi = F2FS_I_SB(inode); \ sbi->compr_new_inode++; \ } while (0) #define add_compr_block_stat(inode, blocks) \ do { \ struct f2fs_sb_info *sbi = F2FS_I_SB(inode); \ int diff = F2FS_I(inode)->i_cluster_size - blocks; \ sbi->compr_written_block += blocks; \ sbi->compr_saved_block += diff; \ } while (0) #else static inline bool f2fs_is_compressed_page(struct folio *folio) { return false; } static inline bool f2fs_is_compress_backend_ready(struct inode *inode) { if (!f2fs_compressed_file(inode)) return true; /* not support compression */ return false; } static inline bool f2fs_is_compress_level_valid(int alg, int lvl) { return false; } static inline struct folio *f2fs_compress_control_folio(struct folio *folio) { WARN_ON_ONCE(1); return ERR_PTR(-EINVAL); } static inline int __init f2fs_init_compress_mempool(void) { return 0; } static inline void f2fs_destroy_compress_mempool(void) { } static inline void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) { } static inline void f2fs_end_read_compressed_page(struct folio *folio, bool failed, block_t blkaddr, bool in_task) { WARN_ON_ONCE(1); } static inline void f2fs_put_folio_dic(struct folio *folio, bool in_task) { WARN_ON_ONCE(1); } static inline unsigned int f2fs_cluster_blocks_are_contiguous( struct dnode_of_data *dn, unsigned int ofs_in_node) { return 0; } static inline bool f2fs_sanity_check_cluster(struct dnode_of_data *dn) { return false; } static inline int f2fs_init_compress_inode(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi) { } static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) { } static inline int __init f2fs_init_compress_cache(void) { return 0; } static inline void f2fs_destroy_compress_cache(void) { } static inline void f2fs_invalidate_compress_pages_range(struct f2fs_sb_info *sbi, block_t blkaddr, unsigned int len) { } static inline bool f2fs_load_compressed_folio(struct f2fs_sb_info *sbi, struct folio *folio, block_t blkaddr) { return false; } static inline void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino) { } #define inc_compr_inode_stat(inode) do { } while (0) static inline int f2fs_is_compressed_cluster( struct inode *inode, pgoff_t index) { return 0; } static inline bool f2fs_is_sparse_cluster( struct inode *inode, pgoff_t index) { return true; } static inline void f2fs_update_read_extent_tree_range_compressed( struct inode *inode, pgoff_t fofs, block_t blkaddr, unsigned int llen, unsigned int c_len) { } #endif static inline int set_compress_context(struct inode *inode) { #ifdef CONFIG_F2FS_FS_COMPRESSION struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); fi->i_compress_algorithm = F2FS_OPTION(sbi).compress_algorithm; fi->i_log_cluster_size = F2FS_OPTION(sbi).compress_log_size; fi->i_compress_flag = F2FS_OPTION(sbi).compress_chksum ? BIT(COMPRESS_CHKSUM) : 0; fi->i_cluster_size = BIT(fi->i_log_cluster_size); if ((fi->i_compress_algorithm == COMPRESS_LZ4 || fi->i_compress_algorithm == COMPRESS_ZSTD) && F2FS_OPTION(sbi).compress_level) fi->i_compress_level = F2FS_OPTION(sbi).compress_level; fi->i_flags |= F2FS_COMPR_FL; set_inode_flag(inode, FI_COMPRESSED_FILE); stat_inc_compr_inode(inode); inc_compr_inode_stat(inode); f2fs_mark_inode_dirty_sync(inode, true); return 0; #else return -EOPNOTSUPP; #endif } static inline bool f2fs_disable_compressed_file(struct inode *inode) { struct f2fs_inode_info *fi = F2FS_I(inode); f2fs_down_write(&fi->i_sem); if (!f2fs_compressed_file(inode)) { f2fs_up_write(&fi->i_sem); return true; } if (f2fs_is_mmap_file(inode) || atomic_read(&fi->writeback) || (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode))) { f2fs_up_write(&fi->i_sem); return false; } fi->i_flags &= ~F2FS_COMPR_FL; stat_dec_compr_inode(inode); clear_inode_flag(inode, FI_COMPRESSED_FILE); f2fs_mark_inode_dirty_sync(inode, true); f2fs_up_write(&fi->i_sem); return true; } #define F2FS_FEATURE_FUNCS(name, flagname) \ static inline bool f2fs_sb_has_##name(struct f2fs_sb_info *sbi) \ { \ return F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_##flagname); \ } F2FS_FEATURE_FUNCS(encrypt, ENCRYPT); F2FS_FEATURE_FUNCS(blkzoned, BLKZONED); F2FS_FEATURE_FUNCS(extra_attr, EXTRA_ATTR); F2FS_FEATURE_FUNCS(project_quota, PRJQUOTA); F2FS_FEATURE_FUNCS(inode_chksum, INODE_CHKSUM); F2FS_FEATURE_FUNCS(flexible_inline_xattr, FLEXIBLE_INLINE_XATTR); F2FS_FEATURE_FUNCS(quota_ino, QUOTA_INO); F2FS_FEATURE_FUNCS(inode_crtime, INODE_CRTIME); F2FS_FEATURE_FUNCS(lost_found, LOST_FOUND); F2FS_FEATURE_FUNCS(verity, VERITY); F2FS_FEATURE_FUNCS(sb_chksum, SB_CHKSUM); F2FS_FEATURE_FUNCS(casefold, CASEFOLD); F2FS_FEATURE_FUNCS(compression, COMPRESSION); F2FS_FEATURE_FUNCS(readonly, RO); F2FS_FEATURE_FUNCS(device_alias, DEVICE_ALIAS); F2FS_FEATURE_FUNCS(packed_ssa, PACKED_SSA); #ifdef CONFIG_BLK_DEV_ZONED static inline bool f2fs_zone_is_seq(struct f2fs_sb_info *sbi, int devi, unsigned int zone) { return test_bit(zone, FDEV(devi).blkz_seq); } static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi, block_t blkaddr) { return f2fs_zone_is_seq(sbi, devi, blkaddr / sbi->blocks_per_blkz); } #endif static inline int f2fs_bdev_index(struct f2fs_sb_info *sbi, struct block_device *bdev) { int i; if (!f2fs_is_multi_device(sbi)) return 0; for (i = 0; i < sbi->s_ndevs; i++) if (FDEV(i).bdev == bdev) return i; WARN_ON(1); return -1; } static inline bool f2fs_hw_should_discard(struct f2fs_sb_info *sbi) { return f2fs_sb_has_blkzoned(sbi); } static inline bool f2fs_bdev_support_discard(struct block_device *bdev) { return bdev_max_discard_sectors(bdev) || bdev_is_zoned(bdev); } static inline bool f2fs_hw_support_discard(struct f2fs_sb_info *sbi) { int i; if (!f2fs_is_multi_device(sbi)) return f2fs_bdev_support_discard(sbi->sb->s_bdev); for (i = 0; i < sbi->s_ndevs; i++) if (f2fs_bdev_support_discard(FDEV(i).bdev)) return true; return false; } static inline unsigned int f2fs_hw_discard_granularity(struct f2fs_sb_info *sbi) { int i = 1; unsigned int discard_granularity = bdev_discard_granularity(sbi->sb->s_bdev); if (f2fs_is_multi_device(sbi)) for (; i < sbi->s_ndevs && !bdev_is_zoned(FDEV(i).bdev); i++) discard_granularity = max_t(unsigned int, discard_granularity, bdev_discard_granularity(FDEV(i).bdev)); return discard_granularity; } static inline bool f2fs_realtime_discard_enable(struct f2fs_sb_info *sbi) { return (test_opt(sbi, DISCARD) && f2fs_hw_support_discard(sbi)) || f2fs_hw_should_discard(sbi); } static inline bool f2fs_hw_is_readonly(struct f2fs_sb_info *sbi) { int i; if (!f2fs_is_multi_device(sbi)) return bdev_read_only(sbi->sb->s_bdev); for (i = 0; i < sbi->s_ndevs; i++) if (bdev_read_only(FDEV(i).bdev)) return true; return false; } static inline bool f2fs_dev_is_readonly(struct f2fs_sb_info *sbi) { return f2fs_sb_has_readonly(sbi) || f2fs_hw_is_readonly(sbi); } static inline bool f2fs_lfs_mode(struct f2fs_sb_info *sbi) { return F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS; } static inline bool f2fs_is_sequential_zone_area(struct f2fs_sb_info *sbi, block_t blkaddr) { if (f2fs_sb_has_blkzoned(sbi)) { #ifdef CONFIG_BLK_DEV_ZONED int devi = f2fs_target_device_index(sbi, blkaddr); if (!bdev_is_zoned(FDEV(devi).bdev)) return false; if (f2fs_is_multi_device(sbi)) { if (blkaddr < FDEV(devi).start_blk || blkaddr > FDEV(devi).end_blk) { f2fs_err(sbi, "Invalid block %x", blkaddr); return false; } blkaddr -= FDEV(devi).start_blk; } return f2fs_blkz_is_seq(sbi, devi, blkaddr); #else return false; #endif } return false; } static inline bool f2fs_low_mem_mode(struct f2fs_sb_info *sbi) { return F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_LOW; } static inline bool f2fs_may_compress(struct inode *inode) { if (IS_SWAPFILE(inode) || f2fs_is_pinned_file(inode) || f2fs_is_atomic_file(inode) || f2fs_has_inline_data(inode) || f2fs_is_mmap_file(inode)) return false; return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode); } static inline void f2fs_i_compr_blocks_update(struct inode *inode, u64 blocks, bool add) { struct f2fs_inode_info *fi = F2FS_I(inode); int diff = fi->i_cluster_size - blocks; /* don't update i_compr_blocks if saved blocks were released */ if (!add && !atomic_read(&fi->i_compr_blocks)) return; if (add) { atomic_add(diff, &fi->i_compr_blocks); stat_add_compr_blocks(inode, diff); } else { atomic_sub(diff, &fi->i_compr_blocks); stat_sub_compr_blocks(inode, diff); } f2fs_mark_inode_dirty_sync(inode, true); } static inline bool f2fs_allow_multi_device_dio(struct f2fs_sb_info *sbi, int flag) { if (!f2fs_is_multi_device(sbi)) return false; if (flag != F2FS_GET_BLOCK_DIO) return false; return sbi->aligned_blksize; } #ifdef CONFIG_F2FS_FAULT_INJECTION extern int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned long rate, unsigned long type, enum fault_option fo); extern void f2fs_simulate_lock_timeout(struct f2fs_sb_info *sbi); #else static inline int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned long rate, unsigned long type, enum fault_option fo) { return 0; } static inline void f2fs_simulate_lock_timeout(struct f2fs_sb_info *sbi) { return; } #endif static inline bool is_journalled_quota(struct f2fs_sb_info *sbi) { #ifdef CONFIG_QUOTA if (f2fs_sb_has_quota_ino(sbi)) return true; if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA] || F2FS_OPTION(sbi).s_qf_names[GRPQUOTA] || F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) return true; #endif return false; } static inline bool f2fs_quota_file(struct f2fs_sb_info *sbi, nid_t ino) { #ifdef CONFIG_QUOTA int i; if (!f2fs_sb_has_quota_ino(sbi)) return false; for (i = 0; i < MAXQUOTAS; i++) { if (f2fs_qf_ino(sbi->sb, i) == ino) return true; } #endif return false; } static inline bool f2fs_block_unit_discard(struct f2fs_sb_info *sbi) { return F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK; } static inline void __f2fs_schedule_timeout(long timeout, bool io) { set_current_state(TASK_UNINTERRUPTIBLE); if (io) io_schedule_timeout(timeout); else schedule_timeout(timeout); } #define f2fs_io_schedule_timeout(timeout) \ __f2fs_schedule_timeout(timeout, true) #define f2fs_schedule_timeout(timeout) \ __f2fs_schedule_timeout(timeout, false) static inline void f2fs_schedule_timeout_killable(long timeout, bool io) { unsigned long last_time = jiffies + timeout; while (jiffies < last_time) { if (fatal_signal_pending(current)) return; __f2fs_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT, io); } } static inline void f2fs_handle_page_eio(struct f2fs_sb_info *sbi, struct folio *folio, enum page_type type) { pgoff_t ofs = folio->index; if (unlikely(f2fs_cp_error(sbi))) return; if (ofs == sbi->page_eio_ofs[type]) { if (sbi->page_eio_cnt[type]++ == MAX_RETRY_PAGE_EIO) { enum stop_cp_reason stop_reason; switch (type) { case META: stop_reason = STOP_CP_REASON_READ_META; break; case NODE: stop_reason = STOP_CP_REASON_READ_NODE; break; case DATA: stop_reason = STOP_CP_REASON_READ_DATA; break; default: f2fs_bug_on(sbi, 1); return; } f2fs_stop_checkpoint(sbi, false, stop_reason); } } else { sbi->page_eio_ofs[type] = ofs; sbi->page_eio_cnt[type] = 0; } } static inline bool f2fs_is_readonly(struct f2fs_sb_info *sbi) { return f2fs_sb_has_readonly(sbi) || f2fs_readonly(sbi->sb); } static inline void f2fs_truncate_meta_inode_pages(struct f2fs_sb_info *sbi, block_t blkaddr, unsigned int cnt) { bool need_submit = false; int i = 0; do { struct folio *folio; folio = filemap_get_folio(META_MAPPING(sbi), blkaddr + i); if (!IS_ERR(folio)) { if (folio_test_writeback(folio)) need_submit = true; f2fs_folio_put(folio, false); } } while (++i < cnt && !need_submit); if (need_submit) f2fs_submit_merged_write_cond(sbi, sbi->meta_inode, NULL, 0, DATA); truncate_inode_pages_range(META_MAPPING(sbi), F2FS_BLK_TO_BYTES((loff_t)blkaddr), F2FS_BLK_END_BYTES((loff_t)(blkaddr + cnt - 1))); } static inline void f2fs_invalidate_internal_cache(struct f2fs_sb_info *sbi, block_t blkaddr, unsigned int len) { f2fs_truncate_meta_inode_pages(sbi, blkaddr, len); f2fs_invalidate_compress_pages_range(sbi, blkaddr, len); } #endif /* _LINUX_F2FS_H */ |
| 4 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 | // SPDX-License-Identifier: GPL-2.0-only /* * "security" table for IPv6 * * This is for use by Mandatory Access Control (MAC) security models, * which need to be able to manage security policy in separate context * to DAC. * * Based on iptable_mangle.c * * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling * Copyright (C) 2000-2004 Netfilter Core Team <coreteam <at> netfilter.org> * Copyright (C) 2008 Red Hat, Inc., James Morris <jmorris <at> redhat.com> */ #include <linux/module.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/slab.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("James Morris <jmorris <at> redhat.com>"); MODULE_DESCRIPTION("ip6tables security table, for MAC rules"); #define SECURITY_VALID_HOOKS (1 << NF_INET_LOCAL_IN) | \ (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT) static const struct xt_table security_table = { .name = "security", .valid_hooks = SECURITY_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV6, .priority = NF_IP6_PRI_SECURITY, }; static struct nf_hook_ops *sectbl_ops __read_mostly; static int ip6table_security_table_init(struct net *net) { struct ip6t_replace *repl; int ret; repl = ip6t_alloc_initial_table(&security_table); if (repl == NULL) return -ENOMEM; ret = ip6t_register_table(net, &security_table, repl, sectbl_ops); kfree(repl); return ret; } static void __net_exit ip6table_security_net_pre_exit(struct net *net) { ip6t_unregister_table_pre_exit(net, "security"); } static void __net_exit ip6table_security_net_exit(struct net *net) { ip6t_unregister_table_exit(net, "security"); } static struct pernet_operations ip6table_security_net_ops = { .pre_exit = ip6table_security_net_pre_exit, .exit = ip6table_security_net_exit, }; static int __init ip6table_security_init(void) { int ret = xt_register_template(&security_table, ip6table_security_table_init); if (ret < 0) return ret; sectbl_ops = xt_hook_ops_alloc(&security_table, ip6t_do_table); if (IS_ERR(sectbl_ops)) { xt_unregister_template(&security_table); return PTR_ERR(sectbl_ops); } ret = register_pernet_subsys(&ip6table_security_net_ops); if (ret < 0) { kfree(sectbl_ops); xt_unregister_template(&security_table); return ret; } return ret; } static void __exit ip6table_security_fini(void) { unregister_pernet_subsys(&ip6table_security_net_ops); xt_unregister_template(&security_table); kfree(sectbl_ops); } module_init(ip6table_security_init); module_exit(ip6table_security_fini); |
| 28 130 130 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 | /* * Performance events x86 architecture code * * Copyright (C) 2008 Linutronix GmbH, Thomas Gleixner <tglx@kernel.org> * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> * Copyright (C) 2009 Google, Inc., Stephane Eranian * * For licencing details see kernel-base/COPYING */ #include <linux/perf_event.h> #include <linux/capability.h> #include <linux/notifier.h> #include <linux/hardirq.h> #include <linux/kprobes.h> #include <linux/export.h> #include <linux/init.h> #include <linux/kdebug.h> #include <linux/kvm_types.h> #include <linux/sched/mm.h> #include <linux/sched/clock.h> #include <linux/uaccess.h> #include <linux/slab.h> #include <linux/cpu.h> #include <linux/bitops.h> #include <linux/device.h> #include <linux/nospec.h> #include <linux/static_call.h> #include <linux/kvm_types.h> #include <asm/apic.h> #include <asm/stacktrace.h> #include <asm/msr.h> #include <asm/nmi.h> #include <asm/smp.h> #include <asm/alternative.h> #include <asm/mmu_context.h> #include <asm/tlbflush.h> #include <asm/timer.h> #include <asm/desc.h> #include <asm/ldt.h> #include <asm/unwind.h> #include <asm/uprobes.h> #include <asm/ibt.h> #include "perf_event.h" struct x86_pmu x86_pmu __read_mostly; static struct pmu pmu; DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, .pmu = &pmu, }; static DEFINE_PER_CPU(bool, guest_lvtpc_loaded); DEFINE_STATIC_KEY_FALSE(rdpmc_never_available_key); DEFINE_STATIC_KEY_FALSE(rdpmc_always_available_key); DEFINE_STATIC_KEY_FALSE(perf_is_hybrid); /* * This here uses DEFINE_STATIC_CALL_NULL() to get a static_call defined * from just a typename, as opposed to an actual function. */ DEFINE_STATIC_CALL_NULL(x86_pmu_handle_irq, *x86_pmu.handle_irq); DEFINE_STATIC_CALL_NULL(x86_pmu_disable_all, *x86_pmu.disable_all); DEFINE_STATIC_CALL_NULL(x86_pmu_enable_all, *x86_pmu.enable_all); DEFINE_STATIC_CALL_NULL(x86_pmu_enable, *x86_pmu.enable); DEFINE_STATIC_CALL_NULL(x86_pmu_disable, *x86_pmu.disable); DEFINE_STATIC_CALL_NULL(x86_pmu_assign, *x86_pmu.assign); DEFINE_STATIC_CALL_NULL(x86_pmu_add, *x86_pmu.add); DEFINE_STATIC_CALL_NULL(x86_pmu_del, *x86_pmu.del); DEFINE_STATIC_CALL_NULL(x86_pmu_read, *x86_pmu.read); DEFINE_STATIC_CALL_NULL(x86_pmu_set_period, *x86_pmu.set_period); DEFINE_STATIC_CALL_NULL(x86_pmu_update, *x86_pmu.update); DEFINE_STATIC_CALL_NULL(x86_pmu_limit_period, *x86_pmu.limit_period); DEFINE_STATIC_CALL_NULL(x86_pmu_schedule_events, *x86_pmu.schedule_events); DEFINE_STATIC_CALL_NULL(x86_pmu_get_event_constraints, *x86_pmu.get_event_constraints); DEFINE_STATIC_CALL_NULL(x86_pmu_put_event_constraints, *x86_pmu.put_event_constraints); DEFINE_STATIC_CALL_NULL(x86_pmu_start_scheduling, *x86_pmu.start_scheduling); DEFINE_STATIC_CALL_NULL(x86_pmu_commit_scheduling, *x86_pmu.commit_scheduling); DEFINE_STATIC_CALL_NULL(x86_pmu_stop_scheduling, *x86_pmu.stop_scheduling); DEFINE_STATIC_CALL_NULL(x86_pmu_sched_task, *x86_pmu.sched_task); DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs); DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases); DEFINE_STATIC_CALL_NULL(x86_pmu_filter, *x86_pmu.filter); DEFINE_STATIC_CALL_NULL(x86_pmu_late_setup, *x86_pmu.late_setup); DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_enable, *x86_pmu.pebs_enable); DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_disable, *x86_pmu.pebs_disable); DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_enable_all, *x86_pmu.pebs_enable_all); DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_disable_all, *x86_pmu.pebs_disable_all); /* * This one is magic, it will get called even when PMU init fails (because * there is no PMU), in which case it should simply return NULL. */ DEFINE_STATIC_CALL_RET0(x86_pmu_guest_get_msrs, *x86_pmu.guest_get_msrs); u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; u64 __read_mostly hw_cache_extra_regs [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; /* * Propagate event elapsed time into the generic event. * Can only be executed on the CPU where the event is active. * Returns the delta events processed. */ u64 x86_perf_event_update(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; int shift = 64 - x86_pmu.cntval_bits; u64 prev_raw_count, new_raw_count; u64 delta; if (unlikely(!hwc->event_base)) return 0; /* * Careful: an NMI might modify the previous event value. * * Our tactic to handle this is to first atomically read and * exchange a new raw count - then add that new-prev delta * count to the generic event atomically: */ prev_raw_count = local64_read(&hwc->prev_count); do { new_raw_count = rdpmc(hwc->event_base_rdpmc); } while (!local64_try_cmpxchg(&hwc->prev_count, &prev_raw_count, new_raw_count)); /* * Now we have the new raw value and have updated the prev * timestamp already. We can now calculate the elapsed delta * (event-)time and add that to the generic event. * * Careful, not all hw sign-extends above the physical width * of the count. */ delta = (new_raw_count << shift) - (prev_raw_count << shift); delta >>= shift; local64_add(delta, &event->count); local64_sub(delta, &hwc->period_left); return new_raw_count; } /* * Find and validate any extra registers to set up. */ static int x86_pmu_extra_regs(u64 config, struct perf_event *event) { struct extra_reg *extra_regs = hybrid(event->pmu, extra_regs); struct hw_perf_event_extra *reg; struct extra_reg *er; reg = &event->hw.extra_reg; if (!extra_regs) return 0; for (er = extra_regs; er->msr; er++) { if (er->event != (config & er->config_mask)) continue; if (event->attr.config1 & ~er->valid_mask) return -EINVAL; /* Check if the extra msrs can be safely accessed*/ if (!er->extra_msr_access) return -ENXIO; reg->idx = er->idx; reg->config = event->attr.config1; reg->reg = er->msr; break; } return 0; } static atomic_t active_events; static atomic_t pmc_refcount; static DEFINE_MUTEX(pmc_reserve_mutex); #ifdef CONFIG_X86_LOCAL_APIC static inline u64 get_possible_counter_mask(void) { u64 cntr_mask = x86_pmu.cntr_mask64; int i; if (!is_hybrid()) return cntr_mask; for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) cntr_mask |= x86_pmu.hybrid_pmu[i].cntr_mask64; return cntr_mask; } static bool reserve_pmc_hardware(void) { u64 cntr_mask = get_possible_counter_mask(); int i, end; for_each_set_bit(i, (unsigned long *)&cntr_mask, X86_PMC_IDX_MAX) { if (!reserve_perfctr_nmi(x86_pmu_event_addr(i))) goto perfctr_fail; } for_each_set_bit(i, (unsigned long *)&cntr_mask, X86_PMC_IDX_MAX) { if (!reserve_evntsel_nmi(x86_pmu_config_addr(i))) goto eventsel_fail; } return true; eventsel_fail: end = i; for_each_set_bit(i, (unsigned long *)&cntr_mask, end) release_evntsel_nmi(x86_pmu_config_addr(i)); i = X86_PMC_IDX_MAX; perfctr_fail: end = i; for_each_set_bit(i, (unsigned long *)&cntr_mask, end) release_perfctr_nmi(x86_pmu_event_addr(i)); return false; } static void release_pmc_hardware(void) { u64 cntr_mask = get_possible_counter_mask(); int i; for_each_set_bit(i, (unsigned long *)&cntr_mask, X86_PMC_IDX_MAX) { release_perfctr_nmi(x86_pmu_event_addr(i)); release_evntsel_nmi(x86_pmu_config_addr(i)); } } #else static bool reserve_pmc_hardware(void) { return true; } static void release_pmc_hardware(void) {} #endif bool check_hw_exists(struct pmu *pmu, unsigned long *cntr_mask, unsigned long *fixed_cntr_mask) { u64 val, val_fail = -1, val_new= ~0; int i, reg, reg_fail = -1, ret = 0; int bios_fail = 0; int reg_safe = -1; /* * Check to see if the BIOS enabled any of the counters, if so * complain and bail. */ for_each_set_bit(i, cntr_mask, X86_PMC_IDX_MAX) { reg = x86_pmu_config_addr(i); ret = rdmsrq_safe(reg, &val); if (ret) goto msr_fail; if (val & ARCH_PERFMON_EVENTSEL_ENABLE) { bios_fail = 1; val_fail = val; reg_fail = reg; } else { reg_safe = i; } } if (*(u64 *)fixed_cntr_mask) { reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; ret = rdmsrq_safe(reg, &val); if (ret) goto msr_fail; for_each_set_bit(i, fixed_cntr_mask, X86_PMC_IDX_MAX) { if (fixed_counter_disabled(i, pmu)) continue; if (val & (0x03ULL << i*4)) { bios_fail = 1; val_fail = val; reg_fail = reg; } } } /* * If all the counters are enabled, the below test will always * fail. The tools will also become useless in this scenario. * Just fail and disable the hardware counters. */ if (reg_safe == -1) { reg = reg_safe; goto msr_fail; } /* * Read the current value, change it and read it back to see if it * matches, this is needed to detect certain hardware emulators * (qemu/kvm) that don't trap on the MSR access and always return 0s. */ reg = x86_pmu_event_addr(reg_safe); if (rdmsrq_safe(reg, &val)) goto msr_fail; val ^= 0xffffUL; ret = wrmsrq_safe(reg, val); ret |= rdmsrq_safe(reg, &val_new); if (ret || val != val_new) goto msr_fail; /* * We still allow the PMU driver to operate: */ if (bios_fail) { pr_cont("Broken BIOS detected, complain to your hardware vendor.\n"); pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg_fail, val_fail); } return true; msr_fail: if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { pr_cont("PMU not available due to virtualization, using software events only.\n"); } else { pr_cont("Broken PMU hardware detected, using software events only.\n"); pr_err("Failed to access perfctr msr (MSR %x is %Lx)\n", reg, val_new); } return false; } static void hw_perf_event_destroy(struct perf_event *event) { x86_release_hardware(); atomic_dec(&active_events); } void hw_perf_lbr_event_destroy(struct perf_event *event) { hw_perf_event_destroy(event); /* undo the lbr/bts event accounting */ x86_del_exclusive(x86_lbr_exclusive_lbr); } static inline int x86_pmu_initialized(void) { return x86_pmu.handle_irq != NULL; } static inline int set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event) { struct perf_event_attr *attr = &event->attr; unsigned int cache_type, cache_op, cache_result; u64 config, val; config = attr->config; cache_type = (config >> 0) & 0xff; if (cache_type >= PERF_COUNT_HW_CACHE_MAX) return -EINVAL; cache_type = array_index_nospec(cache_type, PERF_COUNT_HW_CACHE_MAX); cache_op = (config >> 8) & 0xff; if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) return -EINVAL; cache_op = array_index_nospec(cache_op, PERF_COUNT_HW_CACHE_OP_MAX); cache_result = (config >> 16) & 0xff; if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) return -EINVAL; cache_result = array_index_nospec(cache_result, PERF_COUNT_HW_CACHE_RESULT_MAX); val = hybrid_var(event->pmu, hw_cache_event_ids)[cache_type][cache_op][cache_result]; if (val == 0) return -ENOENT; if (val == -1) return -EINVAL; hwc->config |= val; attr->config1 = hybrid_var(event->pmu, hw_cache_extra_regs)[cache_type][cache_op][cache_result]; return x86_pmu_extra_regs(val, event); } int x86_reserve_hardware(void) { int err = 0; if (!atomic_inc_not_zero(&pmc_refcount)) { mutex_lock(&pmc_reserve_mutex); if (atomic_read(&pmc_refcount) == 0) { if (!reserve_pmc_hardware()) { err = -EBUSY; } else { reserve_ds_buffers(); reserve_lbr_buffers(); } } if (!err) atomic_inc(&pmc_refcount); mutex_unlock(&pmc_reserve_mutex); } return err; } void x86_release_hardware(void) { if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) { release_pmc_hardware(); release_ds_buffers(); release_lbr_buffers(); mutex_unlock(&pmc_reserve_mutex); } } /* * Check if we can create event of a certain type (that no conflicting events * are present). */ int x86_add_exclusive(unsigned int what) { int i; /* * When lbr_pt_coexist we allow PT to coexist with either LBR or BTS. * LBR and BTS are still mutually exclusive. */ if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt) goto out; if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) { mutex_lock(&pmc_reserve_mutex); for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) { if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i])) goto fail_unlock; } atomic_inc(&x86_pmu.lbr_exclusive[what]); mutex_unlock(&pmc_reserve_mutex); } out: atomic_inc(&active_events); return 0; fail_unlock: mutex_unlock(&pmc_reserve_mutex); return -EBUSY; } void x86_del_exclusive(unsigned int what) { atomic_dec(&active_events); /* * See the comment in x86_add_exclusive(). */ if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt) return; atomic_dec(&x86_pmu.lbr_exclusive[what]); } int x86_setup_perfctr(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; struct hw_perf_event *hwc = &event->hw; u64 config; if (!is_sampling_event(event)) { hwc->sample_period = x86_pmu.max_period; hwc->last_period = hwc->sample_period; local64_set(&hwc->period_left, hwc->sample_period); } if (attr->type == event->pmu->type) return x86_pmu_extra_regs(event->attr.config, event); if (attr->type == PERF_TYPE_HW_CACHE) return set_ext_hw_attr(hwc, event); if (attr->config >= x86_pmu.max_events) return -EINVAL; attr->config = array_index_nospec((unsigned long)attr->config, x86_pmu.max_events); /* * The generic map: */ config = x86_pmu.event_map(attr->config); if (config == 0) return -ENOENT; if (config == -1LL) return -EINVAL; hwc->config |= config; return 0; } /* * check that branch_sample_type is compatible with * settings needed for precise_ip > 1 which implies * using the LBR to capture ALL taken branches at the * priv levels of the measurement */ static inline int precise_br_compat(struct perf_event *event) { u64 m = event->attr.branch_sample_type; u64 b = 0; /* must capture all branches */ if (!(m & PERF_SAMPLE_BRANCH_ANY)) return 0; m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER; if (!event->attr.exclude_user) b |= PERF_SAMPLE_BRANCH_USER; if (!event->attr.exclude_kernel) b |= PERF_SAMPLE_BRANCH_KERNEL; /* * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86 */ return m == b; } int x86_pmu_max_precise(struct pmu *pmu) { int precise = 0; if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { /* arch PEBS */ if (x86_pmu.arch_pebs) { precise = 2; if (hybrid(pmu, arch_pebs_cap).pdists) precise++; return precise; } /* legacy PEBS - support for constant skid */ precise++; /* Support for IP fixup */ if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2) precise++; if (x86_pmu.pebs_prec_dist) precise++; } return precise; } int x86_pmu_hw_config(struct perf_event *event) { if (event->attr.precise_ip) { int precise = x86_pmu_max_precise(event->pmu); if (event->attr.precise_ip > precise) return -EOPNOTSUPP; /* There's no sense in having PEBS for non sampling events: */ if (!is_sampling_event(event)) return -EINVAL; } /* * check that PEBS LBR correction does not conflict with * whatever the user is asking with attr->branch_sample_type */ if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) { u64 *br_type = &event->attr.branch_sample_type; if (has_branch_stack(event)) { if (!precise_br_compat(event)) return -EOPNOTSUPP; /* branch_sample_type is compatible */ } else { /* * user did not specify branch_sample_type * * For PEBS fixups, we capture all * the branches at the priv level of the * event. */ *br_type = PERF_SAMPLE_BRANCH_ANY; if (!event->attr.exclude_user) *br_type |= PERF_SAMPLE_BRANCH_USER; if (!event->attr.exclude_kernel) *br_type |= PERF_SAMPLE_BRANCH_KERNEL; } } if (branch_sample_call_stack(event)) event->attach_state |= PERF_ATTACH_TASK_DATA; /* * Generate PMC IRQs: * (keep 'enabled' bit clear for now) */ event->hw.config = ARCH_PERFMON_EVENTSEL_INT; /* * Count user and OS events unless requested not to */ if (!event->attr.exclude_user) event->hw.config |= ARCH_PERFMON_EVENTSEL_USR; if (!event->attr.exclude_kernel) event->hw.config |= ARCH_PERFMON_EVENTSEL_OS; if (event->attr.type == event->pmu->type) event->hw.config |= x86_pmu_get_event_config(event); if (is_sampling_event(event) && !event->attr.freq && x86_pmu.limit_period) { s64 left = event->attr.sample_period; x86_pmu.limit_period(event, &left); if (left > event->attr.sample_period) return -EINVAL; } /* sample_regs_user never support XMM registers */ if (unlikely(event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK)) return -EINVAL; /* * Besides the general purpose registers, XMM registers may * be collected in PEBS on some platforms, e.g. Icelake */ if (unlikely(event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK)) { if (!(event->pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS)) return -EINVAL; if (!event->attr.precise_ip) return -EINVAL; } return x86_setup_perfctr(event); } /* * Setup the hardware configuration for a given attr_type */ static int __x86_pmu_event_init(struct perf_event *event) { int err; if (!x86_pmu_initialized()) return -ENODEV; err = x86_reserve_hardware(); if (err) return err; atomic_inc(&active_events); event->destroy = hw_perf_event_destroy; event->hw.idx = -1; event->hw.last_cpu = -1; event->hw.last_tag = ~0ULL; event->hw.dyn_constraint = ~0ULL; /* mark unused */ event->hw.extra_reg.idx = EXTRA_REG_NONE; event->hw.branch_reg.idx = EXTRA_REG_NONE; return x86_pmu.hw_config(event); } void x86_pmu_disable_all(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { struct hw_perf_event *hwc = &cpuc->events[idx]->hw; u64 val; if (!test_bit(idx, cpuc->active_mask)) continue; rdmsrq(x86_pmu_config_addr(idx), val); if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE)) continue; val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrq(x86_pmu_config_addr(idx), val); if (is_counter_pair(hwc)) wrmsrq(x86_pmu_config_addr(idx + 1), 0); } } struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data) { return static_call(x86_pmu_guest_get_msrs)(nr, data); } EXPORT_SYMBOL_FOR_KVM(perf_guest_get_msrs); /* * There may be PMI landing after enabled=0. The PMI hitting could be before or * after disable_all. * * If PMI hits before disable_all, the PMU will be disabled in the NMI handler. * It will not be re-enabled in the NMI handler again, because enabled=0. After * handling the NMI, disable_all will be called, which will not change the * state either. If PMI hits after disable_all, the PMU is already disabled * before entering NMI handler. The NMI handler will not change the state * either. * * So either situation is harmless. */ static void x86_pmu_disable(struct pmu *pmu) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); if (!x86_pmu_initialized()) return; if (!cpuc->enabled) return; cpuc->n_added = 0; cpuc->enabled = 0; barrier(); static_call(x86_pmu_disable_all)(); } void x86_pmu_enable_all(int added) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { struct hw_perf_event *hwc = &cpuc->events[idx]->hw; if (!test_bit(idx, cpuc->active_mask)) continue; __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); } } int is_x86_event(struct perf_event *event) { /* * For a non-hybrid platforms, the type of X86 pmu is * always PERF_TYPE_RAW. * For a hybrid platform, the PERF_PMU_CAP_EXTENDED_HW_TYPE * is a unique capability for the X86 PMU. * Use them to detect a X86 event. */ if (event->pmu->type == PERF_TYPE_RAW || event->pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE) return true; return false; } struct pmu *x86_get_pmu(unsigned int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); /* * All CPUs of the hybrid type have been offline. * The x86_get_pmu() should not be invoked. */ if (WARN_ON_ONCE(!cpuc->pmu)) return &pmu; return cpuc->pmu; } /* * Event scheduler state: * * Assign events iterating over all events and counters, beginning * with events with least weights first. Keep the current iterator * state in struct sched_state. */ struct sched_state { int weight; int event; /* event index */ int counter; /* counter index */ int unassigned; /* number of events to be assigned left */ int nr_gp; /* number of GP counters used */ u64 used; }; /* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */ #define SCHED_STATES_MAX 2 struct perf_sched { int max_weight; int max_events; int max_gp; int saved_states; struct event_constraint **constraints; struct sched_state state; struct sched_state saved[SCHED_STATES_MAX]; }; /* * Initialize iterator that runs through all events and counters. */ static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints, int num, int wmin, int wmax, int gpmax) { int idx; memset(sched, 0, sizeof(*sched)); sched->max_events = num; sched->max_weight = wmax; sched->max_gp = gpmax; sched->constraints = constraints; for (idx = 0; idx < num; idx++) { if (constraints[idx]->weight == wmin) break; } sched->state.event = idx; /* start with min weight */ sched->state.weight = wmin; sched->state.unassigned = num; } static void perf_sched_save_state(struct perf_sched *sched) { if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX)) return; sched->saved[sched->saved_states] = sched->state; sched->saved_states++; } static bool perf_sched_restore_state(struct perf_sched *sched) { if (!sched->saved_states) return false; sched->saved_states--; sched->state = sched->saved[sched->saved_states]; /* this assignment didn't work out */ /* XXX broken vs EVENT_PAIR */ sched->state.used &= ~BIT_ULL(sched->state.counter); /* try the next one */ sched->state.counter++; return true; } /* * Select a counter for the current event to schedule. Return true on * success. */ static bool __perf_sched_find_counter(struct perf_sched *sched) { struct event_constraint *c; int idx; if (!sched->state.unassigned) return false; if (sched->state.event >= sched->max_events) return false; c = sched->constraints[sched->state.event]; /* Prefer fixed purpose counters */ if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) { idx = INTEL_PMC_IDX_FIXED; for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) { u64 mask = BIT_ULL(idx); if (sched->state.used & mask) continue; sched->state.used |= mask; goto done; } } /* Grab the first unused counter starting with idx */ idx = sched->state.counter; for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) { u64 mask = BIT_ULL(idx); if (c->flags & PERF_X86_EVENT_PAIR) mask |= mask << 1; if (sched->state.used & mask) continue; if (sched->state.nr_gp++ >= sched->max_gp) return false; sched->state.used |= mask; goto done; } return false; done: sched->state.counter = idx; if (c->overlap) perf_sched_save_state(sched); return true; } static bool perf_sched_find_counter(struct perf_sched *sched) { while (!__perf_sched_find_counter(sched)) { if (!perf_sched_restore_state(sched)) return false; } return true; } /* * Go through all unassigned events and find the next one to schedule. * Take events with the least weight first. Return true on success. */ static bool perf_sched_next_event(struct perf_sched *sched) { struct event_constraint *c; if (!sched->state.unassigned || !--sched->state.unassigned) return false; do { /* next event */ sched->state.event++; if (sched->state.event >= sched->max_events) { /* next weight */ sched->state.event = 0; sched->state.weight++; if (sched->state.weight > sched->max_weight) return false; } c = sched->constraints[sched->state.event]; } while (c->weight != sched->state.weight); sched->state.counter = 0; /* start with first counter */ return true; } /* * Assign a counter for each event. */ int perf_assign_events(struct event_constraint **constraints, int n, int wmin, int wmax, int gpmax, int *assign) { struct perf_sched sched; perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax); do { if (!perf_sched_find_counter(&sched)) break; /* failed */ if (assign) assign[sched.state.event] = sched.state.counter; } while (perf_sched_next_event(&sched)); return sched.state.unassigned; } EXPORT_SYMBOL_GPL(perf_assign_events); int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) { struct event_constraint *c; struct perf_event *e; int n0, i, wmin, wmax, unsched = 0; struct hw_perf_event *hwc; u64 used_mask = 0; /* * Compute the number of events already present; see x86_pmu_add(), * validate_group() and x86_pmu_commit_txn(). For the former two * cpuc->n_events hasn't been updated yet, while for the latter * cpuc->n_txn contains the number of events added in the current * transaction. */ n0 = cpuc->n_events; if (cpuc->txn_flags & PERF_PMU_TXN_ADD) n0 -= cpuc->n_txn; static_call_cond(x86_pmu_start_scheduling)(cpuc); for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { c = cpuc->event_constraint[i]; /* * Previously scheduled events should have a cached constraint, * while new events should not have one. */ WARN_ON_ONCE((c && i >= n0) || (!c && i < n0)); /* * Request constraints for new events; or for those events that * have a dynamic constraint -- for those the constraint can * change due to external factors (sibling state, allow_tfa). */ if (!c || (c->flags & PERF_X86_EVENT_DYNAMIC)) { c = static_call(x86_pmu_get_event_constraints)(cpuc, i, cpuc->event_list[i]); cpuc->event_constraint[i] = c; } wmin = min(wmin, c->weight); wmax = max(wmax, c->weight); } /* * fastpath, try to reuse previous register */ for (i = 0; i < n; i++) { u64 mask; hwc = &cpuc->event_list[i]->hw; c = cpuc->event_constraint[i]; /* never assigned */ if (hwc->idx == -1) break; /* constraint still honored */ if (!test_bit(hwc->idx, c->idxmsk)) break; mask = BIT_ULL(hwc->idx); if (is_counter_pair(hwc)) mask |= mask << 1; /* not already used */ if (used_mask & mask) break; used_mask |= mask; if (assign) assign[i] = hwc->idx; } /* slow path */ if (i != n) { int gpmax = x86_pmu_max_num_counters(cpuc->pmu); /* * Do not allow scheduling of more than half the available * generic counters. * * This helps avoid counter starvation of sibling thread by * ensuring at most half the counters cannot be in exclusive * mode. There is no designated counters for the limits. Any * N/2 counters can be used. This helps with events with * specific counter constraints. */ if (is_ht_workaround_enabled() && !cpuc->is_fake && READ_ONCE(cpuc->excl_cntrs->exclusive_present)) gpmax /= 2; /* * Reduce the amount of available counters to allow fitting * the extra Merge events needed by large increment events. */ if (x86_pmu.flags & PMU_FL_PAIR) { gpmax -= cpuc->n_pair; WARN_ON(gpmax <= 0); } unsched = perf_assign_events(cpuc->event_constraint, n, wmin, wmax, gpmax, assign); } /* * In case of success (unsched = 0), mark events as committed, * so we do not put_constraint() in case new events are added * and fail to be scheduled * * We invoke the lower level commit callback to lock the resource * * We do not need to do all of this in case we are called to * validate an event group (assign == NULL) */ if (!unsched && assign) { for (i = 0; i < n; i++) static_call_cond(x86_pmu_commit_scheduling)(cpuc, i, assign[i]); } else { for (i = n0; i < n; i++) { e = cpuc->event_list[i]; /* * release events that failed scheduling */ static_call_cond(x86_pmu_put_event_constraints)(cpuc, e); cpuc->event_constraint[i] = NULL; } } static_call_cond(x86_pmu_stop_scheduling)(cpuc); return unsched ? -EINVAL : 0; } static int add_nr_metric_event(struct cpu_hw_events *cpuc, struct perf_event *event) { if (is_metric_event(event)) { if (cpuc->n_metric == INTEL_TD_METRIC_NUM) return -EINVAL; cpuc->n_metric++; cpuc->n_txn_metric++; } return 0; } static void del_nr_metric_event(struct cpu_hw_events *cpuc, struct perf_event *event) { if (is_metric_event(event)) cpuc->n_metric--; } static int collect_event(struct cpu_hw_events *cpuc, struct perf_event *event, int max_count, int n) { union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap); if (intel_cap.perf_metrics && add_nr_metric_event(cpuc, event)) return -EINVAL; if (n >= max_count + cpuc->n_metric) return -EINVAL; cpuc->event_list[n] = event; if (is_counter_pair(&event->hw)) { cpuc->n_pair++; cpuc->n_txn_pair++; } return 0; } /* * dogrp: true if must collect siblings events (group) * returns total number of events and error code */ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp) { struct perf_event *event; int n, max_count; max_count = x86_pmu_num_counters(cpuc->pmu) + x86_pmu_num_counters_fixed(cpuc->pmu); /* current number of events already accepted */ n = cpuc->n_events; if (!cpuc->n_events) cpuc->pebs_output = 0; if (!cpuc->is_fake && leader->attr.precise_ip) { /* * For PEBS->PT, if !aux_event, the group leader (PT) went * away, the group was broken down and this singleton event * can't schedule any more. */ if (is_pebs_pt(leader) && !leader->aux_event) return -EINVAL; /* * pebs_output: 0: no PEBS so far, 1: PT, 2: DS */ if (cpuc->pebs_output && cpuc->pebs_output != is_pebs_pt(leader) + 1) return -EINVAL; cpuc->pebs_output = is_pebs_pt(leader) + 1; } if (is_x86_event(leader)) { if (collect_event(cpuc, leader, max_count, n)) return -EINVAL; n++; } if (!dogrp) return n; for_each_sibling_event(event, leader) { if (!is_x86_event(event) || event->state <= PERF_EVENT_STATE_OFF) continue; if (collect_event(cpuc, event, max_count, n)) return -EINVAL; n++; } return n; } static inline void x86_assign_hw_event(struct perf_event *event, struct cpu_hw_events *cpuc, int i) { struct hw_perf_event *hwc = &event->hw; int idx; idx = hwc->idx = cpuc->assign[i]; hwc->last_cpu = smp_processor_id(); hwc->last_tag = ++cpuc->tags[i]; static_call_cond(x86_pmu_assign)(event, idx); switch (hwc->idx) { case INTEL_PMC_IDX_FIXED_BTS: case INTEL_PMC_IDX_FIXED_VLBR: hwc->config_base = 0; hwc->event_base = 0; break; case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END: /* All the metric events are mapped onto the fixed counter 3. */ idx = INTEL_PMC_IDX_FIXED_SLOTS; fallthrough; case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS-1: hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; hwc->event_base = x86_pmu_fixed_ctr_addr(idx - INTEL_PMC_IDX_FIXED); hwc->event_base_rdpmc = (idx - INTEL_PMC_IDX_FIXED) | INTEL_PMC_FIXED_RDPMC_BASE; break; default: hwc->config_base = x86_pmu_config_addr(hwc->idx); hwc->event_base = x86_pmu_event_addr(hwc->idx); hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx); break; } } /** * x86_perf_rdpmc_index - Return PMC counter used for event * @event: the perf_event to which the PMC counter was assigned * * The counter assigned to this performance event may change if interrupts * are enabled. This counter should thus never be used while interrupts are * enabled. Before this function is used to obtain the assigned counter the * event should be checked for validity using, for example, * perf_event_read_local(), within the same interrupt disabled section in * which this counter is planned to be used. * * Return: The index of the performance monitoring counter assigned to * @perf_event. */ int x86_perf_rdpmc_index(struct perf_event *event) { lockdep_assert_irqs_disabled(); return event->hw.event_base_rdpmc; } static inline int match_prev_assignment(struct perf_event *event, struct cpu_hw_events *cpuc, int i) { struct hw_perf_event *hwc = &event->hw; return hwc->idx == cpuc->assign[i] && hwc->last_cpu == smp_processor_id() && hwc->last_tag == cpuc->tags[i] && !is_acr_event_group(event); } static void x86_pmu_start(struct perf_event *event, int flags); static void x86_pmu_enable(struct pmu *pmu) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_event *event; struct hw_perf_event *hwc; int i, added = cpuc->n_added; if (!x86_pmu_initialized()) return; if (cpuc->enabled) return; if (cpuc->n_added) { int n_running = cpuc->n_events - cpuc->n_added; /* * The late setup (after counters are scheduled) * is required for some cases, e.g., PEBS counters * snapshotting. Because an accurate counter index * is needed. */ static_call_cond(x86_pmu_late_setup)(); /* * apply assignment obtained either from * hw_perf_group_sched_in() or x86_pmu_enable() * * step1: save events moving to new counters */ for (i = 0; i < n_running; i++) { event = cpuc->event_list[i]; hwc = &event->hw; /* * we can avoid reprogramming counter if: * - assigned same counter as last time * - running on same CPU as last time * - no other event has used the counter since */ if (hwc->idx == -1 || match_prev_assignment(event, cpuc, i)) continue; /* * Ensure we don't accidentally enable a stopped * counter simply because we rescheduled. */ if (hwc->state & PERF_HES_STOPPED) hwc->state |= PERF_HES_ARCH; x86_pmu_stop(event, PERF_EF_UPDATE); cpuc->events[hwc->idx] = NULL; } /* * step2: reprogram moved events into new counters */ for (i = 0; i < cpuc->n_events; i++) { event = cpuc->event_list[i]; hwc = &event->hw; if (!match_prev_assignment(event, cpuc, i)) x86_assign_hw_event(event, cpuc, i); else if (i < n_running) continue; cpuc->events[hwc->idx] = event; if (hwc->state & PERF_HES_ARCH) { static_call(x86_pmu_set_period)(event); continue; } /* * if cpuc->enabled = 0, then no wrmsr as * per x86_pmu_enable_event() */ x86_pmu_start(event, PERF_EF_RELOAD); } cpuc->n_added = 0; perf_events_lapic_init(); } cpuc->enabled = 1; barrier(); static_call(x86_pmu_enable_all)(added); } DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); /* * Set the next IRQ period, based on the hwc->period_left value. * To be called with the event disabled in hw: */ int x86_perf_event_set_period(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; s64 left = local64_read(&hwc->period_left); s64 period = hwc->sample_period; int ret = 0, idx = hwc->idx; if (unlikely(!hwc->event_base)) return 0; /* * If we are way outside a reasonable range then just skip forward: */ if (unlikely(left <= -period)) { left = period; local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } if (unlikely(left <= 0)) { left += period; local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } /* * Quirk: certain CPUs dont like it if just 1 hw_event is left: */ if (unlikely(left < 2)) left = 2; if (left > x86_pmu.max_period) left = x86_pmu.max_period; static_call_cond(x86_pmu_limit_period)(event, &left); this_cpu_write(pmc_prev_left[idx], left); /* * The hw event starts counting from this event offset, * mark it to be able to extra future deltas: */ local64_set(&hwc->prev_count, (u64)-left); wrmsrq(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); /* * Sign extend the Merge event counter's upper 16 bits since * we currently declare a 48-bit counter width */ if (is_counter_pair(hwc)) wrmsrq(x86_pmu_event_addr(idx + 1), 0xffff); perf_event_update_userpage(event); return ret; } void x86_pmu_enable_event(struct perf_event *event) { if (__this_cpu_read(cpu_hw_events.enabled)) __x86_pmu_enable_event(&event->hw, ARCH_PERFMON_EVENTSEL_ENABLE); } /* * Add a single event to the PMU. * * The event is added to the group of enabled events * but only if it can be scheduled with existing events. */ static int x86_pmu_add(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc; int assign[X86_PMC_IDX_MAX]; int n, n0, ret; hwc = &event->hw; n0 = cpuc->n_events; ret = n = collect_events(cpuc, event, false); if (ret < 0) goto out; hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; if (!(flags & PERF_EF_START)) hwc->state |= PERF_HES_ARCH; /* * If group events scheduling transaction was started, * skip the schedulability test here, it will be performed * at commit time (->commit_txn) as a whole. * * If commit fails, we'll call ->del() on all events * for which ->add() was called. */ if (cpuc->txn_flags & PERF_PMU_TXN_ADD) goto done_collect; ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign); if (ret) goto out; /* * copy new assignment, now we know it is possible * will be used by hw_perf_enable() */ memcpy(cpuc->assign, assign, n*sizeof(int)); done_collect: /* * Commit the collect_events() state. See x86_pmu_del() and * x86_pmu_*_txn(). */ cpuc->n_events = n; cpuc->n_added += n - n0; cpuc->n_txn += n - n0; /* * This is before x86_pmu_enable() will call x86_pmu_start(), * so we enable LBRs before an event needs them etc.. */ static_call_cond(x86_pmu_add)(event); ret = 0; out: return ret; } static void x86_pmu_start(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx = event->hw.idx; if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) return; if (WARN_ON_ONCE(idx == -1)) return; if (flags & PERF_EF_RELOAD) { WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); static_call(x86_pmu_set_period)(event); } event->hw.state = 0; __set_bit(idx, cpuc->active_mask); static_call(x86_pmu_enable)(event); perf_event_update_userpage(event); } void perf_event_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; unsigned long *cntr_mask, *fixed_cntr_mask; struct event_constraint *pebs_constraints; struct cpu_hw_events *cpuc; u64 pebs, debugctl; int cpu, idx; guard(irqsave)(); cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_events, cpu); cntr_mask = hybrid(cpuc->pmu, cntr_mask); fixed_cntr_mask = hybrid(cpuc->pmu, fixed_cntr_mask); pebs_constraints = hybrid(cpuc->pmu, pebs_constraints); if (!*(u64 *)cntr_mask) return; if (x86_pmu.version >= 2) { rdmsrq(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); rdmsrq(MSR_CORE_PERF_GLOBAL_STATUS, status); rdmsrq(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); rdmsrq(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); pr_info("\n"); pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); pr_info("CPU#%d: status: %016llx\n", cpu, status); pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); if (pebs_constraints) { rdmsrq(MSR_IA32_PEBS_ENABLE, pebs); pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs); } if (x86_pmu.lbr_nr) { rdmsrq(MSR_IA32_DEBUGCTLMSR, debugctl); pr_info("CPU#%d: debugctl: %016llx\n", cpu, debugctl); } } pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); for_each_set_bit(idx, cntr_mask, X86_PMC_IDX_MAX) { rdmsrq(x86_pmu_config_addr(idx), pmc_ctrl); rdmsrq(x86_pmu_event_addr(idx), pmc_count); prev_left = per_cpu(pmc_prev_left[idx], cpu); pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", cpu, idx, pmc_ctrl); pr_info("CPU#%d: gen-PMC%d count: %016llx\n", cpu, idx, pmc_count); pr_info("CPU#%d: gen-PMC%d left: %016llx\n", cpu, idx, prev_left); } for_each_set_bit(idx, fixed_cntr_mask, X86_PMC_IDX_MAX) { if (fixed_counter_disabled(idx, cpuc->pmu)) continue; rdmsrq(x86_pmu_fixed_ctr_addr(idx), pmc_count); pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", cpu, idx, pmc_count); } } void x86_pmu_stop(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; if (test_bit(hwc->idx, cpuc->active_mask)) { static_call(x86_pmu_disable)(event); __clear_bit(hwc->idx, cpuc->active_mask); WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); hwc->state |= PERF_HES_STOPPED; } if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { /* * Drain the remaining delta count out of a event * that we are disabling: */ static_call(x86_pmu_update)(event); hwc->state |= PERF_HES_UPTODATE; } } static void x86_pmu_del(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap); int i; /* * If we're called during a txn, we only need to undo x86_pmu.add. * The events never got scheduled and ->cancel_txn will truncate * the event_list. * * XXX assumes any ->del() called during a TXN will only be on * an event added during that same TXN. */ if (cpuc->txn_flags & PERF_PMU_TXN_ADD) goto do_del; __set_bit(event->hw.idx, cpuc->dirty); /* * Not a TXN, therefore cleanup properly. */ x86_pmu_stop(event, PERF_EF_UPDATE); cpuc->events[event->hw.idx] = NULL; for (i = 0; i < cpuc->n_events; i++) { if (event == cpuc->event_list[i]) break; } if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */ return; /* If we have a newly added event; make sure to decrease n_added. */ if (i >= cpuc->n_events - cpuc->n_added) --cpuc->n_added; static_call_cond(x86_pmu_put_event_constraints)(cpuc, event); /* Delete the array entry. */ while (++i < cpuc->n_events) { cpuc->event_list[i-1] = cpuc->event_list[i]; cpuc->event_constraint[i-1] = cpuc->event_constraint[i]; cpuc->assign[i-1] = cpuc->assign[i]; } cpuc->event_constraint[i-1] = NULL; --cpuc->n_events; if (intel_cap.perf_metrics) del_nr_metric_event(cpuc, event); perf_event_update_userpage(event); do_del: /* * This is after x86_pmu_stop(); so we disable LBRs after any * event can need them etc.. */ static_call_cond(x86_pmu_del)(event); } int x86_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; struct cpu_hw_events *cpuc; struct perf_event *event; int idx, handled = 0; u64 last_period; u64 val; cpuc = this_cpu_ptr(&cpu_hw_events); /* * Some chipsets need to unmask the LVTPC in a particular spot * inside the nmi handler. As a result, the unmasking was pushed * into all the nmi handlers. * * This generic handler doesn't seem to have any issues where the * unmasking occurs so it was left at the top. */ apic_write(APIC_LVTPC, APIC_DM_NMI); for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { if (!test_bit(idx, cpuc->active_mask)) continue; event = cpuc->events[idx]; last_period = event->hw.last_period; val = static_call(x86_pmu_update)(event); if (val & (1ULL << (x86_pmu.cntval_bits - 1))) continue; /* * event overflow */ handled++; if (!static_call(x86_pmu_set_period)(event)) continue; perf_sample_data_init(&data, 0, last_period); perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL); perf_event_overflow(event, &data, regs); } if (handled) inc_irq_stat(apic_perf_irqs); return handled; } void perf_events_lapic_init(void) { if (!x86_pmu.apic || !x86_pmu_initialized()) return; /* * Always use NMI for PMU */ apic_write(APIC_LVTPC, APIC_DM_NMI); } #ifdef CONFIG_PERF_GUEST_MEDIATED_PMU void perf_load_guest_lvtpc(u32 guest_lvtpc) { u32 masked = guest_lvtpc & APIC_LVT_MASKED; apic_write(APIC_LVTPC, APIC_DM_FIXED | PERF_GUEST_MEDIATED_PMI_VECTOR | masked); this_cpu_write(guest_lvtpc_loaded, true); } EXPORT_SYMBOL_FOR_KVM(perf_load_guest_lvtpc); void perf_put_guest_lvtpc(void) { this_cpu_write(guest_lvtpc_loaded, false); apic_write(APIC_LVTPC, APIC_DM_NMI); } EXPORT_SYMBOL_FOR_KVM(perf_put_guest_lvtpc); #endif /* CONFIG_PERF_GUEST_MEDIATED_PMU */ static int perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) { u64 start_clock; u64 finish_clock; int ret; /* * Ignore all NMIs when the CPU's LVTPC is configured to route PMIs to * PERF_GUEST_MEDIATED_PMI_VECTOR, i.e. when an NMI time can't be due * to a PMI. Attempting to handle a PMI while the guest's context is * loaded will generate false positives and clobber guest state. Note, * the LVTPC is switched to/from the dedicated mediated PMI IRQ vector * while host events are quiesced. */ if (this_cpu_read(guest_lvtpc_loaded)) return NMI_DONE; /* * All PMUs/events that share this PMI handler should make sure to * increment active_events for their events. */ if (!atomic_read(&active_events)) return NMI_DONE; start_clock = sched_clock(); ret = static_call(x86_pmu_handle_irq)(regs); finish_clock = sched_clock(); perf_sample_event_took(finish_clock - start_clock); return ret; } NOKPROBE_SYMBOL(perf_event_nmi_handler); struct event_constraint emptyconstraint; struct event_constraint unconstrained; static int x86_pmu_prepare_cpu(unsigned int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); int i; for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) cpuc->kfree_on_online[i] = NULL; if (x86_pmu.cpu_prepare) return x86_pmu.cpu_prepare(cpu); return 0; } static int x86_pmu_dead_cpu(unsigned int cpu) { if (x86_pmu.cpu_dead) x86_pmu.cpu_dead(cpu); return 0; } static int x86_pmu_online_cpu(unsigned int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); int i; for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) { kfree(cpuc->kfree_on_online[i]); cpuc->kfree_on_online[i] = NULL; } return 0; } static int x86_pmu_starting_cpu(unsigned int cpu) { if (x86_pmu.cpu_starting) x86_pmu.cpu_starting(cpu); return 0; } static int x86_pmu_dying_cpu(unsigned int cpu) { if (x86_pmu.cpu_dying) x86_pmu.cpu_dying(cpu); return 0; } static void __init pmu_check_apic(void) { if (boot_cpu_has(X86_FEATURE_APIC)) return; x86_pmu.apic = 0; pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); pr_info("no hardware sampling interrupt available.\n"); /* * If we have a PMU initialized but no APIC * interrupts, we cannot sample hardware * events (user-space has to fall back and * sample via a hrtimer based software event): */ pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; } static struct attribute_group x86_pmu_format_group __ro_after_init = { .name = "format", .attrs = NULL, }; ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) { struct perf_pmu_events_attr *pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr); u64 config = 0; if (pmu_attr->id < x86_pmu.max_events) config = x86_pmu.event_map(pmu_attr->id); /* string trumps id */ if (pmu_attr->event_str) return sprintf(page, "%s\n", pmu_attr->event_str); return x86_pmu.events_sysfs_show(page, config); } EXPORT_SYMBOL_GPL(events_sysfs_show); ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) { struct perf_pmu_events_ht_attr *pmu_attr = container_of(attr, struct perf_pmu_events_ht_attr, attr); /* * Report conditional events depending on Hyper-Threading. * * This is overly conservative as usually the HT special * handling is not needed if the other CPU thread is idle. * * Note this does not (and cannot) handle the case when thread * siblings are invisible, for example with virtualization * if they are owned by some other guest. The user tool * has to re-read when a thread sibling gets onlined later. */ return sprintf(page, "%s", topology_max_smt_threads() > 1 ? pmu_attr->event_str_ht : pmu_attr->event_str_noht); } ssize_t events_hybrid_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) { struct perf_pmu_events_hybrid_attr *pmu_attr = container_of(attr, struct perf_pmu_events_hybrid_attr, attr); struct x86_hybrid_pmu *pmu; const char *str, *next_str; int i; if (hweight64(pmu_attr->pmu_type) == 1) return sprintf(page, "%s", pmu_attr->event_str); /* * Hybrid PMUs may support the same event name, but with different * event encoding, e.g., the mem-loads event on an Atom PMU has * different event encoding from a Core PMU. * * The event_str includes all event encodings. Each event encoding * is divided by ";". The order of the event encodings must follow * the order of the hybrid PMU index. */ pmu = container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu); str = pmu_attr->event_str; for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { if (!(x86_pmu.hybrid_pmu[i].pmu_type & pmu_attr->pmu_type)) continue; if (x86_pmu.hybrid_pmu[i].pmu_type & pmu->pmu_type) { next_str = strchr(str, ';'); if (next_str) return snprintf(page, next_str - str + 1, "%s", str); else return sprintf(page, "%s", str); } str = strchr(str, ';'); str++; } return 0; } EXPORT_SYMBOL_GPL(events_hybrid_sysfs_show); EVENT_ATTR(cpu-cycles, CPU_CYCLES ); EVENT_ATTR(instructions, INSTRUCTIONS ); EVENT_ATTR(cache-references, CACHE_REFERENCES ); EVENT_ATTR(cache-misses, CACHE_MISSES ); EVENT_ATTR(branch-instructions, BRANCH_INSTRUCTIONS ); EVENT_ATTR(branch-misses, BRANCH_MISSES ); EVENT_ATTR(bus-cycles, BUS_CYCLES ); EVENT_ATTR(stalled-cycles-frontend, STALLED_CYCLES_FRONTEND ); EVENT_ATTR(stalled-cycles-backend, STALLED_CYCLES_BACKEND ); EVENT_ATTR(ref-cycles, REF_CPU_CYCLES ); static struct attribute *empty_attrs; static struct attribute *events_attr[] = { EVENT_PTR(CPU_CYCLES), EVENT_PTR(INSTRUCTIONS), EVENT_PTR(CACHE_REFERENCES), EVENT_PTR(CACHE_MISSES), EVENT_PTR(BRANCH_INSTRUCTIONS), EVENT_PTR(BRANCH_MISSES), EVENT_PTR(BUS_CYCLES), EVENT_PTR(STALLED_CYCLES_FRONTEND), EVENT_PTR(STALLED_CYCLES_BACKEND), EVENT_PTR(REF_CPU_CYCLES), NULL, }; /* * Remove all undefined events (x86_pmu.event_map(id) == 0) * out of events_attr attributes. */ static umode_t is_visible(struct kobject *kobj, struct attribute *attr, int idx) { struct perf_pmu_events_attr *pmu_attr; if (idx >= x86_pmu.max_events) return 0; pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr.attr); /* str trumps id */ return pmu_attr->event_str || x86_pmu.event_map(idx) ? attr->mode : 0; } static struct attribute_group x86_pmu_events_group __ro_after_init = { .name = "events", .attrs = events_attr, .is_visible = is_visible, }; ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event) { u64 umask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; u64 cmask = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24; bool edge = (config & ARCH_PERFMON_EVENTSEL_EDGE); bool pc = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL); bool any = (config & ARCH_PERFMON_EVENTSEL_ANY); bool inv = (config & ARCH_PERFMON_EVENTSEL_INV); ssize_t ret; /* * We have whole page size to spend and just little data * to write, so we can safely use sprintf. */ ret = sprintf(page, "event=0x%02llx", event); if (umask) ret += sprintf(page + ret, ",umask=0x%02llx", umask); if (edge) ret += sprintf(page + ret, ",edge"); if (pc) ret += sprintf(page + ret, ",pc"); if (any) ret += sprintf(page + ret, ",any"); if (inv) ret += sprintf(page + ret, ",inv"); if (cmask) ret += sprintf(page + ret, ",cmask=0x%02llx", cmask); ret += sprintf(page + ret, "\n"); return ret; } static struct attribute_group x86_pmu_attr_group; static struct attribute_group x86_pmu_caps_group; static void x86_pmu_static_call_update(void) { static_call_update(x86_pmu_handle_irq, x86_pmu.handle_irq); static_call_update(x86_pmu_disable_all, x86_pmu.disable_all); static_call_update(x86_pmu_enable_all, x86_pmu.enable_all); static_call_update(x86_pmu_enable, x86_pmu.enable); static_call_update(x86_pmu_disable, x86_pmu.disable); static_call_update(x86_pmu_assign, x86_pmu.assign); static_call_update(x86_pmu_add, x86_pmu.add); static_call_update(x86_pmu_del, x86_pmu.del); static_call_update(x86_pmu_read, x86_pmu.read); static_call_update(x86_pmu_set_period, x86_pmu.set_period); static_call_update(x86_pmu_update, x86_pmu.update); static_call_update(x86_pmu_limit_period, x86_pmu.limit_period); static_call_update(x86_pmu_schedule_events, x86_pmu.schedule_events); static_call_update(x86_pmu_get_event_constraints, x86_pmu.get_event_constraints); static_call_update(x86_pmu_put_event_constraints, x86_pmu.put_event_constraints); static_call_update(x86_pmu_start_scheduling, x86_pmu.start_scheduling); static_call_update(x86_pmu_commit_scheduling, x86_pmu.commit_scheduling); static_call_update(x86_pmu_stop_scheduling, x86_pmu.stop_scheduling); static_call_update(x86_pmu_sched_task, x86_pmu.sched_task); static_call_update(x86_pmu_drain_pebs, x86_pmu.drain_pebs); static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases); static_call_update(x86_pmu_guest_get_msrs, x86_pmu.guest_get_msrs); static_call_update(x86_pmu_filter, x86_pmu.filter); static_call_update(x86_pmu_late_setup, x86_pmu.late_setup); static_call_update(x86_pmu_pebs_enable, x86_pmu.pebs_enable); static_call_update(x86_pmu_pebs_disable, x86_pmu.pebs_disable); static_call_update(x86_pmu_pebs_enable_all, x86_pmu.pebs_enable_all); static_call_update(x86_pmu_pebs_disable_all, x86_pmu.pebs_disable_all); } static void _x86_pmu_read(struct perf_event *event) { static_call(x86_pmu_update)(event); } void x86_pmu_show_pmu_cap(struct pmu *pmu) { pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.cntval_bits); pr_info("... generic counters: %d\n", x86_pmu_num_counters(pmu)); pr_info("... generic bitmap: %016llx\n", hybrid(pmu, cntr_mask64)); pr_info("... fixed-purpose counters: %d\n", x86_pmu_num_counters_fixed(pmu)); pr_info("... fixed-purpose bitmap: %016llx\n", hybrid(pmu, fixed_cntr_mask64)); pr_info("... value mask: %016llx\n", x86_pmu.cntval_mask); pr_info("... max period: %016llx\n", x86_pmu.max_period); pr_info("... global_ctrl mask: %016llx\n", hybrid(pmu, intel_ctrl)); } static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; int err; pr_info("Performance Events: "); switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: err = intel_pmu_init(); break; case X86_VENDOR_AMD: err = amd_pmu_init(); break; case X86_VENDOR_HYGON: err = amd_pmu_init(); x86_pmu.name = "HYGON"; break; case X86_VENDOR_ZHAOXIN: case X86_VENDOR_CENTAUR: err = zhaoxin_pmu_init(); break; default: err = -ENOTSUPP; } if (err != 0) { pr_cont("no PMU driver, software events only.\n"); err = 0; goto out_bad_pmu; } pmu_check_apic(); /* sanity check that the hardware exists or is emulated */ if (!check_hw_exists(&pmu, x86_pmu.cntr_mask, x86_pmu.fixed_cntr_mask)) goto out_bad_pmu; pr_cont("%s PMU driver.\n", x86_pmu.name); /* enable userspace RDPMC usage by default */ x86_pmu.attr_rdpmc = X86_USER_RDPMC_CONDITIONAL_ENABLE; for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) quirk->func(); if (!x86_pmu.intel_ctrl) x86_pmu.intel_ctrl = x86_pmu.cntr_mask64; if (!x86_pmu.config_mask) x86_pmu.config_mask = X86_RAW_EVENT_MASK; perf_events_lapic_init(); register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI"); unconstrained = (struct event_constraint) __EVENT_CONSTRAINT(0, x86_pmu.cntr_mask64, 0, x86_pmu_num_counters(NULL), 0, 0); x86_pmu_format_group.attrs = x86_pmu.format_attrs; if (!x86_pmu.events_sysfs_show) x86_pmu_events_group.attrs = &empty_attrs; pmu.attr_update = x86_pmu.attr_update; if (!is_hybrid()) x86_pmu_show_pmu_cap(NULL); if (!x86_pmu.read) x86_pmu.read = _x86_pmu_read; if (!x86_pmu.guest_get_msrs) x86_pmu.guest_get_msrs = (void *)&__static_call_return0; if (!x86_pmu.set_period) x86_pmu.set_period = x86_perf_event_set_period; if (!x86_pmu.update) x86_pmu.update = x86_perf_event_update; x86_pmu_static_call_update(); /* * Install callbacks. Core will call them for each online * cpu. */ err = cpuhp_setup_state(CPUHP_PERF_X86_PREPARE, "perf/x86:prepare", x86_pmu_prepare_cpu, x86_pmu_dead_cpu); if (err) return err; err = cpuhp_setup_state(CPUHP_AP_PERF_X86_STARTING, "perf/x86:starting", x86_pmu_starting_cpu, x86_pmu_dying_cpu); if (err) goto out; err = cpuhp_setup_state(CPUHP_AP_PERF_X86_ONLINE, "perf/x86:online", x86_pmu_online_cpu, NULL); if (err) goto out1; if (!is_hybrid()) { err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); if (err) goto out2; } else { struct x86_hybrid_pmu *hybrid_pmu; int i, j; for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { hybrid_pmu = &x86_pmu.hybrid_pmu[i]; hybrid_pmu->pmu = pmu; hybrid_pmu->pmu.type = -1; hybrid_pmu->pmu.attr_update = x86_pmu.attr_update; hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_EXTENDED_HW_TYPE; err = perf_pmu_register(&hybrid_pmu->pmu, hybrid_pmu->name, (hybrid_pmu->pmu_type == hybrid_big) ? PERF_TYPE_RAW : -1); if (err) break; } if (i < x86_pmu.num_hybrid_pmus) { for (j = 0; j < i; j++) perf_pmu_unregister(&x86_pmu.hybrid_pmu[j].pmu); pr_warn("Failed to register hybrid PMUs\n"); kfree(x86_pmu.hybrid_pmu); x86_pmu.hybrid_pmu = NULL; x86_pmu.num_hybrid_pmus = 0; goto out2; } } return 0; out2: cpuhp_remove_state(CPUHP_AP_PERF_X86_ONLINE); out1: cpuhp_remove_state(CPUHP_AP_PERF_X86_STARTING); out: cpuhp_remove_state(CPUHP_PERF_X86_PREPARE); out_bad_pmu: memset(&x86_pmu, 0, sizeof(x86_pmu)); return err; } early_initcall(init_hw_perf_events); static void x86_pmu_read(struct perf_event *event) { static_call(x86_pmu_read)(event); } /* * Start group events scheduling transaction * Set the flag to make pmu::enable() not perform the * schedulability test, it will be performed at commit time * * We only support PERF_PMU_TXN_ADD transactions. Save the * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD * transactions. */ static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); WARN_ON_ONCE(cpuc->txn_flags); /* txn already in flight */ cpuc->txn_flags = txn_flags; if (txn_flags & ~PERF_PMU_TXN_ADD) return; perf_pmu_disable(pmu); __this_cpu_write(cpu_hw_events.n_txn, 0); __this_cpu_write(cpu_hw_events.n_txn_pair, 0); __this_cpu_write(cpu_hw_events.n_txn_metric, 0); } /* * Stop group events scheduling transaction * Clear the flag and pmu::enable() will perform the * schedulability test. */ static void x86_pmu_cancel_txn(struct pmu *pmu) { unsigned int txn_flags; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */ txn_flags = cpuc->txn_flags; cpuc->txn_flags = 0; if (txn_flags & ~PERF_PMU_TXN_ADD) return; /* * Truncate collected array by the number of events added in this * transaction. See x86_pmu_add() and x86_pmu_*_txn(). */ __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn)); __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn)); __this_cpu_sub(cpu_hw_events.n_pair, __this_cpu_read(cpu_hw_events.n_txn_pair)); __this_cpu_sub(cpu_hw_events.n_metric, __this_cpu_read(cpu_hw_events.n_txn_metric)); perf_pmu_enable(pmu); } /* * Commit group events scheduling transaction * Perform the group schedulability test as a whole * Return 0 if success * * Does not cancel the transaction on failure; expects the caller to do this. */ static int x86_pmu_commit_txn(struct pmu *pmu) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int assign[X86_PMC_IDX_MAX]; int n, ret; WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */ if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) { cpuc->txn_flags = 0; return 0; } n = cpuc->n_events; if (!x86_pmu_initialized()) return -EAGAIN; ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign); if (ret) return ret; /* * copy new assignment, now we know it is possible * will be used by hw_perf_enable() */ memcpy(cpuc->assign, assign, n*sizeof(int)); cpuc->txn_flags = 0; perf_pmu_enable(pmu); return 0; } /* * a fake_cpuc is used to validate event groups. Due to * the extra reg logic, we need to also allocate a fake * per_core and per_cpu structure. Otherwise, group events * using extra reg may conflict without the kernel being * able to catch this when the last event gets added to * the group. */ static void free_fake_cpuc(struct cpu_hw_events *cpuc) { intel_cpuc_finish(cpuc); kfree(cpuc); } static struct cpu_hw_events *allocate_fake_cpuc(struct pmu *event_pmu) { struct cpu_hw_events *cpuc; int cpu; cpuc = kzalloc_obj(*cpuc); if (!cpuc) return ERR_PTR(-ENOMEM); cpuc->is_fake = 1; if (is_hybrid()) { struct x86_hybrid_pmu *h_pmu; h_pmu = hybrid_pmu(event_pmu); if (cpumask_empty(&h_pmu->supported_cpus)) goto error; cpu = cpumask_first(&h_pmu->supported_cpus); } else cpu = raw_smp_processor_id(); cpuc->pmu = event_pmu; if (intel_cpuc_prepare(cpuc, cpu)) goto error; return cpuc; error: free_fake_cpuc(cpuc); return ERR_PTR(-ENOMEM); } /* * validate that we can schedule this event */ static int validate_event(struct perf_event *event) { struct cpu_hw_events *fake_cpuc; struct event_constraint *c; int ret = 0; fake_cpuc = allocate_fake_cpuc(event->pmu); if (IS_ERR(fake_cpuc)) return PTR_ERR(fake_cpuc); c = x86_pmu.get_event_constraints(fake_cpuc, 0, event); if (!c || !c->weight) ret = -EINVAL; if (x86_pmu.put_event_constraints) x86_pmu.put_event_constraints(fake_cpuc, event); free_fake_cpuc(fake_cpuc); return ret; } /* * validate a single event group * * validation include: * - check events are compatible which each other * - events do not compete for the same counter * - number of events <= number of counters * * validation ensures the group can be loaded onto the * PMU if it was the only group available. */ static int validate_group(struct perf_event *event) { struct perf_event *leader = event->group_leader; struct cpu_hw_events *fake_cpuc; int ret = -EINVAL, n; /* * Reject events from different hybrid PMUs. */ if (is_hybrid()) { struct perf_event *sibling; struct pmu *pmu = NULL; if (is_x86_event(leader)) pmu = leader->pmu; for_each_sibling_event(sibling, leader) { if (!is_x86_event(sibling)) continue; if (!pmu) pmu = sibling->pmu; else if (pmu != sibling->pmu) return ret; } } fake_cpuc = allocate_fake_cpuc(event->pmu); if (IS_ERR(fake_cpuc)) return PTR_ERR(fake_cpuc); /* * the event is not yet connected with its * siblings therefore we must first collect * existing siblings, then add the new event * before we can simulate the scheduling */ n = collect_events(fake_cpuc, leader, true); if (n < 0) goto out; fake_cpuc->n_events = n; n = collect_events(fake_cpuc, event, false); if (n < 0) goto out; fake_cpuc->n_events = 0; ret = x86_pmu.schedule_events(fake_cpuc, n, NULL); out: free_fake_cpuc(fake_cpuc); return ret; } static int x86_pmu_event_init(struct perf_event *event) { struct x86_hybrid_pmu *pmu = NULL; int err; if ((event->attr.type != event->pmu->type) && (event->attr.type != PERF_TYPE_HARDWARE) && (event->attr.type != PERF_TYPE_HW_CACHE)) return -ENOENT; if (is_hybrid() && (event->cpu != -1)) { pmu = hybrid_pmu(event->pmu); if (!cpumask_test_cpu(event->cpu, &pmu->supported_cpus)) return -ENOENT; } err = __x86_pmu_event_init(event); if (!err) { if (event->group_leader != event) err = validate_group(event); else err = validate_event(event); } if (err) { if (event->destroy) event->destroy(event); event->destroy = NULL; } if (READ_ONCE(x86_pmu.attr_rdpmc) && !(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS)) event->hw.flags |= PERF_EVENT_FLAG_USER_READ_CNT; return err; } void perf_clear_dirty_counters(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int i; /* Don't need to clear the assigned counter. */ for (i = 0; i < cpuc->n_events; i++) __clear_bit(cpuc->assign[i], cpuc->dirty); if (bitmap_empty(cpuc->dirty, X86_PMC_IDX_MAX)) return; for_each_set_bit(i, cpuc->dirty, X86_PMC_IDX_MAX) { if (i >= INTEL_PMC_IDX_FIXED) { /* Metrics and fake events don't have corresponding HW counters. */ if (!test_bit(i - INTEL_PMC_IDX_FIXED, hybrid(cpuc->pmu, fixed_cntr_mask))) continue; wrmsrq(x86_pmu_fixed_ctr_addr(i - INTEL_PMC_IDX_FIXED), 0); } else { wrmsrq(x86_pmu_event_addr(i), 0); } } bitmap_zero(cpuc->dirty, X86_PMC_IDX_MAX); } static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm) { if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)) return; /* * This function relies on not being called concurrently in two * tasks in the same mm. Otherwise one task could observe * perf_rdpmc_allowed > 1 and return all the way back to * userspace with CR4.PCE clear while another task is still * doing on_each_cpu_mask() to propagate CR4.PCE. * * For now, this can't happen because all callers hold mmap_lock * for write. If this changes, we'll need a different solution. */ mmap_assert_write_locked(mm); if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1) on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1); } static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm) { if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)) return; if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed)) on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1); } static int x86_pmu_event_idx(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; if (!(hwc->flags & PERF_EVENT_FLAG_USER_READ_CNT)) return 0; if (is_metric_idx(hwc->idx)) return INTEL_PMC_FIXED_RDPMC_METRICS + 1; else return hwc->event_base_rdpmc + 1; } static ssize_t get_attr_rdpmc(struct device *cdev, struct device_attribute *attr, char *buf) { return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc); } /* * Behaviors of rdpmc value: * - rdpmc = 0 * global user space rdpmc and counter level's user space rdpmc of all * counters are both disabled. * - rdpmc = 1 * global user space rdpmc is enabled in mmap enabled time window and * counter level's user space rdpmc is enabled for only non system-wide * events. Counter level's user space rdpmc of system-wide events is * still disabled by default. This won't introduce counter data leak for * non system-wide events since their count data would be cleared when * context switches. * - rdpmc = 2 * global user space rdpmc and counter level's user space rdpmc of all * counters are enabled unconditionally. * * Suppose the rdpmc value won't be changed frequently, don't dynamically * reschedule events to make the new rpdmc value take effect on active perf * events immediately, the new rdpmc value would only impact the new * activated perf events. This makes code simpler and cleaner. */ static ssize_t set_attr_rdpmc(struct device *cdev, struct device_attribute *attr, const char *buf, size_t count) { static DEFINE_MUTEX(rdpmc_mutex); unsigned long val; ssize_t ret; ret = kstrtoul(buf, 0, &val); if (ret) return ret; if (val > 2) return -EINVAL; if (x86_pmu.attr_rdpmc_broken) return -ENOTSUPP; guard(mutex)(&rdpmc_mutex); if (val != x86_pmu.attr_rdpmc) { /* * Changing into or out of never available or always available, * aka perf-event-bypassing mode. This path is extremely slow, * but only root can trigger it, so it's okay. */ if (val == 0) static_branch_inc(&rdpmc_never_available_key); else if (x86_pmu.attr_rdpmc == X86_USER_RDPMC_NEVER_ENABLE) static_branch_dec(&rdpmc_never_available_key); if (val == 2) static_branch_inc(&rdpmc_always_available_key); else if (x86_pmu.attr_rdpmc == X86_USER_RDPMC_ALWAYS_ENABLE) static_branch_dec(&rdpmc_always_available_key); on_each_cpu(cr4_update_pce, NULL, 1); x86_pmu.attr_rdpmc = val; } return count; } static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc); static struct attribute *x86_pmu_attrs[] = { &dev_attr_rdpmc.attr, NULL, }; static struct attribute_group x86_pmu_attr_group __ro_after_init = { .attrs = x86_pmu_attrs, }; static ssize_t max_precise_show(struct device *cdev, struct device_attribute *attr, char *buf) { struct pmu *pmu = dev_get_drvdata(cdev); return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise(pmu)); } static DEVICE_ATTR_RO(max_precise); static struct attribute *x86_pmu_caps_attrs[] = { &dev_attr_max_precise.attr, NULL }; static struct attribute_group x86_pmu_caps_group __ro_after_init = { .name = "caps", .attrs = x86_pmu_caps_attrs, }; static const struct attribute_group *x86_pmu_attr_groups[] = { &x86_pmu_attr_group, &x86_pmu_format_group, &x86_pmu_events_group, &x86_pmu_caps_group, NULL, }; static void x86_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, struct task_struct *task, bool sched_in) { static_call_cond(x86_pmu_sched_task)(pmu_ctx, task, sched_in); } void perf_check_microcode(void) { if (x86_pmu.check_microcode) x86_pmu.check_microcode(); } static int x86_pmu_check_period(struct perf_event *event, u64 value) { if (x86_pmu.check_period && x86_pmu.check_period(event, value)) return -EINVAL; if (value && x86_pmu.limit_period) { s64 left = value; x86_pmu.limit_period(event, &left); if (left > value) return -EINVAL; } return 0; } static int x86_pmu_aux_output_match(struct perf_event *event) { if (!(pmu.capabilities & PERF_PMU_CAP_AUX_OUTPUT)) return 0; if (x86_pmu.aux_output_match) return x86_pmu.aux_output_match(event); return 0; } static bool x86_pmu_filter(struct pmu *pmu, int cpu) { bool ret = false; static_call_cond(x86_pmu_filter)(pmu, cpu, &ret); return ret; } static struct pmu pmu = { .pmu_enable = x86_pmu_enable, .pmu_disable = x86_pmu_disable, .attr_groups = x86_pmu_attr_groups, .event_init = x86_pmu_event_init, .event_mapped = x86_pmu_event_mapped, .event_unmapped = x86_pmu_event_unmapped, .add = x86_pmu_add, .del = x86_pmu_del, .start = x86_pmu_start, .stop = x86_pmu_stop, .read = x86_pmu_read, .start_txn = x86_pmu_start_txn, .cancel_txn = x86_pmu_cancel_txn, .commit_txn = x86_pmu_commit_txn, .event_idx = x86_pmu_event_idx, .sched_task = x86_pmu_sched_task, .check_period = x86_pmu_check_period, .aux_output_match = x86_pmu_aux_output_match, .filter = x86_pmu_filter, }; void arch_perf_update_userpage(struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now) { struct cyc2ns_data data; u64 offset; userpg->cap_user_time = 0; userpg->cap_user_time_zero = 0; userpg->cap_user_rdpmc = !!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT); userpg->pmc_width = x86_pmu.cntval_bits; if (!using_native_sched_clock() || !sched_clock_stable()) return; cyc2ns_read_begin(&data); offset = data.cyc2ns_offset + __sched_clock_offset; /* * Internal timekeeping for enabled/running/stopped times * is always in the local_clock domain. */ userpg->cap_user_time = 1; userpg->time_mult = data.cyc2ns_mul; userpg->time_shift = data.cyc2ns_shift; userpg->time_offset = offset - now; /* * cap_user_time_zero doesn't make sense when we're using a different * time base for the records. */ if (!event->attr.use_clockid) { userpg->cap_user_time_zero = 1; userpg->time_zero = offset; } cyc2ns_read_end(); } /* * Determine whether the regs were taken from an irq/exception handler rather * than from perf_arch_fetch_caller_regs(). */ static bool perf_hw_regs(struct pt_regs *regs) { return regs->flags & X86_EFLAGS_FIXED; } void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { struct unwind_state state; unsigned long addr; if (perf_guest_state()) { /* TODO: We don't support guest os callchain now */ return; } if (perf_hw_regs(regs)) { if (perf_callchain_store(entry, regs->ip)) return; unwind_start(&state, current, regs, NULL); } else { unwind_start(&state, current, NULL, (void *)regs->sp); } for (; !unwind_done(&state); unwind_next_frame(&state)) { addr = unwind_get_return_address(&state); if (!addr || perf_callchain_store(entry, addr)) return; } } static inline int valid_user_frame(const void __user *fp, unsigned long size) { return __access_ok(fp, size); } static unsigned long get_segment_base(unsigned int segment) { struct desc_struct *desc; unsigned int idx = segment >> 3; if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) { #ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; /* * If we're not in a valid context with a real (not just lazy) * user mm, then don't even try. */ if (!nmi_uaccess_okay()) return 0; /* IRQs are off, so this synchronizes with smp_store_release */ ldt = smp_load_acquire(¤t->mm->context.ldt); if (!ldt || idx >= ldt->nr_entries) return 0; desc = &ldt->entries[idx]; #else return 0; #endif } else { if (idx >= GDT_ENTRIES) return 0; desc = raw_cpu_ptr(gdt_page.gdt) + idx; } return get_desc_base(desc); } #ifdef CONFIG_IA32_EMULATION #include <linux/compat.h> static inline int perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry) { /* 32-bit process in 64-bit kernel. */ unsigned long ss_base, cs_base; struct stack_frame_ia32 frame; const struct stack_frame_ia32 __user *fp; u32 ret_addr; if (user_64bit_mode(regs)) return 0; cs_base = get_segment_base(regs->cs); ss_base = get_segment_base(regs->ss); fp = compat_ptr(ss_base + regs->bp); pagefault_disable(); /* see perf_callchain_user() below for why we do this */ if (is_uprobe_at_func_entry(regs) && !get_user(ret_addr, (const u32 __user *)regs->sp)) perf_callchain_store(entry, ret_addr); while (entry->nr < entry->max_stack) { if (!valid_user_frame(fp, sizeof(frame))) break; if (__get_user(frame.next_frame, &fp->next_frame)) break; if (__get_user(frame.return_address, &fp->return_address)) break; perf_callchain_store(entry, cs_base + frame.return_address); fp = compat_ptr(ss_base + frame.next_frame); } pagefault_enable(); return 1; } #else static inline int perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry) { return 0; } #endif void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { struct stack_frame frame; const struct stack_frame __user *fp; unsigned long ret_addr; if (perf_guest_state()) { /* TODO: We don't support guest os callchain now */ return; } /* * We don't know what to do with VM86 stacks.. ignore them for now. */ if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM)) return; fp = (void __user *)regs->bp; perf_callchain_store(entry, regs->ip); if (!nmi_uaccess_okay()) return; if (perf_callchain_user32(regs, entry)) return; pagefault_disable(); /* * If we are called from uprobe handler, and we are indeed at the very * entry to user function (which is normally a `push %rbp` instruction, * under assumption of application being compiled with frame pointers), * we should read return address from *regs->sp before proceeding * to follow frame pointers, otherwise we'll skip immediate caller * as %rbp is not yet setup. */ if (is_uprobe_at_func_entry(regs) && !get_user(ret_addr, (const unsigned long __user *)regs->sp)) perf_callchain_store(entry, ret_addr); while (entry->nr < entry->max_stack) { if (!valid_user_frame(fp, sizeof(frame))) break; if (__get_user(frame.next_frame, &fp->next_frame)) break; if (__get_user(frame.return_address, &fp->return_address)) break; perf_callchain_store(entry, frame.return_address); fp = (void __user *)frame.next_frame; } pagefault_enable(); } /* * Deal with code segment offsets for the various execution modes: * * VM86 - the good olde 16 bit days, where the linear address is * 20 bits and we use regs->ip + 0x10 * regs->cs. * * IA32 - Where we need to look at GDT/LDT segment descriptor tables * to figure out what the 32bit base address is. * * X32 - has TIF_X32 set, but is running in x86_64 * * X86_64 - CS,DS,SS,ES are all zero based. */ static unsigned long code_segment_base(struct pt_regs *regs) { /* * For IA32 we look at the GDT/LDT segment base to convert the * effective IP to a linear address. */ #ifdef CONFIG_X86_32 /* * If we are in VM86 mode, add the segment offset to convert to a * linear address. */ if (regs->flags & X86_VM_MASK) return 0x10 * regs->cs; if (user_mode(regs) && regs->cs != __USER_CS) return get_segment_base(regs->cs); #else if (user_mode(regs) && !user_64bit_mode(regs) && regs->cs != __USER32_CS) return get_segment_base(regs->cs); #endif return 0; } unsigned long perf_arch_instruction_pointer(struct pt_regs *regs) { return regs->ip + code_segment_base(regs); } static unsigned long common_misc_flags(struct pt_regs *regs) { if (regs->flags & PERF_EFLAGS_EXACT) return PERF_RECORD_MISC_EXACT_IP; return 0; } static unsigned long guest_misc_flags(struct pt_regs *regs) { unsigned long guest_state = perf_guest_state(); if (!(guest_state & PERF_GUEST_ACTIVE)) return 0; if (guest_state & PERF_GUEST_USER) return PERF_RECORD_MISC_GUEST_USER; else return PERF_RECORD_MISC_GUEST_KERNEL; } static unsigned long host_misc_flags(struct pt_regs *regs) { if (user_mode(regs)) return PERF_RECORD_MISC_USER; else return PERF_RECORD_MISC_KERNEL; } unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs) { unsigned long flags = common_misc_flags(regs); flags |= guest_misc_flags(regs); return flags; } unsigned long perf_arch_misc_flags(struct pt_regs *regs) { unsigned long flags = common_misc_flags(regs); flags |= host_misc_flags(regs); return flags; } void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) { /* This API doesn't currently support enumerating hybrid PMUs. */ if (WARN_ON_ONCE(cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) || !x86_pmu_initialized()) { memset(cap, 0, sizeof(*cap)); return; } /* * Note, hybrid CPU models get tracked as having hybrid PMUs even when * all E-cores are disabled via BIOS. When E-cores are disabled, the * base PMU holds the correct number of counters for P-cores. */ cap->version = x86_pmu.version; cap->num_counters_gp = x86_pmu_num_counters(NULL); cap->num_counters_fixed = x86_pmu_num_counters_fixed(NULL); cap->bit_width_gp = cap->num_counters_gp ? x86_pmu.cntval_bits : 0; cap->bit_width_fixed = cap->num_counters_fixed ? x86_pmu.cntval_bits : 0; cap->events_mask = (unsigned int)x86_pmu.events_maskl; cap->events_mask_len = x86_pmu.events_mask_len; cap->pebs_ept = x86_pmu.pebs_ept; cap->mediated = !!(pmu.capabilities & PERF_PMU_CAP_MEDIATED_VPMU); } EXPORT_SYMBOL_FOR_KVM(perf_get_x86_pmu_capability); u64 perf_get_hw_event_config(int hw_event) { int max = x86_pmu.max_events; if (hw_event < max) return x86_pmu.event_map(array_index_nospec(hw_event, max)); return 0; } EXPORT_SYMBOL_FOR_KVM(perf_get_hw_event_config); |
| 93 570 571 5 569 28 8 567 568 31 31 31 31 30 31 271 16 257 258 259 62 62 62 62 62 62 62 271 273 272 273 271 62 272 17 271 268 64 709 64 64 63 64 59 60 60 60 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 | // SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) "%s: " fmt, __func__ #include <linux/kernel.h> #include <linux/sched.h> #include <linux/wait.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/percpu-refcount.h> /* * Initially, a percpu refcount is just a set of percpu counters. Initially, we * don't try to detect the ref hitting 0 - which means that get/put can just * increment or decrement the local counter. Note that the counter on a * particular cpu can (and will) wrap - this is fine, when we go to shutdown the * percpu counters will all sum to the correct value * * (More precisely: because modular arithmetic is commutative the sum of all the * percpu_count vars will be equal to what it would have been if all the gets * and puts were done to a single integer, even if some of the percpu integers * overflow or underflow). * * The real trick to implementing percpu refcounts is shutdown. We can't detect * the ref hitting 0 on every put - this would require global synchronization * and defeat the whole purpose of using percpu refs. * * What we do is require the user to keep track of the initial refcount; we know * the ref can't hit 0 before the user drops the initial ref, so as long as we * convert to non percpu mode before the initial ref is dropped everything * works. * * Converting to non percpu mode is done with some RCUish stuff in * percpu_ref_kill. Additionally, we need a bias value so that the * atomic_long_t can't hit 0 before we've added up all the percpu refs. */ #define PERCPU_COUNT_BIAS (1LU << (BITS_PER_LONG - 1)) static DEFINE_SPINLOCK(percpu_ref_switch_lock); static DECLARE_WAIT_QUEUE_HEAD(percpu_ref_switch_waitq); static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref) { return (unsigned long __percpu *) (ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC_DEAD); } /** * percpu_ref_init - initialize a percpu refcount * @ref: percpu_ref to initialize * @release: function which will be called when refcount hits 0 * @flags: PERCPU_REF_INIT_* flags * @gfp: allocation mask to use * * Initializes @ref. @ref starts out in percpu mode with a refcount of 1 unless * @flags contains PERCPU_REF_INIT_ATOMIC or PERCPU_REF_INIT_DEAD. These flags * change the start state to atomic with the latter setting the initial refcount * to 0. See the definitions of PERCPU_REF_INIT_* flags for flag behaviors. * * Note that @release must not sleep - it may potentially be called from RCU * callback context by percpu_ref_kill(). */ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, unsigned int flags, gfp_t gfp) { size_t align = max_t(size_t, 1 << __PERCPU_REF_FLAG_BITS, __alignof__(unsigned long)); unsigned long start_count = 0; struct percpu_ref_data *data; ref->percpu_count_ptr = (unsigned long) __alloc_percpu_gfp(sizeof(unsigned long), align, gfp); if (!ref->percpu_count_ptr) return -ENOMEM; data = kzalloc_obj(*ref->data, gfp); if (!data) { free_percpu((void __percpu *)ref->percpu_count_ptr); ref->percpu_count_ptr = 0; return -ENOMEM; } data->force_atomic = flags & PERCPU_REF_INIT_ATOMIC; data->allow_reinit = flags & PERCPU_REF_ALLOW_REINIT; if (flags & (PERCPU_REF_INIT_ATOMIC | PERCPU_REF_INIT_DEAD)) { ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; data->allow_reinit = true; } else { start_count += PERCPU_COUNT_BIAS; } if (flags & PERCPU_REF_INIT_DEAD) ref->percpu_count_ptr |= __PERCPU_REF_DEAD; else start_count++; atomic_long_set(&data->count, start_count); data->release = release; data->confirm_switch = NULL; data->ref = ref; ref->data = data; return 0; } EXPORT_SYMBOL_GPL(percpu_ref_init); static void __percpu_ref_exit(struct percpu_ref *ref) { unsigned long __percpu *percpu_count = percpu_count_ptr(ref); if (percpu_count) { /* non-NULL confirm_switch indicates switching in progress */ WARN_ON_ONCE(ref->data && ref->data->confirm_switch); free_percpu(percpu_count); ref->percpu_count_ptr = __PERCPU_REF_ATOMIC_DEAD; } } /** * percpu_ref_exit - undo percpu_ref_init() * @ref: percpu_ref to exit * * This function exits @ref. The caller is responsible for ensuring that * @ref is no longer in active use. The usual places to invoke this * function from are the @ref->release() callback or in init failure path * where percpu_ref_init() succeeded but other parts of the initialization * of the embedding object failed. */ void percpu_ref_exit(struct percpu_ref *ref) { struct percpu_ref_data *data = ref->data; unsigned long flags; __percpu_ref_exit(ref); if (!data) return; spin_lock_irqsave(&percpu_ref_switch_lock, flags); ref->percpu_count_ptr |= atomic_long_read(&ref->data->count) << __PERCPU_REF_FLAG_BITS; ref->data = NULL; spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); kfree(data); } EXPORT_SYMBOL_GPL(percpu_ref_exit); static void percpu_ref_call_confirm_rcu(struct rcu_head *rcu) { struct percpu_ref_data *data = container_of(rcu, struct percpu_ref_data, rcu); struct percpu_ref *ref = data->ref; data->confirm_switch(ref); data->confirm_switch = NULL; wake_up_all(&percpu_ref_switch_waitq); if (!data->allow_reinit) __percpu_ref_exit(ref); /* drop ref from percpu_ref_switch_to_atomic() */ percpu_ref_put(ref); } static void percpu_ref_switch_to_atomic_rcu(struct rcu_head *rcu) { struct percpu_ref_data *data = container_of(rcu, struct percpu_ref_data, rcu); struct percpu_ref *ref = data->ref; unsigned long __percpu *percpu_count = percpu_count_ptr(ref); static atomic_t underflows; unsigned long count = 0; int cpu; for_each_possible_cpu(cpu) count += *per_cpu_ptr(percpu_count, cpu); pr_debug("global %lu percpu %lu\n", atomic_long_read(&data->count), count); /* * It's crucial that we sum the percpu counters _before_ adding the sum * to &ref->count; since gets could be happening on one cpu while puts * happen on another, adding a single cpu's count could cause * @ref->count to hit 0 before we've got a consistent value - but the * sum of all the counts will be consistent and correct. * * Subtracting the bias value then has to happen _after_ adding count to * &ref->count; we need the bias value to prevent &ref->count from * reaching 0 before we add the percpu counts. But doing it at the same * time is equivalent and saves us atomic operations: */ atomic_long_add((long)count - PERCPU_COUNT_BIAS, &data->count); if (WARN_ONCE(atomic_long_read(&data->count) <= 0, "percpu ref (%ps) <= 0 (%ld) after switching to atomic", data->release, atomic_long_read(&data->count)) && atomic_inc_return(&underflows) < 4) { pr_err("%s(): percpu_ref underflow", __func__); mem_dump_obj(data); } /* @ref is viewed as dead on all CPUs, send out switch confirmation */ percpu_ref_call_confirm_rcu(rcu); } static void percpu_ref_noop_confirm_switch(struct percpu_ref *ref) { } static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch) { if (ref->percpu_count_ptr & __PERCPU_REF_ATOMIC) { if (confirm_switch) confirm_switch(ref); return; } /* switching from percpu to atomic */ ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; /* * Non-NULL ->confirm_switch is used to indicate that switching is * in progress. Use noop one if unspecified. */ ref->data->confirm_switch = confirm_switch ?: percpu_ref_noop_confirm_switch; percpu_ref_get(ref); /* put after confirmation */ call_rcu_hurry(&ref->data->rcu, percpu_ref_switch_to_atomic_rcu); } static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) { unsigned long __percpu *percpu_count = percpu_count_ptr(ref); int cpu; BUG_ON(!percpu_count); if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) return; if (WARN_ON_ONCE(!ref->data->allow_reinit)) return; atomic_long_add(PERCPU_COUNT_BIAS, &ref->data->count); /* * Restore per-cpu operation. smp_store_release() is paired * with READ_ONCE() in __ref_is_percpu() and guarantees that the * zeroing is visible to all percpu accesses which can see the * following __PERCPU_REF_ATOMIC clearing. */ for_each_possible_cpu(cpu) *per_cpu_ptr(percpu_count, cpu) = 0; smp_store_release(&ref->percpu_count_ptr, ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC); } static void __percpu_ref_switch_mode(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch) { struct percpu_ref_data *data = ref->data; lockdep_assert_held(&percpu_ref_switch_lock); /* * If the previous ATOMIC switching hasn't finished yet, wait for * its completion. If the caller ensures that ATOMIC switching * isn't in progress, this function can be called from any context. */ wait_event_lock_irq(percpu_ref_switch_waitq, !data->confirm_switch, percpu_ref_switch_lock); if (data->force_atomic || percpu_ref_is_dying(ref)) __percpu_ref_switch_to_atomic(ref, confirm_switch); else __percpu_ref_switch_to_percpu(ref); } /** * percpu_ref_switch_to_atomic - switch a percpu_ref to atomic mode * @ref: percpu_ref to switch to atomic mode * @confirm_switch: optional confirmation callback * * There's no reason to use this function for the usual reference counting. * Use percpu_ref_kill[_and_confirm](). * * Schedule switching of @ref to atomic mode. All its percpu counts will * be collected to the main atomic counter. On completion, when all CPUs * are guaraneed to be in atomic mode, @confirm_switch, which may not * block, is invoked. This function may be invoked concurrently with all * the get/put operations and can safely be mixed with kill and reinit * operations. Note that @ref will stay in atomic mode across kill/reinit * cycles until percpu_ref_switch_to_percpu() is called. * * This function may block if @ref is in the process of switching to atomic * mode. If the caller ensures that @ref is not in the process of * switching to atomic mode, this function can be called from any context. */ void percpu_ref_switch_to_atomic(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch) { unsigned long flags; spin_lock_irqsave(&percpu_ref_switch_lock, flags); ref->data->force_atomic = true; __percpu_ref_switch_mode(ref, confirm_switch); spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic); /** * percpu_ref_switch_to_atomic_sync - switch a percpu_ref to atomic mode * @ref: percpu_ref to switch to atomic mode * * Schedule switching the ref to atomic mode, and wait for the * switch to complete. Caller must ensure that no other thread * will switch back to percpu mode. */ void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref) { percpu_ref_switch_to_atomic(ref, NULL); wait_event(percpu_ref_switch_waitq, !ref->data->confirm_switch); } EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic_sync); /** * percpu_ref_switch_to_percpu - switch a percpu_ref to percpu mode * @ref: percpu_ref to switch to percpu mode * * There's no reason to use this function for the usual reference counting. * To re-use an expired ref, use percpu_ref_reinit(). * * Switch @ref to percpu mode. This function may be invoked concurrently * with all the get/put operations and can safely be mixed with kill and * reinit operations. This function reverses the sticky atomic state set * by PERCPU_REF_INIT_ATOMIC or percpu_ref_switch_to_atomic(). If @ref is * dying or dead, the actual switching takes place on the following * percpu_ref_reinit(). * * This function may block if @ref is in the process of switching to atomic * mode. If the caller ensures that @ref is not in the process of * switching to atomic mode, this function can be called from any context. */ void percpu_ref_switch_to_percpu(struct percpu_ref *ref) { unsigned long flags; spin_lock_irqsave(&percpu_ref_switch_lock, flags); ref->data->force_atomic = false; __percpu_ref_switch_mode(ref, NULL); spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } EXPORT_SYMBOL_GPL(percpu_ref_switch_to_percpu); /** * percpu_ref_kill_and_confirm - drop the initial ref and schedule confirmation * @ref: percpu_ref to kill * @confirm_kill: optional confirmation callback * * Equivalent to percpu_ref_kill() but also schedules kill confirmation if * @confirm_kill is not NULL. @confirm_kill, which may not block, will be * called after @ref is seen as dead from all CPUs at which point all * further invocations of percpu_ref_tryget_live() will fail. See * percpu_ref_tryget_live() for details. * * This function normally doesn't block and can be called from any context * but it may block if @confirm_kill is specified and @ref is in the * process of switching to atomic mode by percpu_ref_switch_to_atomic(). * * There are no implied RCU grace periods between kill and release. */ void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill) { unsigned long flags; spin_lock_irqsave(&percpu_ref_switch_lock, flags); WARN_ONCE(percpu_ref_is_dying(ref), "%s called more than once on %ps!", __func__, ref->data->release); ref->percpu_count_ptr |= __PERCPU_REF_DEAD; __percpu_ref_switch_mode(ref, confirm_kill); percpu_ref_put(ref); spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); /** * percpu_ref_is_zero - test whether a percpu refcount reached zero * @ref: percpu_ref to test * * Returns %true if @ref reached zero. * * This function is safe to call as long as @ref is between init and exit. */ bool percpu_ref_is_zero(struct percpu_ref *ref) { unsigned long __percpu *percpu_count; unsigned long count, flags; if (__ref_is_percpu(ref, &percpu_count)) return false; /* protect us from being destroyed */ spin_lock_irqsave(&percpu_ref_switch_lock, flags); if (ref->data) count = atomic_long_read(&ref->data->count); else count = ref->percpu_count_ptr >> __PERCPU_REF_FLAG_BITS; spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); return count == 0; } EXPORT_SYMBOL_GPL(percpu_ref_is_zero); /** * percpu_ref_reinit - re-initialize a percpu refcount * @ref: perpcu_ref to re-initialize * * Re-initialize @ref so that it's in the same state as when it finished * percpu_ref_init() ignoring %PERCPU_REF_INIT_DEAD. @ref must have been * initialized successfully and reached 0 but not exited. * * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while * this function is in progress. */ void percpu_ref_reinit(struct percpu_ref *ref) { WARN_ON_ONCE(!percpu_ref_is_zero(ref)); percpu_ref_resurrect(ref); } EXPORT_SYMBOL_GPL(percpu_ref_reinit); /** * percpu_ref_resurrect - modify a percpu refcount from dead to live * @ref: perpcu_ref to resurrect * * Modify @ref so that it's in the same state as before percpu_ref_kill() was * called. @ref must be dead but must not yet have exited. * * If @ref->release() frees @ref then the caller is responsible for * guaranteeing that @ref->release() does not get called while this * function is in progress. * * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while * this function is in progress. */ void percpu_ref_resurrect(struct percpu_ref *ref) { unsigned long __percpu *percpu_count; unsigned long flags; spin_lock_irqsave(&percpu_ref_switch_lock, flags); WARN_ON_ONCE(!percpu_ref_is_dying(ref)); WARN_ON_ONCE(__ref_is_percpu(ref, &percpu_count)); ref->percpu_count_ptr &= ~__PERCPU_REF_DEAD; percpu_ref_get(ref); __percpu_ref_switch_mode(ref, NULL); spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } EXPORT_SYMBOL_GPL(percpu_ref_resurrect); |
| 1287 1600 1600 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM csd #if !defined(_TRACE_CSD_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_CSD_H #include <linux/tracepoint.h> TRACE_EVENT(csd_queue_cpu, TP_PROTO(const unsigned int cpu, unsigned long callsite, smp_call_func_t func, call_single_data_t *csd), TP_ARGS(cpu, callsite, func, csd), TP_STRUCT__entry( __field(unsigned int, cpu) __field(void *, callsite) __field(void *, func) __field(void *, csd) ), TP_fast_assign( __entry->cpu = cpu; __entry->callsite = (void *)callsite; __entry->func = func; __entry->csd = csd; ), TP_printk("cpu=%u callsite=%pS func=%ps csd=%p", __entry->cpu, __entry->callsite, __entry->func, __entry->csd) ); /* * Tracepoints for a function which is called as an effect of smp_call_function.* */ DECLARE_EVENT_CLASS(csd_function, TP_PROTO(smp_call_func_t func, call_single_data_t *csd), TP_ARGS(func, csd), TP_STRUCT__entry( __field(void *, func) __field(void *, csd) ), TP_fast_assign( __entry->func = func; __entry->csd = csd; ), TP_printk("func=%ps, csd=%p", __entry->func, __entry->csd) ); DEFINE_EVENT(csd_function, csd_function_entry, TP_PROTO(smp_call_func_t func, call_single_data_t *csd), TP_ARGS(func, csd) ); DEFINE_EVENT(csd_function, csd_function_exit, TP_PROTO(smp_call_func_t func, call_single_data_t *csd), TP_ARGS(func, csd) ); #endif /* _TRACE_CSD_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
| 268 1 152 333 181 182 1 151 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 | /* SPDX-License-Identifier: GPL-2.0 */ /* zpdesc.h: zsmalloc pool memory descriptor * * Written by Alex Shi <alexs@kernel.org> * Hyeonggon Yoo <42.hyeyoo@gmail.com> */ #ifndef __MM_ZPDESC_H__ #define __MM_ZPDESC_H__ #include <linux/migrate.h> #include <linux/pagemap.h> /* * struct zpdesc - Memory descriptor for zsmalloc pool memory. * @flags: Page flags, mostly unused by zsmalloc. * @lru: Indirectly used by page migration. * @movable_ops: Used by page migration. * @next: Next zpdesc in a zspage in zsmalloc pool. * @handle: For huge zspage in zsmalloc pool. * @zspage: Points to the zspage this zpdesc is a part of. * @first_obj_offset: First object offset in zsmalloc pool. * @_refcount: The number of references to this zpdesc. * * This struct overlays struct page for now. Do not modify without a good * understanding of the issues. In particular, do not expand into the overlap * with memcg_data. * * Page flags used: * * PG_private identifies the first component page. * * PG_locked is used by page migration code. */ struct zpdesc { unsigned long flags; struct list_head lru; unsigned long movable_ops; union { struct zpdesc *next; unsigned long handle; }; struct zspage *zspage; /* * Only the lower 24 bits are available for offset, limiting a page * to 16 MiB. The upper 8 bits are reserved for PGTY_zsmalloc. * * Do not access this field directly. * Instead, use {get,set}_first_obj_offset() helpers. */ unsigned int first_obj_offset; atomic_t _refcount; }; #define ZPDESC_MATCH(pg, zp) \ static_assert(offsetof(struct page, pg) == offsetof(struct zpdesc, zp)) ZPDESC_MATCH(flags, flags); ZPDESC_MATCH(lru, lru); ZPDESC_MATCH(mapping, movable_ops); ZPDESC_MATCH(__folio_index, next); ZPDESC_MATCH(__folio_index, handle); ZPDESC_MATCH(private, zspage); ZPDESC_MATCH(page_type, first_obj_offset); ZPDESC_MATCH(_refcount, _refcount); #undef ZPDESC_MATCH static_assert(sizeof(struct zpdesc) <= sizeof(struct page)); /* * zpdesc_page - The first struct page allocated for a zpdesc * @zp: The zpdesc. * * A convenience wrapper for converting zpdesc to the first struct page of the * underlying folio, to communicate with code not yet converted to folio or * struct zpdesc. * */ #define zpdesc_page(zp) (_Generic((zp), \ const struct zpdesc *: (const struct page *)(zp), \ struct zpdesc *: (struct page *)(zp))) /** * zpdesc_folio - The folio allocated for a zpdesc * @zp: The zpdesc. * * Zpdescs are descriptors for zsmalloc memory. The memory itself is allocated * as folios that contain the zsmalloc objects, and zpdesc uses specific * fields in the first struct page of the folio - those fields are now accessed * by struct zpdesc. * * It is occasionally necessary convert to back to a folio in order to * communicate with the rest of the mm. Please use this helper function * instead of casting yourself, as the implementation may change in the future. */ #define zpdesc_folio(zp) (_Generic((zp), \ const struct zpdesc *: (const struct folio *)(zp), \ struct zpdesc *: (struct folio *)(zp))) /** * page_zpdesc - Converts from first struct page to zpdesc. * @p: The first (either head of compound or single) page of zpdesc. * * A temporary wrapper to convert struct page to struct zpdesc in situations * where we know the page is the compound head, or single order-0 page. * * Long-term ideally everything would work with struct zpdesc directly or go * through folio to struct zpdesc. * * Return: The zpdesc which contains this page */ #define page_zpdesc(p) (_Generic((p), \ const struct page *: (const struct zpdesc *)(p), \ struct page *: (struct zpdesc *)(p))) static inline void zpdesc_lock(struct zpdesc *zpdesc) { folio_lock(zpdesc_folio(zpdesc)); } static inline bool zpdesc_trylock(struct zpdesc *zpdesc) { return folio_trylock(zpdesc_folio(zpdesc)); } static inline void zpdesc_unlock(struct zpdesc *zpdesc) { folio_unlock(zpdesc_folio(zpdesc)); } static inline void zpdesc_wait_locked(struct zpdesc *zpdesc) { folio_wait_locked(zpdesc_folio(zpdesc)); } static inline void zpdesc_get(struct zpdesc *zpdesc) { folio_get(zpdesc_folio(zpdesc)); } static inline void zpdesc_put(struct zpdesc *zpdesc) { folio_put(zpdesc_folio(zpdesc)); } static inline void *kmap_local_zpdesc(struct zpdesc *zpdesc) { return kmap_local_page(zpdesc_page(zpdesc)); } static inline unsigned long zpdesc_pfn(struct zpdesc *zpdesc) { return page_to_pfn(zpdesc_page(zpdesc)); } static inline struct zpdesc *pfn_zpdesc(unsigned long pfn) { return page_zpdesc(pfn_to_page(pfn)); } static inline void __zpdesc_set_movable(struct zpdesc *zpdesc) { SetPageMovableOps(zpdesc_page(zpdesc)); } static inline void __zpdesc_set_zsmalloc(struct zpdesc *zpdesc) { __SetPageZsmalloc(zpdesc_page(zpdesc)); } static inline struct zone *zpdesc_zone(struct zpdesc *zpdesc) { return page_zone(zpdesc_page(zpdesc)); } static inline bool zpdesc_is_locked(struct zpdesc *zpdesc) { return folio_test_locked(zpdesc_folio(zpdesc)); } #endif |
| 1 1 1 1 1 1 1 1 1 5 1 3 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 | // SPDX-License-Identifier: GPL-2.0-or-later /* * The Virtual DVB test driver serves as a reference DVB driver and helps * validate the existing APIs in the media subsystem. It can also aid * developers working on userspace applications. * * Copyright (C) 2020 Daniel W. S. Almeida * Based on the example driver written by Emard <emard@softhome.net> */ #include <linux/errno.h> #include <linux/i2c.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/printk.h> #include <linux/random.h> #include <linux/ratelimit.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/workqueue.h> #include <media/dvb_frontend.h> #include "vidtv_demod.h" #define POLL_THRD_TIME 2000 /* ms */ static const struct vidtv_demod_cnr_to_qual_s vidtv_demod_c_cnr_2_qual[] = { /* from libdvbv5 source code, in milli db */ { QAM_256, FEC_NONE, 34000, 38000}, { QAM_64, FEC_NONE, 30000, 34000}, }; static const struct vidtv_demod_cnr_to_qual_s vidtv_demod_s_cnr_2_qual[] = { /* from libdvbv5 source code, in milli db */ { QPSK, FEC_1_2, 7000, 10000}, { QPSK, FEC_2_3, 9000, 12000}, { QPSK, FEC_3_4, 10000, 13000}, { QPSK, FEC_5_6, 11000, 14000}, { QPSK, FEC_7_8, 12000, 15000}, }; static const struct vidtv_demod_cnr_to_qual_s vidtv_demod_s2_cnr_2_qual[] = { /* from libdvbv5 source code, in milli db */ { QPSK, FEC_1_2, 9000, 12000}, { QPSK, FEC_2_3, 11000, 14000}, { QPSK, FEC_3_4, 12000, 15000}, { QPSK, FEC_5_6, 12000, 15000}, { QPSK, FEC_8_9, 13000, 16000}, { QPSK, FEC_9_10, 13500, 16500}, { PSK_8, FEC_2_3, 14500, 17500}, { PSK_8, FEC_3_4, 16000, 19000}, { PSK_8, FEC_5_6, 17500, 20500}, { PSK_8, FEC_8_9, 19000, 22000}, }; static const struct vidtv_demod_cnr_to_qual_s vidtv_demod_t_cnr_2_qual[] = { /* from libdvbv5 source code, in milli db*/ { QPSK, FEC_1_2, 4100, 5900}, { QPSK, FEC_2_3, 6100, 9600}, { QPSK, FEC_3_4, 7200, 12400}, { QPSK, FEC_5_6, 8500, 15600}, { QPSK, FEC_7_8, 9200, 17500}, { QAM_16, FEC_1_2, 9800, 11800}, { QAM_16, FEC_2_3, 12100, 15300}, { QAM_16, FEC_3_4, 13400, 18100}, { QAM_16, FEC_5_6, 14800, 21300}, { QAM_16, FEC_7_8, 15700, 23600}, { QAM_64, FEC_1_2, 14000, 16000}, { QAM_64, FEC_2_3, 19900, 25400}, { QAM_64, FEC_3_4, 24900, 27900}, { QAM_64, FEC_5_6, 21300, 23300}, { QAM_64, FEC_7_8, 22000, 24000}, }; static const struct vidtv_demod_cnr_to_qual_s *vidtv_match_cnr_s(struct dvb_frontend *fe) { const struct vidtv_demod_cnr_to_qual_s *cnr2qual = NULL; struct device *dev = fe->dvb->device; struct dtv_frontend_properties *c; u32 array_size = 0; u32 i; c = &fe->dtv_property_cache; switch (c->delivery_system) { case SYS_DVBT: case SYS_DVBT2: cnr2qual = vidtv_demod_t_cnr_2_qual; array_size = ARRAY_SIZE(vidtv_demod_t_cnr_2_qual); break; case SYS_DVBS: cnr2qual = vidtv_demod_s_cnr_2_qual; array_size = ARRAY_SIZE(vidtv_demod_s_cnr_2_qual); break; case SYS_DVBS2: cnr2qual = vidtv_demod_s2_cnr_2_qual; array_size = ARRAY_SIZE(vidtv_demod_s2_cnr_2_qual); break; case SYS_DVBC_ANNEX_A: cnr2qual = vidtv_demod_c_cnr_2_qual; array_size = ARRAY_SIZE(vidtv_demod_c_cnr_2_qual); break; default: dev_warn_ratelimited(dev, "%s: unsupported delivery system: %u\n", __func__, c->delivery_system); break; } for (i = 0; i < array_size; i++) if (cnr2qual[i].modulation == c->modulation && cnr2qual[i].fec == c->fec_inner) return &cnr2qual[i]; return NULL; /* not found */ } static void vidtv_clean_stats(struct dvb_frontend *fe) { struct dtv_frontend_properties *c = &fe->dtv_property_cache; /* Fill the length of each status counter */ /* Signal is always available */ c->strength.len = 1; c->strength.stat[0].scale = FE_SCALE_DECIBEL; c->strength.stat[0].svalue = 0; /* Usually available only after Viterbi lock */ c->cnr.stat[0].scale = FE_SCALE_NOT_AVAILABLE; c->cnr.stat[0].svalue = 0; c->cnr.len = 1; /* Those depends on full lock */ c->pre_bit_error.stat[0].scale = FE_SCALE_NOT_AVAILABLE; c->pre_bit_error.stat[0].uvalue = 0; c->pre_bit_error.len = 1; c->pre_bit_count.stat[0].scale = FE_SCALE_NOT_AVAILABLE; c->pre_bit_count.stat[0].uvalue = 0; c->pre_bit_count.len = 1; c->post_bit_error.stat[0].scale = FE_SCALE_NOT_AVAILABLE; c->post_bit_error.stat[0].uvalue = 0; c->post_bit_error.len = 1; c->post_bit_count.stat[0].scale = FE_SCALE_NOT_AVAILABLE; c->post_bit_count.stat[0].uvalue = 0; c->post_bit_count.len = 1; c->block_error.stat[0].scale = FE_SCALE_NOT_AVAILABLE; c->block_error.stat[0].uvalue = 0; c->block_error.len = 1; c->block_count.stat[0].scale = FE_SCALE_NOT_AVAILABLE; c->block_count.stat[0].uvalue = 0; c->block_count.len = 1; } static void vidtv_demod_update_stats(struct dvb_frontend *fe) { struct dtv_frontend_properties *c = &fe->dtv_property_cache; struct vidtv_demod_state *state = fe->demodulator_priv; u32 scale; if (state->status & FE_HAS_LOCK) { scale = FE_SCALE_COUNTER; c->cnr.stat[0].scale = FE_SCALE_DECIBEL; } else { scale = FE_SCALE_NOT_AVAILABLE; c->cnr.stat[0].scale = scale; } c->pre_bit_error.stat[0].scale = scale; c->pre_bit_count.stat[0].scale = scale; c->post_bit_error.stat[0].scale = scale; c->post_bit_count.stat[0].scale = scale; c->block_error.stat[0].scale = scale; c->block_count.stat[0].scale = scale; /* * Add a 0.5% of randomness at the signal strength and CNR, * and make them different, as we want to have something closer * to a real case scenario. * * Also, usually, signal strength is a negative number in dBm. */ c->strength.stat[0].svalue = state->tuner_cnr; c->strength.stat[0].svalue -= get_random_u32_below(state->tuner_cnr / 50); c->strength.stat[0].svalue -= 68000; /* Adjust to a better range */ c->cnr.stat[0].svalue = state->tuner_cnr; c->cnr.stat[0].svalue -= get_random_u32_below(state->tuner_cnr / 50); } static int vidtv_demod_read_status(struct dvb_frontend *fe, enum fe_status *status) { struct vidtv_demod_state *state = fe->demodulator_priv; const struct vidtv_demod_cnr_to_qual_s *cnr2qual = NULL; struct vidtv_demod_config *config = &state->config; u16 snr = 0; /* Simulate random lost of signal due to a bad-tuned channel */ cnr2qual = vidtv_match_cnr_s(&state->frontend); if (cnr2qual && state->tuner_cnr < cnr2qual->cnr_good && state->frontend.ops.tuner_ops.get_rf_strength) { state->frontend.ops.tuner_ops.get_rf_strength(&state->frontend, &snr); if (snr < cnr2qual->cnr_ok) { /* eventually lose the TS lock */ if (get_random_u32_below(100) < config->drop_tslock_prob_on_low_snr) state->status = 0; } else { /* recover if the signal improves */ if (get_random_u32_below(100) < config->recover_tslock_prob_on_good_snr) state->status = FE_HAS_SIGNAL | FE_HAS_CARRIER | FE_HAS_VITERBI | FE_HAS_SYNC | FE_HAS_LOCK; } } vidtv_demod_update_stats(&state->frontend); *status = state->status; return 0; } static int vidtv_demod_read_signal_strength(struct dvb_frontend *fe, u16 *strength) { struct dtv_frontend_properties *c = &fe->dtv_property_cache; *strength = c->strength.stat[0].uvalue; return 0; } /* * NOTE: * This is implemented here just to be used as an example for real * demod drivers. * * Should only be implemented if it actually reads something from the hardware. * Also, it should check for the locks, in order to avoid report wrong data * to userspace. */ static int vidtv_demod_get_frontend(struct dvb_frontend *fe, struct dtv_frontend_properties *p) { return 0; } static int vidtv_demod_set_frontend(struct dvb_frontend *fe) { struct vidtv_demod_state *state = fe->demodulator_priv; u32 tuner_status = 0; int ret; if (!fe->ops.tuner_ops.set_params) return 0; fe->ops.tuner_ops.set_params(fe); /* store the CNR returned by the tuner */ ret = fe->ops.tuner_ops.get_rf_strength(fe, &state->tuner_cnr); if (ret < 0) return ret; fe->ops.tuner_ops.get_status(fe, &tuner_status); state->status = (state->tuner_cnr > 0) ? FE_HAS_SIGNAL | FE_HAS_CARRIER | FE_HAS_VITERBI | FE_HAS_SYNC | FE_HAS_LOCK : 0; vidtv_demod_update_stats(fe); if (fe->ops.i2c_gate_ctrl) fe->ops.i2c_gate_ctrl(fe, 0); return 0; } /* * NOTE: * This is implemented here just to be used as an example for real * demod drivers. * * Should only be implemented if the demod has support for DVB-S or DVB-S2 */ static int vidtv_demod_set_tone(struct dvb_frontend *fe, enum fe_sec_tone_mode tone) { return 0; } /* * NOTE: * This is implemented here just to be used as an example for real * demod drivers. * * Should only be implemented if the demod has support for DVB-S or DVB-S2 */ static int vidtv_demod_set_voltage(struct dvb_frontend *fe, enum fe_sec_voltage voltage) { return 0; } /* * NOTE: * This is implemented here just to be used as an example for real * demod drivers. * * Should only be implemented if the demod has support for DVB-S or DVB-S2 */ static int vidtv_send_diseqc_msg(struct dvb_frontend *fe, struct dvb_diseqc_master_cmd *cmd) { return 0; } /* * NOTE: * This is implemented here just to be used as an example for real * demod drivers. * * Should only be implemented if the demod has support for DVB-S or DVB-S2 */ static int vidtv_diseqc_send_burst(struct dvb_frontend *fe, enum fe_sec_mini_cmd burst) { return 0; } static void vidtv_demod_release(struct dvb_frontend *fe) { struct vidtv_demod_state *state = fe->demodulator_priv; kfree(state); } static const struct dvb_frontend_ops vidtv_demod_ops = { .delsys = { SYS_DVBT, SYS_DVBT2, SYS_DVBC_ANNEX_A, SYS_DVBS, SYS_DVBS2, }, .info = { .name = "Dummy demod for DVB-T/T2/C/S/S2", .frequency_min_hz = 51 * MHz, .frequency_max_hz = 2150 * MHz, .frequency_stepsize_hz = 62500, .frequency_tolerance_hz = 29500 * kHz, .symbol_rate_min = 1000000, .symbol_rate_max = 45000000, .caps = FE_CAN_FEC_1_2 | FE_CAN_FEC_2_3 | FE_CAN_FEC_3_4 | FE_CAN_FEC_4_5 | FE_CAN_FEC_5_6 | FE_CAN_FEC_6_7 | FE_CAN_FEC_7_8 | FE_CAN_FEC_8_9 | FE_CAN_QAM_16 | FE_CAN_QAM_64 | FE_CAN_QAM_32 | FE_CAN_QAM_128 | FE_CAN_QAM_256 | FE_CAN_QAM_AUTO | FE_CAN_QPSK | FE_CAN_FEC_AUTO | FE_CAN_INVERSION_AUTO | FE_CAN_TRANSMISSION_MODE_AUTO | FE_CAN_GUARD_INTERVAL_AUTO | FE_CAN_HIERARCHY_AUTO, }, .release = vidtv_demod_release, .set_frontend = vidtv_demod_set_frontend, .get_frontend = vidtv_demod_get_frontend, .read_status = vidtv_demod_read_status, .read_signal_strength = vidtv_demod_read_signal_strength, /* For DVB-S/S2 */ .set_voltage = vidtv_demod_set_voltage, .set_tone = vidtv_demod_set_tone, .diseqc_send_master_cmd = vidtv_send_diseqc_msg, .diseqc_send_burst = vidtv_diseqc_send_burst, }; static const struct i2c_device_id vidtv_demod_i2c_id_table[] = { { "dvb_vidtv_demod" }, {} }; MODULE_DEVICE_TABLE(i2c, vidtv_demod_i2c_id_table); static int vidtv_demod_i2c_probe(struct i2c_client *client) { struct vidtv_tuner_config *config = client->dev.platform_data; struct vidtv_demod_state *state; /* allocate memory for the internal state */ state = kzalloc_obj(*state); if (!state) return -ENOMEM; /* create dvb_frontend */ memcpy(&state->frontend.ops, &vidtv_demod_ops, sizeof(struct dvb_frontend_ops)); memcpy(&state->config, config, sizeof(state->config)); state->frontend.demodulator_priv = state; i2c_set_clientdata(client, state); vidtv_clean_stats(&state->frontend); return 0; } static void vidtv_demod_i2c_remove(struct i2c_client *client) { struct vidtv_demod_state *state = i2c_get_clientdata(client); kfree(state); } static struct i2c_driver vidtv_demod_i2c_driver = { .driver = { .name = "dvb_vidtv_demod", .suppress_bind_attrs = true, }, .probe = vidtv_demod_i2c_probe, .remove = vidtv_demod_i2c_remove, .id_table = vidtv_demod_i2c_id_table, }; module_i2c_driver(vidtv_demod_i2c_driver); MODULE_DESCRIPTION("Virtual DVB Demodulator Driver"); MODULE_AUTHOR("Daniel W. S. Almeida"); MODULE_LICENSE("GPL"); |
| 3 2 2 1 2 3 1 1 1 1 3 3 2 2 1 2 3 1 1 1 1 2 2 2 2 440 440 440 440 440 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 | // SPDX-License-Identifier: GPL-2.0 /* * NFS exporting and validation. * * We maintain a list of clients, each of which has a list of * exports. To export an fs to a given client, you first have * to create the client entry with NFSCTL_ADDCLIENT, which * creates a client control block and adds it to the hash * table. Then, you call NFSCTL_EXPORT for each fs. * * * Copyright (C) 1995, 1996 Olaf Kirch, <okir@monad.swb.de> */ #include <linux/slab.h> #include <linux/namei.h> #include <linux/module.h> #include <linux/exportfs.h> #include <linux/sunrpc/svc_xprt.h> #include "nfsd.h" #include "nfsfh.h" #include "netns.h" #include "pnfs.h" #include "filecache.h" #include "trace.h" #define NFSDDBG_FACILITY NFSDDBG_EXPORT /* * We have two caches. * One maps client+vfsmnt+dentry to export options - the export map * The other maps client+filehandle-fragment to export options. - the expkey map * * The export options are actually stored in the first map, and the * second map contains a reference to the entry in the first map. */ static struct workqueue_struct *nfsd_export_wq; #define EXPKEY_HASHBITS 8 #define EXPKEY_HASHMAX (1 << EXPKEY_HASHBITS) #define EXPKEY_HASHMASK (EXPKEY_HASHMAX -1) static void expkey_release(struct work_struct *work) { struct svc_expkey *key = container_of(to_rcu_work(work), struct svc_expkey, ek_rwork); if (test_bit(CACHE_VALID, &key->h.flags) && !test_bit(CACHE_NEGATIVE, &key->h.flags)) path_put(&key->ek_path); auth_domain_put(key->ek_client); kfree(key); } static void expkey_put(struct kref *ref) { struct svc_expkey *key = container_of(ref, struct svc_expkey, h.ref); INIT_RCU_WORK(&key->ek_rwork, expkey_release); queue_rcu_work(nfsd_export_wq, &key->ek_rwork); } static int expkey_upcall(struct cache_detail *cd, struct cache_head *h) { return sunrpc_cache_pipe_upcall(cd, h); } static void expkey_request(struct cache_detail *cd, struct cache_head *h, char **bpp, int *blen) { /* client fsidtype \xfsid */ struct svc_expkey *ek = container_of(h, struct svc_expkey, h); char type[5]; qword_add(bpp, blen, ek->ek_client->name); snprintf(type, 5, "%d", ek->ek_fsidtype); qword_add(bpp, blen, type); qword_addhex(bpp, blen, (char*)ek->ek_fsid, key_len(ek->ek_fsidtype)); (*bpp)[-1] = '\n'; } static struct svc_expkey *svc_expkey_update(struct cache_detail *cd, struct svc_expkey *new, struct svc_expkey *old); static struct svc_expkey *svc_expkey_lookup(struct cache_detail *cd, struct svc_expkey *); static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) { /* client fsidtype fsid expiry [path] */ char *buf; int len; struct auth_domain *dom = NULL; int err; u8 fsidtype; struct svc_expkey key; struct svc_expkey *ek = NULL; if (mesg[mlen - 1] != '\n') return -EINVAL; mesg[mlen-1] = 0; buf = kmalloc(PAGE_SIZE, GFP_KERNEL); err = -ENOMEM; if (!buf) goto out; err = -EINVAL; if (qword_get(&mesg, buf, PAGE_SIZE) <= 0) goto out; err = -ENOENT; dom = auth_domain_find(buf); if (!dom) goto out; dprintk("found domain %s\n", buf); err = -EINVAL; if (qword_get(&mesg, buf, PAGE_SIZE) <= 0) goto out; if (kstrtou8(buf, 10, &fsidtype)) goto out; dprintk("found fsidtype %u\n", fsidtype); if (key_len(fsidtype)==0) /* invalid type */ goto out; if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0) goto out; dprintk("found fsid length %d\n", len); if (len != key_len(fsidtype)) goto out; /* OK, we seem to have a valid key */ key.h.flags = 0; err = get_expiry(&mesg, &key.h.expiry_time); if (err) goto out; key.ek_client = dom; key.ek_fsidtype = fsidtype; memcpy(key.ek_fsid, buf, len); ek = svc_expkey_lookup(cd, &key); err = -ENOMEM; if (!ek) goto out; /* now we want a pathname, or empty meaning NEGATIVE */ err = -EINVAL; len = qword_get(&mesg, buf, PAGE_SIZE); if (len < 0) goto out; dprintk("Path seems to be <%s>\n", buf); err = 0; if (len == 0) { set_bit(CACHE_NEGATIVE, &key.h.flags); ek = svc_expkey_update(cd, &key, ek); if (ek) trace_nfsd_expkey_update(ek, NULL); else err = -ENOMEM; } else { err = kern_path(buf, 0, &key.ek_path); if (err) goto out; dprintk("Found the path %s\n", buf); ek = svc_expkey_update(cd, &key, ek); if (ek) trace_nfsd_expkey_update(ek, buf); else err = -ENOMEM; path_put(&key.ek_path); } cache_flush(); out: if (ek) cache_put(&ek->h, cd); if (dom) auth_domain_put(dom); kfree(buf); return err; } static int expkey_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) { struct svc_expkey *ek ; int i; if (h ==NULL) { seq_puts(m, "#domain fsidtype fsid [path]\n"); return 0; } ek = container_of(h, struct svc_expkey, h); seq_printf(m, "%s %d 0x", ek->ek_client->name, ek->ek_fsidtype); for (i=0; i < key_len(ek->ek_fsidtype)/4; i++) seq_printf(m, "%08x", ek->ek_fsid[i]); if (test_bit(CACHE_VALID, &h->flags) && !test_bit(CACHE_NEGATIVE, &h->flags)) { seq_printf(m, " "); seq_path(m, &ek->ek_path, "\\ \t\n"); } seq_printf(m, "\n"); return 0; } static inline int expkey_match (struct cache_head *a, struct cache_head *b) { struct svc_expkey *orig = container_of(a, struct svc_expkey, h); struct svc_expkey *new = container_of(b, struct svc_expkey, h); if (orig->ek_fsidtype != new->ek_fsidtype || orig->ek_client != new->ek_client || memcmp(orig->ek_fsid, new->ek_fsid, key_len(orig->ek_fsidtype)) != 0) return 0; return 1; } static inline void expkey_init(struct cache_head *cnew, struct cache_head *citem) { struct svc_expkey *new = container_of(cnew, struct svc_expkey, h); struct svc_expkey *item = container_of(citem, struct svc_expkey, h); kref_get(&item->ek_client->ref); new->ek_client = item->ek_client; new->ek_fsidtype = item->ek_fsidtype; memcpy(new->ek_fsid, item->ek_fsid, sizeof(new->ek_fsid)); } static inline void expkey_update(struct cache_head *cnew, struct cache_head *citem) { struct svc_expkey *new = container_of(cnew, struct svc_expkey, h); struct svc_expkey *item = container_of(citem, struct svc_expkey, h); new->ek_path = item->ek_path; path_get(&item->ek_path); } static struct cache_head *expkey_alloc(void) { struct svc_expkey *i = kmalloc_obj(*i); if (i) return &i->h; else return NULL; } static void expkey_flush(void) { /* * Take the nfsd_mutex here to ensure that the file cache is not * destroyed while we're in the middle of flushing. */ mutex_lock(&nfsd_mutex); nfsd_file_cache_purge(current->nsproxy->net_ns); mutex_unlock(&nfsd_mutex); } static const struct cache_detail svc_expkey_cache_template = { .owner = THIS_MODULE, .hash_size = EXPKEY_HASHMAX, .name = "nfsd.fh", .cache_put = expkey_put, .cache_upcall = expkey_upcall, .cache_request = expkey_request, .cache_parse = expkey_parse, .cache_show = expkey_show, .match = expkey_match, .init = expkey_init, .update = expkey_update, .alloc = expkey_alloc, .flush = expkey_flush, }; static int svc_expkey_hash(struct svc_expkey *item) { int hash = item->ek_fsidtype; char * cp = (char*)item->ek_fsid; int len = key_len(item->ek_fsidtype); hash ^= hash_mem(cp, len, EXPKEY_HASHBITS); hash ^= hash_ptr(item->ek_client, EXPKEY_HASHBITS); hash &= EXPKEY_HASHMASK; return hash; } static struct svc_expkey * svc_expkey_lookup(struct cache_detail *cd, struct svc_expkey *item) { struct cache_head *ch; int hash = svc_expkey_hash(item); ch = sunrpc_cache_lookup_rcu(cd, &item->h, hash); if (ch) return container_of(ch, struct svc_expkey, h); else return NULL; } static struct svc_expkey * svc_expkey_update(struct cache_detail *cd, struct svc_expkey *new, struct svc_expkey *old) { struct cache_head *ch; int hash = svc_expkey_hash(new); ch = sunrpc_cache_update(cd, &new->h, &old->h, hash); if (ch) return container_of(ch, struct svc_expkey, h); else return NULL; } #define EXPORT_HASHBITS 8 #define EXPORT_HASHMAX (1<< EXPORT_HASHBITS) static void nfsd4_fslocs_free(struct nfsd4_fs_locations *fsloc) { struct nfsd4_fs_location *locations = fsloc->locations; int i; if (!locations) return; for (i = 0; i < fsloc->locations_count; i++) { kfree(locations[i].path); kfree(locations[i].hosts); } kfree(locations); fsloc->locations = NULL; } static int export_stats_init(struct export_stats *stats) { stats->start_time = ktime_get_seconds(); return percpu_counter_init_many(stats->counter, 0, GFP_KERNEL, EXP_STATS_COUNTERS_NUM); } static void export_stats_reset(struct export_stats *stats) { if (stats) { int i; for (i = 0; i < EXP_STATS_COUNTERS_NUM; i++) percpu_counter_set(&stats->counter[i], 0); } } static void export_stats_destroy(struct export_stats *stats) { if (stats) percpu_counter_destroy_many(stats->counter, EXP_STATS_COUNTERS_NUM); } static void svc_export_release(struct work_struct *work) { struct svc_export *exp = container_of(to_rcu_work(work), struct svc_export, ex_rwork); path_put(&exp->ex_path); auth_domain_put(exp->ex_client); nfsd4_fslocs_free(&exp->ex_fslocs); export_stats_destroy(exp->ex_stats); kfree(exp->ex_stats); kfree(exp->ex_uuid); kfree(exp); } static void svc_export_put(struct kref *ref) { struct svc_export *exp = container_of(ref, struct svc_export, h.ref); INIT_RCU_WORK(&exp->ex_rwork, svc_export_release); queue_rcu_work(nfsd_export_wq, &exp->ex_rwork); } static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h) { return sunrpc_cache_pipe_upcall(cd, h); } static void svc_export_request(struct cache_detail *cd, struct cache_head *h, char **bpp, int *blen) { /* client path */ struct svc_export *exp = container_of(h, struct svc_export, h); char *pth; qword_add(bpp, blen, exp->ex_client->name); pth = d_path(&exp->ex_path, *bpp, *blen); if (IS_ERR(pth)) { /* is this correct? */ (*bpp)[0] = '\n'; return; } qword_add(bpp, blen, pth); (*bpp)[-1] = '\n'; } static struct svc_export *svc_export_update(struct svc_export *new, struct svc_export *old); static struct svc_export *svc_export_lookup(struct svc_export *); static int check_export(const struct path *path, int *flags, unsigned char *uuid) { struct inode *inode = d_inode(path->dentry); /* * We currently export only dirs, regular files, and (for v4 * pseudoroot) symlinks. */ if (!S_ISDIR(inode->i_mode) && !S_ISLNK(inode->i_mode) && !S_ISREG(inode->i_mode)) return -ENOTDIR; /* * Mountd should never pass down a writeable V4ROOT export, but, * just to make sure: */ if (*flags & NFSEXP_V4ROOT) *flags |= NFSEXP_READONLY; /* There are two requirements on a filesystem to be exportable. * 1: We must be able to identify the filesystem from a number. * either a device number (so FS_REQUIRES_DEV needed) * or an FSID number (so NFSEXP_FSID or ->uuid is needed). * 2: We must be able to find an inode from a filehandle. * This means that s_export_op must be set and comply with * the requirements for remote filesystem export. * 3: We must not currently be on an idmapped mount. */ if (!(inode->i_sb->s_type->fs_flags & FS_REQUIRES_DEV) && !(*flags & NFSEXP_FSID) && uuid == NULL) { dprintk("exp_export: export of non-dev fs without fsid\n"); return -EINVAL; } if (!exportfs_may_export(inode->i_sb->s_export_op)) { dprintk("exp_export: export of invalid fs type (%s).\n", inode->i_sb->s_type->name); return -EINVAL; } if (is_idmapped_mnt(path->mnt)) { dprintk("exp_export: export of idmapped mounts not yet supported.\n"); return -EINVAL; } if (inode->i_sb->s_export_op->flags & EXPORT_OP_NOSUBTREECHK && !(*flags & NFSEXP_NOSUBTREECHECK)) { dprintk("%s: %s does not support subtree checking!\n", __func__, inode->i_sb->s_type->name); return -EINVAL; } return 0; } #ifdef CONFIG_NFSD_V4 static int fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc) { int len; int migrated, i, err; /* more than one fsloc */ if (fsloc->locations) return -EINVAL; /* listsize */ err = get_uint(mesg, &fsloc->locations_count); if (err) return err; if (fsloc->locations_count > MAX_FS_LOCATIONS) return -EINVAL; if (fsloc->locations_count == 0) return 0; fsloc->locations = kzalloc_objs(struct nfsd4_fs_location, fsloc->locations_count); if (!fsloc->locations) return -ENOMEM; for (i=0; i < fsloc->locations_count; i++) { /* colon separated host list */ err = -EINVAL; len = qword_get(mesg, buf, PAGE_SIZE); if (len <= 0) goto out_free_all; err = -ENOMEM; fsloc->locations[i].hosts = kstrdup(buf, GFP_KERNEL); if (!fsloc->locations[i].hosts) goto out_free_all; err = -EINVAL; /* slash separated path component list */ len = qword_get(mesg, buf, PAGE_SIZE); if (len <= 0) goto out_free_all; err = -ENOMEM; fsloc->locations[i].path = kstrdup(buf, GFP_KERNEL); if (!fsloc->locations[i].path) goto out_free_all; } /* migrated */ err = get_int(mesg, &migrated); if (err) goto out_free_all; err = -EINVAL; if (migrated < 0 || migrated > 1) goto out_free_all; fsloc->migrated = migrated; return 0; out_free_all: nfsd4_fslocs_free(fsloc); return err; } static int secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { struct exp_flavor_info *f; u32 listsize; int err; /* more than one secinfo */ if (exp->ex_nflavors) return -EINVAL; err = get_uint(mesg, &listsize); if (err) return err; if (listsize > MAX_SECINFO_LIST) return -EINVAL; for (f = exp->ex_flavors; f < exp->ex_flavors + listsize; f++) { err = get_uint(mesg, &f->pseudoflavor); if (err) return err; /* * XXX: It would be nice to also check whether this * pseudoflavor is supported, so we can discover the * problem at export time instead of when a client fails * to authenticate. */ err = get_uint(mesg, &f->flags); if (err) return err; /* Only some flags are allowed to differ between flavors: */ if (~NFSEXP_SECINFO_FLAGS & (f->flags ^ exp->ex_flags)) return -EINVAL; } exp->ex_nflavors = listsize; return 0; } #else /* CONFIG_NFSD_V4 */ static inline int fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc){return 0;} static inline int secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { return 0; } #endif static int xprtsec_parse(char **mesg, char *buf, struct svc_export *exp) { unsigned int i, mode, listsize; int err; err = get_uint(mesg, &listsize); if (err) return err; if (listsize > NFSEXP_XPRTSEC_NUM) return -EINVAL; exp->ex_xprtsec_modes = 0; for (i = 0; i < listsize; i++) { err = get_uint(mesg, &mode); if (err) return err; if (mode > NFSEXP_XPRTSEC_MTLS) return -EINVAL; exp->ex_xprtsec_modes |= mode; } return 0; } static inline int nfsd_uuid_parse(char **mesg, char *buf, unsigned char **puuid) { int len; /* more than one uuid */ if (*puuid) return -EINVAL; /* expect a 16 byte uuid encoded as \xXXXX... */ len = qword_get(mesg, buf, PAGE_SIZE); if (len != EX_UUID_LEN) return -EINVAL; *puuid = kmemdup(buf, EX_UUID_LEN, GFP_KERNEL); if (*puuid == NULL) return -ENOMEM; return 0; } static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) { /* client path expiry [flags anonuid anongid fsid] */ char *buf; int err; struct auth_domain *dom = NULL; struct svc_export exp = {}, *expp; int an_int; if (mesg[mlen-1] != '\n') return -EINVAL; mesg[mlen-1] = 0; buf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!buf) return -ENOMEM; /* client */ err = -EINVAL; if (qword_get(&mesg, buf, PAGE_SIZE) <= 0) goto out; err = -ENOENT; dom = auth_domain_find(buf); if (!dom) goto out; /* path */ err = -EINVAL; if (qword_get(&mesg, buf, PAGE_SIZE) <= 0) goto out1; err = kern_path(buf, 0, &exp.ex_path); if (err) goto out1; exp.ex_client = dom; exp.cd = cd; exp.ex_devid_map = NULL; exp.ex_xprtsec_modes = NFSEXP_XPRTSEC_ALL; /* expiry */ err = get_expiry(&mesg, &exp.h.expiry_time); if (err) goto out3; /* flags */ err = get_int(&mesg, &an_int); if (err == -ENOENT) { err = 0; set_bit(CACHE_NEGATIVE, &exp.h.flags); } else { if (err || an_int < 0) goto out3; exp.ex_flags= an_int; /* anon uid */ err = get_int(&mesg, &an_int); if (err) goto out3; exp.ex_anon_uid= make_kuid(current_user_ns(), an_int); /* anon gid */ err = get_int(&mesg, &an_int); if (err) goto out3; exp.ex_anon_gid= make_kgid(current_user_ns(), an_int); /* fsid */ err = get_int(&mesg, &an_int); if (err) goto out3; exp.ex_fsid = an_int; while (qword_get(&mesg, buf, PAGE_SIZE) > 0) { if (strcmp(buf, "fsloc") == 0) err = fsloc_parse(&mesg, buf, &exp.ex_fslocs); else if (strcmp(buf, "uuid") == 0) err = nfsd_uuid_parse(&mesg, buf, &exp.ex_uuid); else if (strcmp(buf, "secinfo") == 0) err = secinfo_parse(&mesg, buf, &exp); else if (strcmp(buf, "xprtsec") == 0) err = xprtsec_parse(&mesg, buf, &exp); else /* quietly ignore unknown words and anything * following. Newer user-space can try to set * new values, then see what the result was. */ break; if (err) goto out4; } err = check_export(&exp.ex_path, &exp.ex_flags, exp.ex_uuid); if (err) goto out4; /* * No point caching this if it would immediately expire. * Also, this protects exportfs's dummy export from the * anon_uid/anon_gid checks: */ if (exp.h.expiry_time < seconds_since_boot()) goto out4; /* * For some reason exportfs has been passing down an * invalid (-1) uid & gid on the "dummy" export which it * uses to test export support. To make sure exportfs * sees errors from check_export we therefore need to * delay these checks till after check_export: */ err = -EINVAL; if (!uid_valid(exp.ex_anon_uid)) goto out4; if (!gid_valid(exp.ex_anon_gid)) goto out4; err = 0; nfsd4_setup_layout_type(&exp); } expp = svc_export_lookup(&exp); if (!expp) { err = -ENOMEM; goto out4; } expp = svc_export_update(&exp, expp); if (expp) { trace_nfsd_export_update(expp); cache_flush(); exp_put(expp); } else err = -ENOMEM; out4: nfsd4_fslocs_free(&exp.ex_fslocs); kfree(exp.ex_uuid); out3: path_put(&exp.ex_path); out1: auth_domain_put(dom); out: kfree(buf); return err; } static void exp_flags(struct seq_file *m, int flag, int fsid, kuid_t anonu, kgid_t anong, struct nfsd4_fs_locations *fslocs); static void show_secinfo(struct seq_file *m, struct svc_export *exp); static int is_export_stats_file(struct seq_file *m) { /* * The export_stats file uses the same ops as the exports file. * We use the file's name to determine the reported info per export. * There is no rename in nsfdfs, so d_name.name is stable. */ return !strcmp(m->file->f_path.dentry->d_name.name, "export_stats"); } static int svc_export_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) { struct svc_export *exp; bool export_stats = is_export_stats_file(m); if (h == NULL) { if (export_stats) seq_puts(m, "#path domain start-time\n#\tstats\n"); else seq_puts(m, "#path domain(flags)\n"); return 0; } exp = container_of(h, struct svc_export, h); seq_path(m, &exp->ex_path, " \t\n\\"); seq_putc(m, '\t'); seq_escape(m, exp->ex_client->name, " \t\n\\"); if (export_stats) { struct percpu_counter *counter = exp->ex_stats->counter; seq_printf(m, "\t%lld\n", exp->ex_stats->start_time); seq_printf(m, "\tfh_stale: %lld\n", percpu_counter_sum_positive(&counter[EXP_STATS_FH_STALE])); seq_printf(m, "\tio_read: %lld\n", percpu_counter_sum_positive(&counter[EXP_STATS_IO_READ])); seq_printf(m, "\tio_write: %lld\n", percpu_counter_sum_positive(&counter[EXP_STATS_IO_WRITE])); seq_putc(m, '\n'); return 0; } seq_putc(m, '('); if (test_bit(CACHE_VALID, &h->flags) && !test_bit(CACHE_NEGATIVE, &h->flags)) { exp_flags(m, exp->ex_flags, exp->ex_fsid, exp->ex_anon_uid, exp->ex_anon_gid, &exp->ex_fslocs); if (exp->ex_uuid) { int i; seq_puts(m, ",uuid="); for (i = 0; i < EX_UUID_LEN; i++) { if ((i&3) == 0 && i) seq_putc(m, ':'); seq_printf(m, "%02x", exp->ex_uuid[i]); } } show_secinfo(m, exp); } seq_puts(m, ")\n"); return 0; } static int svc_export_match(struct cache_head *a, struct cache_head *b) { struct svc_export *orig = container_of(a, struct svc_export, h); struct svc_export *new = container_of(b, struct svc_export, h); return orig->ex_client == new->ex_client && path_equal(&orig->ex_path, &new->ex_path); } static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) { struct svc_export *new = container_of(cnew, struct svc_export, h); struct svc_export *item = container_of(citem, struct svc_export, h); kref_get(&item->ex_client->ref); new->ex_client = item->ex_client; new->ex_path = item->ex_path; path_get(&item->ex_path); new->ex_fslocs.locations = NULL; new->ex_fslocs.locations_count = 0; new->ex_fslocs.migrated = 0; new->ex_layout_types = 0; new->ex_uuid = NULL; new->cd = item->cd; export_stats_reset(new->ex_stats); } static void export_update(struct cache_head *cnew, struct cache_head *citem) { struct svc_export *new = container_of(cnew, struct svc_export, h); struct svc_export *item = container_of(citem, struct svc_export, h); int i; new->ex_flags = item->ex_flags; new->ex_anon_uid = item->ex_anon_uid; new->ex_anon_gid = item->ex_anon_gid; new->ex_fsid = item->ex_fsid; new->ex_devid_map = item->ex_devid_map; item->ex_devid_map = NULL; new->ex_uuid = item->ex_uuid; item->ex_uuid = NULL; new->ex_fslocs.locations = item->ex_fslocs.locations; item->ex_fslocs.locations = NULL; new->ex_fslocs.locations_count = item->ex_fslocs.locations_count; item->ex_fslocs.locations_count = 0; new->ex_fslocs.migrated = item->ex_fslocs.migrated; item->ex_fslocs.migrated = 0; new->ex_layout_types = item->ex_layout_types; new->ex_nflavors = item->ex_nflavors; for (i = 0; i < MAX_SECINFO_LIST; i++) { new->ex_flavors[i] = item->ex_flavors[i]; } new->ex_xprtsec_modes = item->ex_xprtsec_modes; } static struct cache_head *svc_export_alloc(void) { struct svc_export *i = kmalloc_obj(*i); if (!i) return NULL; i->ex_stats = kmalloc_obj(*(i->ex_stats)); if (!i->ex_stats) { kfree(i); return NULL; } if (export_stats_init(i->ex_stats)) { kfree(i->ex_stats); kfree(i); return NULL; } return &i->h; } static const struct cache_detail svc_export_cache_template = { .owner = THIS_MODULE, .hash_size = EXPORT_HASHMAX, .name = "nfsd.export", .cache_put = svc_export_put, .cache_upcall = svc_export_upcall, .cache_request = svc_export_request, .cache_parse = svc_export_parse, .cache_show = svc_export_show, .match = svc_export_match, .init = svc_export_init, .update = export_update, .alloc = svc_export_alloc, }; static int svc_export_hash(struct svc_export *exp) { int hash; hash = hash_ptr(exp->ex_client, EXPORT_HASHBITS); hash ^= hash_ptr(exp->ex_path.dentry, EXPORT_HASHBITS); hash ^= hash_ptr(exp->ex_path.mnt, EXPORT_HASHBITS); return hash; } static struct svc_export * svc_export_lookup(struct svc_export *exp) { struct cache_head *ch; int hash = svc_export_hash(exp); ch = sunrpc_cache_lookup_rcu(exp->cd, &exp->h, hash); if (ch) return container_of(ch, struct svc_export, h); else return NULL; } static struct svc_export * svc_export_update(struct svc_export *new, struct svc_export *old) { struct cache_head *ch; int hash = svc_export_hash(old); ch = sunrpc_cache_update(old->cd, &new->h, &old->h, hash); if (ch) return container_of(ch, struct svc_export, h); else return NULL; } static struct svc_expkey * exp_find_key(struct cache_detail *cd, struct auth_domain *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp) { struct svc_expkey key, *ek; int err; if (!clp) return ERR_PTR(-ENOENT); key.ek_client = clp; key.ek_fsidtype = fsid_type; memcpy(key.ek_fsid, fsidv, key_len(fsid_type)); ek = svc_expkey_lookup(cd, &key); if (ek == NULL) return ERR_PTR(-ENOMEM); err = cache_check(cd, &ek->h, reqp); if (err) { trace_nfsd_exp_find_key(&key, err); return ERR_PTR(err); } return ek; } static struct svc_export * exp_get_by_name(struct cache_detail *cd, struct auth_domain *clp, const struct path *path, struct cache_req *reqp) { struct svc_export *exp, key; int err; if (!clp) return ERR_PTR(-ENOENT); key.ex_client = clp; key.ex_path = *path; key.cd = cd; exp = svc_export_lookup(&key); if (exp == NULL) return ERR_PTR(-ENOMEM); err = cache_check(cd, &exp->h, reqp); if (err) { trace_nfsd_exp_get_by_name(&key, err); return ERR_PTR(err); } return exp; } /* * Find the export entry for a given dentry. */ static struct svc_export * exp_parent(struct cache_detail *cd, struct auth_domain *clp, struct path *path) { struct dentry *saved = dget(path->dentry); struct svc_export *exp = exp_get_by_name(cd, clp, path, NULL); while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) { struct dentry *parent = dget_parent(path->dentry); dput(path->dentry); path->dentry = parent; exp = exp_get_by_name(cd, clp, path, NULL); } dput(path->dentry); path->dentry = saved; return exp; } /* * Obtain the root fh on behalf of a client. * This could be done in user space, but I feel that it adds some safety * since its harder to fool a kernel module than a user space program. */ int exp_rootfh(struct net *net, struct auth_domain *clp, char *name, struct knfsd_fh *f, int maxsize) { struct svc_export *exp; struct path path; struct inode *inode __maybe_unused; struct svc_fh fh; int err; struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct cache_detail *cd = nn->svc_export_cache; err = -EPERM; /* NB: we probably ought to check that it's NUL-terminated */ if (kern_path(name, 0, &path)) { printk("nfsd: exp_rootfh path not found %s", name); return err; } inode = d_inode(path.dentry); dprintk("nfsd: exp_rootfh(%s [%p] %s:%s/%llu)\n", name, path.dentry, clp->name, inode->i_sb->s_id, inode->i_ino); exp = exp_parent(cd, clp, &path); if (IS_ERR(exp)) { err = PTR_ERR(exp); goto out; } /* * fh must be initialized before calling fh_compose */ fh_init(&fh, maxsize); if (fh_compose(&fh, exp, path.dentry, NULL)) err = -EINVAL; else err = 0; memcpy(f, &fh.fh_handle, sizeof(struct knfsd_fh)); fh_put(&fh); exp_put(exp); out: path_put(&path); return err; } static struct svc_export *exp_find(struct cache_detail *cd, struct auth_domain *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp) { struct svc_export *exp; struct nfsd_net *nn = net_generic(cd->net, nfsd_net_id); struct svc_expkey *ek = exp_find_key(nn->svc_expkey_cache, clp, fsid_type, fsidv, reqp); if (IS_ERR(ek)) return ERR_CAST(ek); exp = exp_get_by_name(cd, clp, &ek->ek_path, reqp); cache_put(&ek->h, nn->svc_expkey_cache); if (IS_ERR(exp)) return ERR_CAST(exp); return exp; } /** * check_xprtsec_policy - check if access to export is allowed by the * xprtsec policy * @exp: svc_export that is being accessed. * @rqstp: svc_rqst attempting to access @exp. * * Helper function for check_nfsd_access(). Note that callers should be * using check_nfsd_access() instead of calling this function directly. The * one exception is __fh_verify() since it has logic that may result in one * or both of the helpers being skipped. * * Return values: * %nfs_ok if access is granted, or * %nfserr_wrongsec if access is denied */ __be32 check_xprtsec_policy(struct svc_export *exp, struct svc_rqst *rqstp) { struct svc_xprt *xprt = rqstp->rq_xprt; if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_NONE) { if (!test_bit(XPT_TLS_SESSION, &xprt->xpt_flags)) return nfs_ok; } if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_TLS) { if (test_bit(XPT_TLS_SESSION, &xprt->xpt_flags) && !test_bit(XPT_PEER_AUTH, &xprt->xpt_flags)) return nfs_ok; } if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_MTLS) { if (test_bit(XPT_TLS_SESSION, &xprt->xpt_flags) && test_bit(XPT_PEER_AUTH, &xprt->xpt_flags)) return nfs_ok; } return nfserr_wrongsec; } /** * check_security_flavor - check if access to export is allowed by the * security flavor * @exp: svc_export that is being accessed. * @rqstp: svc_rqst attempting to access @exp. * @may_bypass_gss: reduce strictness of authorization check * * Helper function for check_nfsd_access(). Note that callers should be * using check_nfsd_access() instead of calling this function directly. The * one exception is __fh_verify() since it has logic that may result in one * or both of the helpers being skipped. * * Return values: * %nfs_ok if access is granted, or * %nfserr_wrongsec if access is denied */ __be32 check_security_flavor(struct svc_export *exp, struct svc_rqst *rqstp, bool may_bypass_gss) { struct exp_flavor_info *f, *end = exp->ex_flavors + exp->ex_nflavors; /* legacy gss-only clients are always OK: */ if (exp->ex_client == rqstp->rq_gssclient) return nfs_ok; /* ip-address based client; check sec= export option: */ for (f = exp->ex_flavors; f < end; f++) { if (f->pseudoflavor == rqstp->rq_cred.cr_flavor) return nfs_ok; } /* defaults in absence of sec= options: */ if (exp->ex_nflavors == 0) { if (rqstp->rq_cred.cr_flavor == RPC_AUTH_NULL || rqstp->rq_cred.cr_flavor == RPC_AUTH_UNIX) return nfs_ok; } /* If the compound op contains a spo_must_allowed op, * it will be sent with integrity/protection which * will have to be expressly allowed on mounts that * don't support it */ if (nfsd4_spo_must_allow(rqstp)) return nfs_ok; /* Some calls may be processed without authentication * on GSS exports. For example NFS2/3 calls on root * directory, see section 2.3.2 of rfc 2623. * For "may_bypass_gss" check that export has really * enabled some flavor with authentication (GSS or any * other) and also check that the used auth flavor is * without authentication (none or sys). */ if (may_bypass_gss && ( rqstp->rq_cred.cr_flavor == RPC_AUTH_NULL || rqstp->rq_cred.cr_flavor == RPC_AUTH_UNIX)) { for (f = exp->ex_flavors; f < end; f++) { if (f->pseudoflavor >= RPC_AUTH_DES) return 0; } } return nfserr_wrongsec; } /** * check_nfsd_access - check if access to export is allowed. * @exp: svc_export that is being accessed. * @rqstp: svc_rqst attempting to access @exp. * @may_bypass_gss: reduce strictness of authorization check * * Return values: * %nfs_ok if access is granted, or * %nfserr_wrongsec if access is denied */ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp, bool may_bypass_gss) { __be32 status; status = check_xprtsec_policy(exp, rqstp); if (status != nfs_ok) return status; return check_security_flavor(exp, rqstp, may_bypass_gss); } /* * Uses rq_client and rq_gssclient to find an export; uses rq_client (an * auth_unix client) if it's available and has secinfo information; * otherwise, will try to use rq_gssclient. * * Called from functions that handle requests; functions that do work on * behalf of mountd are passed a single client name to use, and should * use exp_get_by_name() or exp_find(). */ struct svc_export * rqst_exp_get_by_name(struct svc_rqst *rqstp, const struct path *path) { struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT); struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct cache_detail *cd = nn->svc_export_cache; if (rqstp->rq_client == NULL) goto gss; /* First try the auth_unix client: */ exp = exp_get_by_name(cd, rqstp->rq_client, path, &rqstp->rq_chandle); if (PTR_ERR(exp) == -ENOENT) goto gss; if (IS_ERR(exp)) return exp; /* If it has secinfo, assume there are no gss/... clients */ if (exp->ex_nflavors > 0) return exp; gss: /* Otherwise, try falling back on gss client */ if (rqstp->rq_gssclient == NULL) return exp; gssexp = exp_get_by_name(cd, rqstp->rq_gssclient, path, &rqstp->rq_chandle); if (PTR_ERR(gssexp) == -ENOENT) return exp; if (!IS_ERR(exp)) exp_put(exp); return gssexp; } /** * rqst_exp_find - Find an svc_export in the context of a rqst or similar * @reqp: The handle to be used to suspend the request if a cache-upcall is needed * If NULL, missing in-cache information will result in failure. * @net: The network namespace in which the request exists * @cl: default auth_domain to use for looking up the export * @gsscl: an alternate auth_domain defined using deprecated gss/krb5 format. * @fsid_type: The type of fsid to look for * @fsidv: The actual fsid to look up in the context of either client. * * Perform a lookup for @cl/@fsidv in the given @net for an export. If * none found and @gsscl specified, repeat the lookup. * * Returns an export, or an error pointer. */ struct svc_export * rqst_exp_find(struct cache_req *reqp, struct net *net, struct auth_domain *cl, struct auth_domain *gsscl, int fsid_type, u32 *fsidv) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT); struct cache_detail *cd = nn->svc_export_cache; if (!cl) goto gss; /* First try the auth_unix client: */ exp = exp_find(cd, cl, fsid_type, fsidv, reqp); if (PTR_ERR(exp) == -ENOENT) goto gss; if (IS_ERR(exp)) return exp; /* If it has secinfo, assume there are no gss/... clients */ if (exp->ex_nflavors > 0) return exp; gss: /* Otherwise, try falling back on gss client */ if (!gsscl) return exp; gssexp = exp_find(cd, gsscl, fsid_type, fsidv, reqp); if (PTR_ERR(gssexp) == -ENOENT) return exp; if (!IS_ERR(exp)) exp_put(exp); return gssexp; } struct svc_export * rqst_exp_parent(struct svc_rqst *rqstp, struct path *path) { struct dentry *saved = dget(path->dentry); struct svc_export *exp = rqst_exp_get_by_name(rqstp, path); while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) { struct dentry *parent = dget_parent(path->dentry); dput(path->dentry); path->dentry = parent; exp = rqst_exp_get_by_name(rqstp, path); } dput(path->dentry); path->dentry = saved; return exp; } struct svc_export *rqst_find_fsidzero_export(struct svc_rqst *rqstp) { u32 fsidv[2]; mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL); return rqst_exp_find(&rqstp->rq_chandle, SVC_NET(rqstp), rqstp->rq_client, rqstp->rq_gssclient, FSID_NUM, fsidv); } /* * Called when we need the filehandle for the root of the pseudofs, * for a given NFSv4 client. The root is defined to be the * export point with fsid==0 */ __be32 exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp) { struct svc_export *exp; __be32 rv; exp = rqst_find_fsidzero_export(rqstp); if (IS_ERR(exp)) return nfserrno(PTR_ERR(exp)); rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL); exp_put(exp); return rv; } static struct flags { int flag; char *name[2]; } expflags[] = { { NFSEXP_READONLY, {"ro", "rw"}}, { NFSEXP_INSECURE_PORT, {"insecure", ""}}, { NFSEXP_ROOTSQUASH, {"root_squash", "no_root_squash"}}, { NFSEXP_ALLSQUASH, {"all_squash", ""}}, { NFSEXP_ASYNC, {"async", "sync"}}, { NFSEXP_GATHERED_WRITES, {"wdelay", "no_wdelay"}}, { NFSEXP_NOREADDIRPLUS, {"nordirplus", ""}}, { NFSEXP_SECURITY_LABEL, {"security_label", ""}}, { NFSEXP_SIGN_FH, {"sign_fh", ""}}, { NFSEXP_NOHIDE, {"nohide", ""}}, { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}}, { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}}, { NFSEXP_CROSSMOUNT, {"crossmnt", ""}}, { NFSEXP_V4ROOT, {"v4root", ""}}, { NFSEXP_PNFS, {"pnfs", ""}}, { 0, {"", ""}} }; static void show_expflags(struct seq_file *m, int flags, int mask) { struct flags *flg; int state, first = 0; for (flg = expflags; flg->flag; flg++) { if (flg->flag & ~mask) continue; state = (flg->flag & flags) ? 0 : 1; if (*flg->name[state]) seq_printf(m, "%s%s", first++?",":"", flg->name[state]); } } static void show_secinfo_flags(struct seq_file *m, int flags) { seq_printf(m, ","); show_expflags(m, flags, NFSEXP_SECINFO_FLAGS); } static bool secinfo_flags_equal(int f, int g) { f &= NFSEXP_SECINFO_FLAGS; g &= NFSEXP_SECINFO_FLAGS; return f == g; } static int show_secinfo_run(struct seq_file *m, struct exp_flavor_info **fp, struct exp_flavor_info *end) { int flags; flags = (*fp)->flags; seq_printf(m, ",sec=%d", (*fp)->pseudoflavor); (*fp)++; while (*fp != end && secinfo_flags_equal(flags, (*fp)->flags)) { seq_printf(m, ":%d", (*fp)->pseudoflavor); (*fp)++; } return flags; } static void show_secinfo(struct seq_file *m, struct svc_export *exp) { struct exp_flavor_info *f; struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors; int flags; if (exp->ex_nflavors == 0) return; f = exp->ex_flavors; flags = show_secinfo_run(m, &f, end); if (!secinfo_flags_equal(flags, exp->ex_flags)) show_secinfo_flags(m, flags); while (f != end) { flags = show_secinfo_run(m, &f, end); show_secinfo_flags(m, flags); } } static void exp_flags(struct seq_file *m, int flag, int fsid, kuid_t anonu, kgid_t anong, struct nfsd4_fs_locations *fsloc) { struct user_namespace *userns = m->file->f_cred->user_ns; show_expflags(m, flag, NFSEXP_ALLFLAGS); if (flag & NFSEXP_FSID) seq_printf(m, ",fsid=%d", fsid); if (!uid_eq(anonu, make_kuid(userns, (uid_t)-2)) && !uid_eq(anonu, make_kuid(userns, 0x10000-2))) seq_printf(m, ",anonuid=%u", from_kuid_munged(userns, anonu)); if (!gid_eq(anong, make_kgid(userns, (gid_t)-2)) && !gid_eq(anong, make_kgid(userns, 0x10000-2))) seq_printf(m, ",anongid=%u", from_kgid_munged(userns, anong)); if (fsloc && fsloc->locations_count > 0) { char *loctype = (fsloc->migrated) ? "refer" : "replicas"; int i; seq_printf(m, ",%s=", loctype); seq_escape(m, fsloc->locations[0].path, ",;@ \t\n\\"); seq_putc(m, '@'); seq_escape(m, fsloc->locations[0].hosts, ",;@ \t\n\\"); for (i = 1; i < fsloc->locations_count; i++) { seq_putc(m, ';'); seq_escape(m, fsloc->locations[i].path, ",;@ \t\n\\"); seq_putc(m, '@'); seq_escape(m, fsloc->locations[i].hosts, ",;@ \t\n\\"); } } } static int e_show(struct seq_file *m, void *p) { struct cache_head *cp = p; struct svc_export *exp = container_of(cp, struct svc_export, h); struct cache_detail *cd = m->private; bool export_stats = is_export_stats_file(m); if (p == SEQ_START_TOKEN) { seq_puts(m, "# Version 1.1\n"); if (export_stats) seq_puts(m, "# Path Client Start-time\n#\tStats\n"); else seq_puts(m, "# Path Client(Flags) # IPs\n"); return 0; } if (cache_check_rcu(cd, &exp->h, NULL)) return 0; return svc_export_show(m, cd, cp); } const struct seq_operations nfs_exports_op = { .start = cache_seq_start_rcu, .next = cache_seq_next_rcu, .stop = cache_seq_stop_rcu, .show = e_show, }; /** * nfsd_export_wq_init - allocate the export release workqueue * * Called once at module load. The workqueue runs deferred svc_export and * svc_expkey release work scheduled by queue_rcu_work() in the cache put * callbacks. * * Return values: * %0: workqueue allocated * %-ENOMEM: allocation failed */ int nfsd_export_wq_init(void) { nfsd_export_wq = alloc_workqueue("nfsd_export", WQ_UNBOUND, 0); if (!nfsd_export_wq) return -ENOMEM; return 0; } /** * nfsd_export_wq_shutdown - drain and free the export release workqueue * * Called once at module unload. Per-namespace teardown in * nfsd_export_shutdown() has already drained all deferred work. */ void nfsd_export_wq_shutdown(void) { destroy_workqueue(nfsd_export_wq); } /* * Initialize the exports module. */ int nfsd_export_init(struct net *net) { int rv; struct nfsd_net *nn = net_generic(net, nfsd_net_id); dprintk("nfsd: initializing export module (net: %x).\n", net->ns.inum); nn->svc_export_cache = cache_create_net(&svc_export_cache_template, net); if (IS_ERR(nn->svc_export_cache)) return PTR_ERR(nn->svc_export_cache); rv = cache_register_net(nn->svc_export_cache, net); if (rv) goto destroy_export_cache; nn->svc_expkey_cache = cache_create_net(&svc_expkey_cache_template, net); if (IS_ERR(nn->svc_expkey_cache)) { rv = PTR_ERR(nn->svc_expkey_cache); goto unregister_export_cache; } rv = cache_register_net(nn->svc_expkey_cache, net); if (rv) goto destroy_expkey_cache; return 0; destroy_expkey_cache: cache_destroy_net(nn->svc_expkey_cache, net); unregister_export_cache: cache_unregister_net(nn->svc_export_cache, net); destroy_export_cache: cache_destroy_net(nn->svc_export_cache, net); return rv; } /* * Flush exports table - called when last nfsd thread is killed */ void nfsd_export_flush(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); cache_purge(nn->svc_expkey_cache); cache_purge(nn->svc_export_cache); } /* * Shutdown the exports module. */ void nfsd_export_shutdown(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); dprintk("nfsd: shutting down export module (net: %x).\n", net->ns.inum); cache_unregister_net(nn->svc_expkey_cache, net); cache_unregister_net(nn->svc_export_cache, net); /* Drain deferred export and expkey release work. */ rcu_barrier(); flush_workqueue(nfsd_export_wq); cache_destroy_net(nn->svc_expkey_cache, net); cache_destroy_net(nn->svc_export_cache, net); svcauth_unix_purge(net); dprintk("nfsd: export shutdown complete (net: %x).\n", net->ns.inum); } |
| 11 1 27 28 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 | // SPDX-License-Identifier: GPL-2.0 #ifndef __KVM_X86_MMU_TDP_MMU_H #define __KVM_X86_MMU_TDP_MMU_H #include <linux/kvm_host.h> #include "spte.h" void kvm_mmu_init_tdp_mmu(struct kvm *kvm); void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool private); __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root) { return refcount_inc_not_zero(&root->tdp_mmu_root_count); } void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root); enum kvm_tdp_mmu_root_types { KVM_INVALID_ROOTS = BIT(0), KVM_DIRECT_ROOTS = BIT(1), KVM_MIRROR_ROOTS = BIT(2), KVM_VALID_ROOTS = KVM_DIRECT_ROOTS | KVM_MIRROR_ROOTS, KVM_ALL_ROOTS = KVM_VALID_ROOTS | KVM_INVALID_ROOTS, }; static inline enum kvm_tdp_mmu_root_types kvm_gfn_range_filter_to_root_types(struct kvm *kvm, enum kvm_gfn_range_filter process) { enum kvm_tdp_mmu_root_types ret = 0; if (!kvm_has_mirrored_tdp(kvm)) return KVM_DIRECT_ROOTS; if (process & KVM_FILTER_PRIVATE) ret |= KVM_MIRROR_ROOTS; if (process & KVM_FILTER_SHARED) ret |= KVM_DIRECT_ROOTS; WARN_ON_ONCE(!ret); return ret; } static inline struct kvm_mmu_page *tdp_mmu_get_root_for_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { if (unlikely(!kvm_is_addr_direct(vcpu->kvm, fault->addr))) return root_to_sp(vcpu->arch.mmu->mirror_root_hpa); return root_to_sp(vcpu->arch.mmu->root.hpa); } static inline struct kvm_mmu_page *tdp_mmu_get_root(struct kvm_vcpu *vcpu, enum kvm_tdp_mmu_root_types type) { if (unlikely(type == KVM_MIRROR_ROOTS)) return root_to_sp(vcpu->arch.mmu->mirror_root_hpa); return root_to_sp(vcpu->arch.mmu->root.hpa); } bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush); bool kvm_tdp_mmu_zap_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); void kvm_tdp_mmu_zap_all(struct kvm *kvm); void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm, enum kvm_tdp_mmu_root_types root_types); void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared); int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, bool flush); bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, const struct kvm_memory_slot *slot, int min_level); void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, const struct kvm_memory_slot *slot); void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, unsigned long mask, bool wrprot); void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm, const struct kvm_memory_slot *slot); bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, int min_level); void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t start, gfn_t end, int target_level, bool shared); static inline void kvm_tdp_mmu_walk_lockless_begin(void) { rcu_read_lock(); } static inline void kvm_tdp_mmu_walk_lockless_end(void) { rcu_read_unlock(); } int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level); u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn, u64 *spte); #ifdef CONFIG_X86_64 static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; } #else static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; } #endif #endif /* __KVM_X86_MMU_TDP_MMU_H */ |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* I2C message transfer tracepoints * * Copyright (C) 2013 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #undef TRACE_SYSTEM #define TRACE_SYSTEM i2c #if !defined(_TRACE_I2C_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_I2C_H #include <linux/i2c.h> #include <linux/tracepoint.h> /* * drivers/i2c/i2c-core-base.c */ extern int i2c_transfer_trace_reg(void); extern void i2c_transfer_trace_unreg(void); /* * __i2c_transfer() write request */ TRACE_EVENT_FN(i2c_write, TP_PROTO(const struct i2c_adapter *adap, const struct i2c_msg *msg, int num), TP_ARGS(adap, msg, num), TP_STRUCT__entry( __field(int, adapter_nr ) __field(__u16, msg_nr ) __field(__u16, addr ) __field(__u16, flags ) __field(__u16, len ) __dynamic_array(__u8, buf, msg->len) ), TP_fast_assign( __entry->adapter_nr = adap->nr; __entry->msg_nr = num; __entry->addr = msg->addr; __entry->flags = msg->flags; __entry->len = msg->len; memcpy(__get_dynamic_array(buf), msg->buf, msg->len); ), TP_printk("i2c-%d #%u a=%03x f=%04x l=%u [%*phD]", __entry->adapter_nr, __entry->msg_nr, __entry->addr, __entry->flags, __entry->len, __entry->len, __get_dynamic_array(buf) ), i2c_transfer_trace_reg, i2c_transfer_trace_unreg); /* * __i2c_transfer() read request */ TRACE_EVENT_FN(i2c_read, TP_PROTO(const struct i2c_adapter *adap, const struct i2c_msg *msg, int num), TP_ARGS(adap, msg, num), TP_STRUCT__entry( __field(int, adapter_nr ) __field(__u16, msg_nr ) __field(__u16, addr ) __field(__u16, flags ) __field(__u16, len ) ), TP_fast_assign( __entry->adapter_nr = adap->nr; __entry->msg_nr = num; __entry->addr = msg->addr; __entry->flags = msg->flags; __entry->len = msg->len; ), TP_printk("i2c-%d #%u a=%03x f=%04x l=%u", __entry->adapter_nr, __entry->msg_nr, __entry->addr, __entry->flags, __entry->len ), i2c_transfer_trace_reg, i2c_transfer_trace_unreg); /* * __i2c_transfer() read reply */ TRACE_EVENT_FN(i2c_reply, TP_PROTO(const struct i2c_adapter *adap, const struct i2c_msg *msg, int num), TP_ARGS(adap, msg, num), TP_STRUCT__entry( __field(int, adapter_nr ) __field(__u16, msg_nr ) __field(__u16, addr ) __field(__u16, flags ) __field(__u16, len ) __dynamic_array(__u8, buf, msg->len) ), TP_fast_assign( __entry->adapter_nr = adap->nr; __entry->msg_nr = num; __entry->addr = msg->addr; __entry->flags = msg->flags; __entry->len = msg->len; memcpy(__get_dynamic_array(buf), msg->buf, msg->len); ), TP_printk("i2c-%d #%u a=%03x f=%04x l=%u [%*phD]", __entry->adapter_nr, __entry->msg_nr, __entry->addr, __entry->flags, __entry->len, __entry->len, __get_dynamic_array(buf) ), i2c_transfer_trace_reg, i2c_transfer_trace_unreg); /* * __i2c_transfer() result */ TRACE_EVENT_FN(i2c_result, TP_PROTO(const struct i2c_adapter *adap, int num, int ret), TP_ARGS(adap, num, ret), TP_STRUCT__entry( __field(int, adapter_nr ) __field(__u16, nr_msgs ) __field(__s16, ret ) ), TP_fast_assign( __entry->adapter_nr = adap->nr; __entry->nr_msgs = num; __entry->ret = ret; ), TP_printk("i2c-%d n=%u ret=%d", __entry->adapter_nr, __entry->nr_msgs, __entry->ret ), i2c_transfer_trace_reg, i2c_transfer_trace_unreg); #endif /* _TRACE_I2C_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
| 2387 1322 1322 1322 1320 1506 1506 215 1511 1515 18 1511 1505 1509 1517 215 4564 4562 4561 4564 4554 18 108 107 55 4564 4564 4560 4559 4561 4557 441 4568 4566 4558 4556 4567 4557 5741 5749 2 5750 5736 5749 4010 5743 2 5750 5734 109 109 109 109 109 109 4 2 2 87 8 86 16 72 86 71 86 86 84 87 87 84 78 78 78 286 286 286 286 286 286 286 286 1752 169 207 207 206 175 102 140 62 175 1785 1787 1781 1576 784 428 242 622 1787 93 94 52 51 52 52 94 765 1840 1840 1789 765 702 356 349 302 192 175 302 765 5712 5711 5711 5574 5203 5701 2186 2186 32 2185 38 38 38 38 34 34 34 34 10 10 33 33 34 33 33 17 17 17 17 17 17 17 17 56 56 56 56 56 47 24 56 56 56 56 1746 1678 1751 108 108 38 1751 9 1753 38 1755 1752 1752 37 37 36 17 17 14 45 44 44 43 1751 1751 1749 7 7 7 497 500 499 499 501 499 500 499 499 500 501 500 501 488 501 489 500 332 185 5043 5049 5040 1283 5055 5043 5046 1649 54 54 1620 1621 1616 582 1613 5049 1288 5035 4669 5049 1614 1614 1614 1282 357 355 357 354 358 359 521 518 521 364 364 354 364 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/swap.c * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds */ /* * This file contains the default values for the operation of the * Linux VM subsystem. Fine-tuning documentation can be found in * Documentation/admin-guide/sysctl/vm.rst. * Started 18.12.91 * Swap aging added 23.2.95, Stephen Tweedie. * Buffermem limits added 12.3.98, Rik van Riel. */ #include <linux/mm.h> #include <linux/sched.h> #include <linux/kernel_stat.h> #include <linux/swap.h> #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/folio_batch.h> #include <linux/init.h> #include <linux/export.h> #include <linux/mm_inline.h> #include <linux/percpu_counter.h> #include <linux/memremap.h> #include <linux/percpu.h> #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/backing-dev.h> #include <linux/memcontrol.h> #include <linux/gfp.h> #include <linux/uio.h> #include <linux/hugetlb.h> #include <linux/page_idle.h> #include <linux/local_lock.h> #include <linux/buffer_head.h> #include "internal.h" #define CREATE_TRACE_POINTS #include <trace/events/pagemap.h> /* How many pages do we try to swap or page in/out together? As a power of 2 */ int page_cluster; static const int page_cluster_max = 31; struct cpu_fbatches { /* * The following folio batches are grouped together because they are protected * by disabling preemption (and interrupts remain enabled). */ local_lock_t lock; struct folio_batch lru_add; struct folio_batch lru_deactivate_file; struct folio_batch lru_deactivate; struct folio_batch lru_lazyfree; #ifdef CONFIG_SMP struct folio_batch lru_activate; #endif /* Protecting the following batches which require disabling interrupts */ local_lock_t lock_irq; struct folio_batch lru_move_tail; }; static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = { .lock = INIT_LOCAL_LOCK(lock), .lock_irq = INIT_LOCAL_LOCK(lock_irq), }; static void __page_cache_release(struct folio *folio, struct lruvec **lruvecp, unsigned long *flagsp) { if (folio_test_lru(folio)) { folio_lruvec_relock_irqsave(folio, lruvecp, flagsp); lruvec_del_folio(*lruvecp, folio); __folio_clear_lru_flags(folio); } } /* * This path almost never happens for VM activity - pages are normally freed * in batches. But it gets used by networking - and for compound pages. */ static void page_cache_release(struct folio *folio) { struct lruvec *lruvec = NULL; unsigned long flags; __page_cache_release(folio, &lruvec, &flags); if (lruvec) lruvec_unlock_irqrestore(lruvec, flags); } void __folio_put(struct folio *folio) { if (unlikely(folio_is_zone_device(folio))) { free_zone_device_folio(folio); return; } if (folio_test_hugetlb(folio)) { free_huge_folio(folio); return; } page_cache_release(folio); folio_unqueue_deferred_split(folio); mem_cgroup_uncharge(folio); free_frozen_pages(&folio->page, folio_order(folio)); } EXPORT_SYMBOL(__folio_put); typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio); static void lru_add(struct lruvec *lruvec, struct folio *folio) { int was_unevictable = folio_test_clear_unevictable(folio); long nr_pages = folio_nr_pages(folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); /* * Is an smp_mb__after_atomic() still required here, before * folio_evictable() tests the mlocked flag, to rule out the possibility * of stranding an evictable folio on an unevictable LRU? I think * not, because __munlock_folio() only clears the mlocked flag * while the LRU lock is held. * * (That is not true of __page_cache_release(), and not necessarily * true of folios_put(): but those only clear the mlocked flag after * folio_put_testzero() has excluded any other users of the folio.) */ if (folio_evictable(folio)) { if (was_unevictable) __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); } else { folio_clear_active(folio); folio_set_unevictable(folio); /* * folio->mlock_count = !!folio_test_mlocked(folio)? * But that leaves __mlock_folio() in doubt whether another * actor has already counted the mlock or not. Err on the * safe side, underestimate, let page reclaim fix it, rather * than leaving a page on the unevictable LRU indefinitely. */ folio->mlock_count = 0; if (!was_unevictable) __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); } lruvec_add_folio(lruvec, folio); trace_mm_lru_insertion(folio); } static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) { int i; struct lruvec *lruvec = NULL; unsigned long flags = 0; for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; /* block memcg migration while the folio moves between lru */ if (move_fn != lru_add && !folio_test_clear_lru(folio)) continue; folio_lruvec_relock_irqsave(folio, &lruvec, &flags); move_fn(lruvec, folio); folio_set_lru(folio); } if (lruvec) lruvec_unlock_irqrestore(lruvec, flags); folios_put(fbatch); } static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch, struct folio *folio, move_fn_t move_fn, bool disable_irq) { unsigned long flags; folio_get(folio); if (disable_irq) local_lock_irqsave(&cpu_fbatches.lock_irq, flags); else local_lock(&cpu_fbatches.lock); if (!folio_batch_add(this_cpu_ptr(fbatch), folio) || !folio_may_be_lru_cached(folio) || lru_cache_disabled()) folio_batch_move_lru(this_cpu_ptr(fbatch), move_fn); if (disable_irq) local_unlock_irqrestore(&cpu_fbatches.lock_irq, flags); else local_unlock(&cpu_fbatches.lock); } #define folio_batch_add_and_move(folio, op) \ __folio_batch_add_and_move( \ &cpu_fbatches.op, \ folio, \ op, \ offsetof(struct cpu_fbatches, op) >= \ offsetof(struct cpu_fbatches, lock_irq) \ ) static void lru_move_tail(struct lruvec *lruvec, struct folio *folio) { if (folio_test_unevictable(folio)) return; lruvec_del_folio(lruvec, folio); folio_clear_active(folio); lruvec_add_folio_tail(lruvec, folio); __count_vm_events(PGROTATED, folio_nr_pages(folio)); } /* * Writeback is about to end against a folio which has been marked for * immediate reclaim. If it still appears to be reclaimable, move it * to the tail of the inactive list. * * folio_rotate_reclaimable() must disable IRQs, to prevent nasty races. */ void folio_rotate_reclaimable(struct folio *folio) { if (folio_test_locked(folio) || folio_test_dirty(folio) || folio_test_unevictable(folio) || !folio_test_lru(folio)) return; folio_batch_add_and_move(folio, lru_move_tail); } void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file, unsigned int nr_io, unsigned int nr_rotated) __releases(lruvec->lru_lock) __releases(rcu) { unsigned long cost; /* * Reflect the relative cost of incurring IO and spending CPU * time on rotations. This doesn't attempt to make a precise * comparison, it just says: if reloads are about comparable * between the LRU lists, or rotations are overwhelmingly * different between them, adjust scan balance for CPU work. */ cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated; if (!cost) { spin_unlock_irq(&lruvec->lru_lock); rcu_read_unlock(); return; } for (;;) { unsigned long lrusize; /* Record cost event */ if (file) lruvec->file_cost += cost; else lruvec->anon_cost += cost; /* * Decay previous events * * Because workloads change over time (and to avoid * overflow) we keep these statistics as a floating * average, which ends up weighing recent refaults * more than old ones. */ lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON) + lruvec_page_state(lruvec, NR_ACTIVE_ANON) + lruvec_page_state(lruvec, NR_INACTIVE_FILE) + lruvec_page_state(lruvec, NR_ACTIVE_FILE); if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) { lruvec->file_cost /= 2; lruvec->anon_cost /= 2; } spin_unlock_irq(&lruvec->lru_lock); lruvec = parent_lruvec(lruvec); if (!lruvec) { rcu_read_unlock(); break; } spin_lock_irq(&lruvec->lru_lock); } } void lru_note_cost_refault(struct folio *folio) { struct lruvec *lruvec; lruvec = folio_lruvec_lock_irq(folio); lru_note_cost_unlock_irq(lruvec, folio_is_file_lru(folio), folio_nr_pages(folio), 0); } static void lru_activate(struct lruvec *lruvec, struct folio *folio) { long nr_pages = folio_nr_pages(folio); if (folio_test_active(folio) || folio_test_unevictable(folio)) return; lruvec_del_folio(lruvec, folio); folio_set_active(folio); lruvec_add_folio(lruvec, folio); trace_mm_lru_activate(folio); __count_vm_events(PGACTIVATE, nr_pages); count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr_pages); } #ifdef CONFIG_SMP static void folio_activate_drain(int cpu) { struct folio_batch *fbatch = &per_cpu(cpu_fbatches.lru_activate, cpu); if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_activate); } void folio_activate(struct folio *folio) { if (folio_test_active(folio) || folio_test_unevictable(folio) || !folio_test_lru(folio)) return; folio_batch_add_and_move(folio, lru_activate); } #else static inline void folio_activate_drain(int cpu) { } void folio_activate(struct folio *folio) { struct lruvec *lruvec; if (!folio_test_clear_lru(folio)) return; lruvec = folio_lruvec_lock_irq(folio); lru_activate(lruvec, folio); lruvec_unlock_irq(lruvec); folio_set_lru(folio); } #endif static void __lru_cache_activate_folio(struct folio *folio) { struct folio_batch *fbatch; int i; local_lock(&cpu_fbatches.lock); fbatch = this_cpu_ptr(&cpu_fbatches.lru_add); /* * Search backwards on the optimistic assumption that the folio being * activated has just been added to this batch. Note that only * the local batch is examined as a !LRU folio could be in the * process of being released, reclaimed, migrated or on a remote * batch that is currently being drained. Furthermore, marking * a remote batch's folio active potentially hits a race where * a folio is marked active just after it is added to the inactive * list causing accounting errors and BUG_ON checks to trigger. */ for (i = folio_batch_count(fbatch) - 1; i >= 0; i--) { struct folio *batch_folio = fbatch->folios[i]; if (batch_folio == folio) { folio_set_active(folio); break; } } local_unlock(&cpu_fbatches.lock); } #ifdef CONFIG_LRU_GEN static void lru_gen_inc_refs(struct folio *folio) { unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f); if (folio_test_unevictable(folio)) return; /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio)) { set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced)); return; } do { if ((old_flags & LRU_REFS_MASK) == LRU_REFS_MASK) { if (!folio_test_workingset(folio)) folio_set_workingset(folio); return; } new_flags = old_flags + BIT(LRU_REFS_PGOFF); } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags)); } static bool lru_gen_clear_refs(struct folio *folio) { int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); unsigned long seq; if (gen < 0) return true; set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS | BIT(PG_workingset), 0); rcu_read_lock(); seq = READ_ONCE(folio_lruvec(folio)->lrugen.min_seq[type]); rcu_read_unlock(); /* whether can do without shuffling under the LRU lock */ return gen == lru_gen_from_seq(seq); } #else /* !CONFIG_LRU_GEN */ static void lru_gen_inc_refs(struct folio *folio) { } static bool lru_gen_clear_refs(struct folio *folio) { return false; } #endif /* CONFIG_LRU_GEN */ /** * folio_mark_accessed - Mark a folio as having seen activity. * @folio: The folio to mark. * * This function will perform one of the following transitions: * * * inactive,unreferenced -> inactive,referenced * * inactive,referenced -> active,unreferenced * * active,unreferenced -> active,referenced * * When a newly allocated folio is not yet visible, so safe for non-atomic ops, * __folio_set_referenced() may be substituted for folio_mark_accessed(). */ void folio_mark_accessed(struct folio *folio) { if (folio_test_dropbehind(folio)) return; if (lru_gen_enabled()) { lru_gen_inc_refs(folio); return; } if (!folio_test_referenced(folio)) { folio_set_referenced(folio); } else if (folio_test_unevictable(folio)) { /* * Unevictable pages are on the "LRU_UNEVICTABLE" list. But, * this list is never rotated or maintained, so marking an * unevictable page accessed has no effect. */ } else if (!folio_test_active(folio)) { /* * If the folio is on the LRU, queue it for activation via * cpu_fbatches.lru_activate. Otherwise, assume the folio is in a * folio_batch, mark it active and it'll be moved to the active * LRU on the next drain. */ if (folio_test_lru(folio)) folio_activate(folio); else __lru_cache_activate_folio(folio); folio_clear_referenced(folio); workingset_activation(folio); } if (folio_test_idle(folio)) folio_clear_idle(folio); } EXPORT_SYMBOL(folio_mark_accessed); /** * folio_add_lru - Add a folio to an LRU list. * @folio: The folio to be added to the LRU. * * Queue the folio for addition to the LRU. The decision on whether * to add the page to the [in]active [file|anon] list is deferred until the * folio_batch is drained. This gives a chance for the caller of folio_add_lru() * have the folio added to the active list using folio_mark_accessed(). */ void folio_add_lru(struct folio *folio) { VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); /* see the comment in lru_gen_folio_seq() */ if (lru_gen_enabled() && !folio_test_unevictable(folio) && lru_gen_in_fault() && !(current->flags & PF_MEMALLOC)) folio_set_active(folio); folio_batch_add_and_move(folio, lru_add); } EXPORT_SYMBOL(folio_add_lru); /** * folio_add_lru_vma() - Add a folio to the appropriate LRU list for this VMA. * @folio: The folio to be added to the LRU. * @vma: VMA in which the folio is mapped. * * If the VMA is mlocked, @folio is added to the unevictable list. * Otherwise, it is treated the same way as folio_add_lru(). */ void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma) { VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED)) mlock_new_folio(folio); else folio_add_lru(folio); } /* * If the folio cannot be invalidated, it is moved to the * inactive list to speed up its reclaim. It is moved to the * head of the list, rather than the tail, to give the flusher * threads some time to write it out, as this is much more * effective than the single-page writeout from reclaim. * * If the folio isn't mapped and dirty/writeback, the folio * could be reclaimed asap using the reclaim flag. * * 1. active, mapped folio -> none * 2. active, dirty/writeback folio -> inactive, head, reclaim * 3. inactive, mapped folio -> none * 4. inactive, dirty/writeback folio -> inactive, head, reclaim * 5. inactive, clean -> inactive, tail * 6. Others -> none * * In 4, it moves to the head of the inactive list so the folio is * written out by flusher threads as this is much more efficient * than the single-page writeout from reclaim. */ static void lru_deactivate_file(struct lruvec *lruvec, struct folio *folio) { bool active = folio_test_active(folio) || lru_gen_enabled(); long nr_pages = folio_nr_pages(folio); if (folio_test_unevictable(folio)) return; /* Some processes are using the folio */ if (folio_mapped(folio)) return; lruvec_del_folio(lruvec, folio); folio_clear_active(folio); folio_clear_referenced(folio); if (folio_test_writeback(folio) || folio_test_dirty(folio)) { /* * Setting the reclaim flag could race with * folio_end_writeback() and confuse readahead. But the * race window is _really_ small and it's not a critical * problem. */ lruvec_add_folio(lruvec, folio); folio_set_reclaim(folio); } else { /* * The folio's writeback ended while it was in the batch. * We move that folio to the tail of the inactive list. */ lruvec_add_folio_tail(lruvec, folio); __count_vm_events(PGROTATED, nr_pages); } if (active) { __count_vm_events(PGDEACTIVATE, nr_pages); count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages); } } static void lru_deactivate(struct lruvec *lruvec, struct folio *folio) { long nr_pages = folio_nr_pages(folio); if (folio_test_unevictable(folio) || !(folio_test_active(folio) || lru_gen_enabled())) return; lruvec_del_folio(lruvec, folio); folio_clear_active(folio); folio_clear_referenced(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(PGDEACTIVATE, nr_pages); count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages); } static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio) { long nr_pages = folio_nr_pages(folio); if (!folio_test_anon(folio) || !folio_test_swapbacked(folio) || folio_test_swapcache(folio) || folio_test_unevictable(folio)) return; lruvec_del_folio(lruvec, folio); folio_clear_active(folio); if (lru_gen_enabled()) lru_gen_clear_refs(folio); else folio_clear_referenced(folio); /* * Lazyfree folios are clean anonymous folios. They have * the swapbacked flag cleared, to distinguish them from normal * anonymous folios */ folio_clear_swapbacked(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(PGLAZYFREE, nr_pages); count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages); } /* * Drain pages out of the cpu's folio_batch. * Either "cpu" is the current CPU, and preemption has already been * disabled; or "cpu" is being hot-unplugged, and is already dead. */ void lru_add_drain_cpu(int cpu) { struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu); struct folio_batch *fbatch = &fbatches->lru_add; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_add); fbatch = &fbatches->lru_move_tail; /* Disabling interrupts below acts as a compiler barrier. */ if (data_race(folio_batch_count(fbatch))) { unsigned long flags; /* No harm done if a racing interrupt already did this */ local_lock_irqsave(&cpu_fbatches.lock_irq, flags); folio_batch_move_lru(fbatch, lru_move_tail); local_unlock_irqrestore(&cpu_fbatches.lock_irq, flags); } fbatch = &fbatches->lru_deactivate_file; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_deactivate_file); fbatch = &fbatches->lru_deactivate; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_deactivate); fbatch = &fbatches->lru_lazyfree; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_lazyfree); folio_activate_drain(cpu); } /** * deactivate_file_folio() - Deactivate a file folio. * @folio: Folio to deactivate. * * This function hints to the VM that @folio is a good reclaim candidate, * for example if its invalidation fails due to the folio being dirty * or under writeback. * * Context: Caller holds a reference on the folio. */ void deactivate_file_folio(struct folio *folio) { /* Deactivating an unevictable folio will not accelerate reclaim */ if (folio_test_unevictable(folio) || !folio_test_lru(folio)) return; if (lru_gen_enabled() && lru_gen_clear_refs(folio)) return; folio_batch_add_and_move(folio, lru_deactivate_file); } /* * folio_deactivate - deactivate a folio * @folio: folio to deactivate * * folio_deactivate() moves @folio to the inactive list if @folio was on the * active list and was not unevictable. This is done to accelerate the * reclaim of @folio. */ void folio_deactivate(struct folio *folio) { if (folio_test_unevictable(folio) || !folio_test_lru(folio)) return; if (lru_gen_enabled() ? lru_gen_clear_refs(folio) : !folio_test_active(folio)) return; folio_batch_add_and_move(folio, lru_deactivate); } /** * folio_mark_lazyfree - make an anon folio lazyfree * @folio: folio to deactivate * * folio_mark_lazyfree() moves @folio to the inactive file list. * This is done to accelerate the reclaim of @folio. */ void folio_mark_lazyfree(struct folio *folio) { if (!folio_test_anon(folio) || !folio_test_swapbacked(folio) || !folio_test_lru(folio) || folio_test_swapcache(folio) || folio_test_unevictable(folio)) return; folio_batch_add_and_move(folio, lru_lazyfree); } void lru_add_drain(void) { local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); local_unlock(&cpu_fbatches.lock); mlock_drain_local(); } /* * It's called from per-cpu workqueue context in SMP case so * lru_add_drain_cpu and invalidate_bh_lrus_cpu should run on * the same cpu. It shouldn't be a problem in !SMP case since * the core is only one and the locks will disable preemption. */ static void lru_add_and_bh_lrus_drain(void) { local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); local_unlock(&cpu_fbatches.lock); invalidate_bh_lrus_cpu(); mlock_drain_local(); } void lru_add_drain_cpu_zone(struct zone *zone) { local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); drain_local_pages(zone); local_unlock(&cpu_fbatches.lock); mlock_drain_local(); } #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); static void lru_add_drain_per_cpu(struct work_struct *dummy) { lru_add_and_bh_lrus_drain(); } static bool cpu_needs_drain(unsigned int cpu) { struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu); /* Check these in order of likelihood that they're not zero */ return folio_batch_count(&fbatches->lru_add) || folio_batch_count(&fbatches->lru_move_tail) || folio_batch_count(&fbatches->lru_deactivate_file) || folio_batch_count(&fbatches->lru_deactivate) || folio_batch_count(&fbatches->lru_lazyfree) || folio_batch_count(&fbatches->lru_activate) || need_mlock_drain(cpu) || has_bh_in_lru(cpu, NULL); } /* * Doesn't need any cpu hotplug locking because we do rely on per-cpu * kworkers being shut down before our page_alloc_cpu_dead callback is * executed on the offlined cpu. * Calling this function with cpu hotplug locks held can actually lead * to obscure indirect dependencies via WQ context. */ static inline void __lru_add_drain_all(bool force_all_cpus) { /* * lru_drain_gen - Global pages generation number * * (A) Definition: global lru_drain_gen = x implies that all generations * 0 < n <= x are already *scheduled* for draining. * * This is an optimization for the highly-contended use case where a * user space workload keeps constantly generating a flow of pages for * each CPU. */ static unsigned int lru_drain_gen; static struct cpumask has_work; static DEFINE_MUTEX(lock); unsigned cpu, this_gen; /* * Make sure nobody triggers this path before mm_percpu_wq is fully * initialized. */ if (WARN_ON(!mm_percpu_wq)) return; /* * Guarantee folio_batch counter stores visible by this CPU * are visible to other CPUs before loading the current drain * generation. */ smp_mb(); /* * (B) Locally cache global LRU draining generation number * * The read barrier ensures that the counter is loaded before the mutex * is taken. It pairs with smp_mb() inside the mutex critical section * at (D). */ this_gen = smp_load_acquire(&lru_drain_gen); /* It helps everyone if we do our own local drain immediately. */ lru_add_drain(); mutex_lock(&lock); /* * (C) Exit the draining operation if a newer generation, from another * lru_add_drain_all(), was already scheduled for draining. Check (A). */ if (unlikely(this_gen != lru_drain_gen && !force_all_cpus)) goto done; /* * (D) Increment global generation number * * Pairs with smp_load_acquire() at (B), outside of the critical * section. Use a full memory barrier to guarantee that the * new global drain generation number is stored before loading * folio_batch counters. * * This pairing must be done here, before the for_each_online_cpu loop * below which drains the page vectors. * * Let x, y, and z represent some system CPU numbers, where x < y < z. * Assume CPU #z is in the middle of the for_each_online_cpu loop * below and has already reached CPU #y's per-cpu data. CPU #x comes * along, adds some pages to its per-cpu vectors, then calls * lru_add_drain_all(). * * If the paired barrier is done at any later step, e.g. after the * loop, CPU #x will just exit at (C) and miss flushing out all of its * added pages. */ WRITE_ONCE(lru_drain_gen, lru_drain_gen + 1); smp_mb(); cpumask_clear(&has_work); for_each_online_cpu(cpu) { struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); if (cpu_needs_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); queue_work_on(cpu, mm_percpu_wq, work); __cpumask_set_cpu(cpu, &has_work); } } for_each_cpu(cpu, &has_work) flush_work(&per_cpu(lru_add_drain_work, cpu)); done: mutex_unlock(&lock); } void lru_add_drain_all(void) { __lru_add_drain_all(false); } #else void lru_add_drain_all(void) { lru_add_drain(); } #endif /* CONFIG_SMP */ atomic_t lru_disable_count = ATOMIC_INIT(0); /* * lru_cache_disable() needs to be called before we start compiling * a list of folios to be migrated using folio_isolate_lru(). * It drains folios on LRU cache and then disable on all cpus until * lru_cache_enable is called. * * Must be paired with a call to lru_cache_enable(). */ void lru_cache_disable(void) { atomic_inc(&lru_disable_count); /* * Readers of lru_disable_count are protected by either disabling * preemption or rcu_read_lock: * * preempt_disable, local_irq_disable [bh_lru_lock()] * rcu_read_lock [rt_spin_lock CONFIG_PREEMPT_RT] * preempt_disable [local_lock !CONFIG_PREEMPT_RT] * * Since v5.1 kernel, synchronize_rcu() is guaranteed to wait on * preempt_disable() regions of code. So any CPU which sees * lru_disable_count = 0 will have exited the critical * section when synchronize_rcu() returns. */ synchronize_rcu_expedited(); #ifdef CONFIG_SMP __lru_add_drain_all(true); #else lru_add_and_bh_lrus_drain(); #endif } /** * folios_put_refs - Reduce the reference count on a batch of folios. * @folios: The folios. * @refs: The number of refs to subtract from each folio. * * Like folio_put(), but for a batch of folios. This is more efficient * than writing the loop yourself as it will optimise the locks which need * to be taken if the folios are freed. The folios batch is returned * empty and ready to be reused for another batch; there is no need * to reinitialise it. If @refs is NULL, we subtract one from each * folio refcount. * * Context: May be called in process or interrupt context, but not in NMI * context. May be called while holding a spinlock. */ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) { int i, j; struct lruvec *lruvec = NULL; unsigned long flags = 0; for (i = 0, j = 0; i < folios->nr; i++) { struct folio *folio = folios->folios[i]; unsigned int nr_refs = refs ? refs[i] : 1; if (is_huge_zero_folio(folio)) continue; if (folio_is_zone_device(folio)) { if (lruvec) { lruvec_unlock_irqrestore(lruvec, flags); lruvec = NULL; } if (folio_ref_sub_and_test(folio, nr_refs)) free_zone_device_folio(folio); continue; } if (!folio_ref_sub_and_test(folio, nr_refs)) continue; /* hugetlb has its own memcg */ if (folio_test_hugetlb(folio)) { if (lruvec) { lruvec_unlock_irqrestore(lruvec, flags); lruvec = NULL; } free_huge_folio(folio); continue; } folio_unqueue_deferred_split(folio); __page_cache_release(folio, &lruvec, &flags); if (j != i) folios->folios[j] = folio; j++; } if (lruvec) lruvec_unlock_irqrestore(lruvec, flags); if (!j) { folio_batch_reinit(folios); return; } folios->nr = j; mem_cgroup_uncharge_folios(folios); free_unref_folios(folios); } EXPORT_SYMBOL(folios_put_refs); /** * release_pages - batched put_page() * @arg: array of pages to release * @nr: number of pages * * Decrement the reference count on all the pages in @arg. If it * fell to zero, remove the page from the LRU and free it. * * Note that the argument can be an array of pages, encoded pages, * or folio pointers. We ignore any encoded bits, and turn any of * them into just a folio that gets free'd. */ void release_pages(release_pages_arg arg, int nr) { struct folio_batch fbatch; int refs[FOLIO_BATCH_SIZE]; struct encoded_page **encoded = arg.encoded_pages; int i; folio_batch_init(&fbatch); for (i = 0; i < nr; i++) { /* Turn any of the argument types into a folio */ struct folio *folio = page_folio(encoded_page_ptr(encoded[i])); /* Is our next entry actually "nr_pages" -> "nr_refs" ? */ refs[fbatch.nr] = 1; if (unlikely(encoded_page_flags(encoded[i]) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)) refs[fbatch.nr] = encoded_nr_pages(encoded[++i]); if (folio_batch_add(&fbatch, folio) > 0) continue; folios_put_refs(&fbatch, refs); } if (fbatch.nr) folios_put_refs(&fbatch, refs); } EXPORT_SYMBOL(release_pages); /* * The folios which we're about to release may be in the deferred lru-addition * queues. That would prevent them from really being freed right now. That's * OK from a correctness point of view but is inefficient - those folios may be * cache-warm and we want to give them back to the page allocator ASAP. * * So __folio_batch_release() will drain those queues here. * folio_batch_move_lru() calls folios_put() directly to avoid * mutual recursion. */ void __folio_batch_release(struct folio_batch *fbatch) { if (!fbatch->percpu_pvec_drained) { lru_add_drain(); fbatch->percpu_pvec_drained = true; } folios_put(fbatch); } EXPORT_SYMBOL(__folio_batch_release); /** * folio_batch_remove_exceptionals() - Prune non-folios from a batch. * @fbatch: The batch to prune * * find_get_entries() fills a batch with both folios and shadow/swap/DAX * entries. This function prunes all the non-folio entries from @fbatch * without leaving holes, so that it can be passed on to folio-only batch * operations. */ void folio_batch_remove_exceptionals(struct folio_batch *fbatch) { unsigned int i, j; for (i = 0, j = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; if (!xa_is_value(folio)) fbatch->folios[j++] = folio; } fbatch->nr = j; } #ifdef CONFIG_MEMCG static void lruvec_reparent_lru(struct lruvec *child_lruvec, struct lruvec *parent_lruvec, enum lru_list lru, int nid) { int zid; struct zone *zone; if (lru != LRU_UNEVICTABLE) list_splice_tail_init(&child_lruvec->lists[lru], &parent_lruvec->lists[lru]); for_each_managed_zone_pgdat(zone, NODE_DATA(nid), zid, MAX_NR_ZONES - 1) { unsigned long size = mem_cgroup_get_zone_lru_size(child_lruvec, lru, zid); mem_cgroup_update_lru_size(parent_lruvec, lru, zid, size); } } void lru_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid) { enum lru_list lru; struct lruvec *child_lruvec, *parent_lruvec; child_lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); parent_lruvec = mem_cgroup_lruvec(parent, NODE_DATA(nid)); parent_lruvec->anon_cost += child_lruvec->anon_cost; parent_lruvec->file_cost += child_lruvec->file_cost; for_each_lru(lru) lruvec_reparent_lru(child_lruvec, parent_lruvec, lru, nid); } #endif static const struct ctl_table swap_sysctl_table[] = { { .procname = "page-cluster", .data = &page_cluster, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = (void *)&page_cluster_max, } }; /* * Perform any setup for the swap system */ void __init swap_setup(void) { unsigned long megs = PAGES_TO_MB(totalram_pages()); /* Use a smaller cluster for small-memory machines */ if (megs < 16) page_cluster = 2; else page_cluster = 3; /* * Right now other parts of the system means that we * _really_ don't want to cluster much more */ register_sysctl_init("vm", swap_sysctl_table); } |
| 134 54 54 11 11 11 11 11 11 11 6 11 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2025 Oracle. All Rights Reserved. * Author: Darrick J. Wong <djwong@kernel.org> */ #include <linux/fs.h> #include <linux/fsnotify.h> #include <linux/mempool.h> #include <linux/fserror.h> #define FSERROR_DEFAULT_EVENT_POOL_SIZE (32) static struct mempool fserror_events_pool; void fserror_mount(struct super_block *sb) { /* * The pending error counter is biased by 1 so that we don't wake_var * until we're actually trying to unmount. */ refcount_set(&sb->s_pending_errors, 1); } void fserror_unmount(struct super_block *sb) { /* * If we don't drop the pending error count to zero, then wait for it * to drop below 1, which means that the pending errors cleared and * hopefully we didn't saturate with 1 billion+ concurrent events. */ if (!refcount_dec_and_test(&sb->s_pending_errors)) wait_var_event(&sb->s_pending_errors, refcount_read(&sb->s_pending_errors) < 1); } static inline void fserror_pending_dec(struct super_block *sb) { if (refcount_dec_and_test(&sb->s_pending_errors)) wake_up_var(&sb->s_pending_errors); } static inline void fserror_free_event(struct fserror_event *event) { fserror_pending_dec(event->sb); mempool_free(event, &fserror_events_pool); } static void fserror_worker(struct work_struct *work) { struct fserror_event *event = container_of(work, struct fserror_event, work); struct super_block *sb = event->sb; if (sb->s_flags & SB_ACTIVE) { struct fs_error_report report = { /* send positive error number to userspace */ .error = -event->error, .inode = event->inode, .sb = event->sb, }; if (sb->s_op->report_error) sb->s_op->report_error(event); fsnotify(FS_ERROR, &report, FSNOTIFY_EVENT_ERROR, NULL, NULL, NULL, 0); } iput(event->inode); fserror_free_event(event); } static inline struct fserror_event *fserror_alloc_event(struct super_block *sb, gfp_t gfp_flags) { struct fserror_event *event = NULL; /* * If pending_errors already reached zero or is no longer active, * the superblock is being deactivated so there's no point in * continuing. * * The order of the check of s_pending_errors and SB_ACTIVE are * mandated by order of accesses in generic_shutdown_super and * fserror_unmount. Barriers are implicitly provided by the refcount * manipulations in this function and fserror_unmount. */ if (!refcount_inc_not_zero(&sb->s_pending_errors)) return NULL; if (!(sb->s_flags & SB_ACTIVE)) goto out_pending; event = mempool_alloc(&fserror_events_pool, gfp_flags); if (!event) goto out_pending; /* mempool_alloc doesn't support GFP_ZERO */ memset(event, 0, sizeof(*event)); event->sb = sb; INIT_WORK(&event->work, fserror_worker); return event; out_pending: fserror_pending_dec(sb); return NULL; } /** * fserror_report - report a filesystem error of some kind * * @sb: superblock of the filesystem * @inode: inode within that filesystem, if applicable * @type: type of error encountered * @pos: start of inode range affected, if applicable * @len: length of inode range affected, if applicable * @error: error number encountered, must be negative * @gfp: memory allocation flags for conveying the event to a worker, * since this function can be called from atomic contexts * * Report details of a filesystem error to the super_operations::report_error * callback if present; and to fsnotify for distribution to userspace. @sb, * @gfp, @type, and @error must all be specified. For file I/O errors, the * @inode, @pos, and @len fields must also be specified. For file metadata * errors, @inode must be specified. If @inode is not NULL, then @inode->i_sb * must point to @sb. * * Reporting work is deferred to a workqueue to ensure that ->report_error is * called from process context without any locks held. An active reference to * the inode is maintained until event handling is complete, and unmount will * wait for queued events to drain. */ void fserror_report(struct super_block *sb, struct inode *inode, enum fserror_type type, loff_t pos, u64 len, int error, gfp_t gfp) { struct fserror_event *event; /* sb and inode must be from the same filesystem */ WARN_ON_ONCE(inode && inode->i_sb != sb); /* error number must be negative */ WARN_ON_ONCE(error >= 0); event = fserror_alloc_event(sb, gfp); if (!event) goto lost; event->type = type; event->pos = pos; event->len = len; event->error = error; /* * Can't iput from non-sleeping context, so grabbing another reference * to the inode must be the last thing before submitting the event. */ if (inode) { event->inode = igrab(inode); if (!event->inode) goto lost_event; } /* * Use schedule_work here even if we're already in process context so * that fsnotify and super_operations::report_error implementations are * guaranteed to run in process context without any locks held. Since * errors are supposed to be rare, the overhead shouldn't kill us any * more than the failing device will. */ schedule_work(&event->work); return; lost_event: fserror_free_event(event); lost: if (inode) pr_err_ratelimited( "%s: lost file I/O error report for ino %llu type %u pos 0x%llx len 0x%llx error %d", sb->s_id, inode->i_ino, type, pos, len, error); else pr_err_ratelimited( "%s: lost filesystem error report for type %u error %d", sb->s_id, type, error); } EXPORT_SYMBOL_GPL(fserror_report); static int __init fserror_init(void) { return mempool_init_kmalloc_pool(&fserror_events_pool, FSERROR_DEFAULT_EVENT_POOL_SIZE, sizeof(struct fserror_event)); } fs_initcall(fserror_init); |
| 176 274 1835 1969 40 43 49 54 291 303 318 331 1940 1951 1964 1970 175 634 44 103 54 11 28 55 47 1 1 1 1 665 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 | #ifndef _LINUX_JHASH_H #define _LINUX_JHASH_H /* jhash.h: Jenkins hash support. * * Copyright (C) 2006. Bob Jenkins (bob_jenkins@burtleburtle.net) * * https://burtleburtle.net/bob/hash/ * * These are the credits from Bob's sources: * * lookup3.c, by Bob Jenkins, May 2006, Public Domain. * * These are functions for producing 32-bit hashes for hash table lookup. * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final() * are externally useful functions. Routines to test the hash are included * if SELF_TEST is defined. You can use this free for any purpose. It's in * the public domain. It has no warranty. * * Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@netfilter.org) * * I've modified Bob's hash to be useful in the Linux kernel, and * any bugs present are my fault. * Jozsef */ #include <linux/bitops.h> #include <linux/unaligned.h> /* Best hash sizes are of power of two */ #define jhash_size(n) ((u32)1<<(n)) /* Mask the hash value, i.e (value & jhash_mask(n)) instead of (value % n) */ #define jhash_mask(n) (jhash_size(n)-1) /* __jhash_mix - mix 3 32-bit values reversibly. */ #define __jhash_mix(a, b, c) \ { \ a -= c; a ^= rol32(c, 4); c += b; \ b -= a; b ^= rol32(a, 6); a += c; \ c -= b; c ^= rol32(b, 8); b += a; \ a -= c; a ^= rol32(c, 16); c += b; \ b -= a; b ^= rol32(a, 19); a += c; \ c -= b; c ^= rol32(b, 4); b += a; \ } /* __jhash_final - final mixing of 3 32-bit values (a,b,c) into c */ #define __jhash_final(a, b, c) \ { \ c ^= b; c -= rol32(b, 14); \ a ^= c; a -= rol32(c, 11); \ b ^= a; b -= rol32(a, 25); \ c ^= b; c -= rol32(b, 16); \ a ^= c; a -= rol32(c, 4); \ b ^= a; b -= rol32(a, 14); \ c ^= b; c -= rol32(b, 24); \ } /* An arbitrary initial parameter */ #define JHASH_INITVAL 0xdeadbeef /* jhash - hash an arbitrary key * @k: sequence of bytes as key * @length: the length of the key * @initval: the previous hash, or an arbitrary value * * The generic version, hashes an arbitrary sequence of bytes. * No alignment or length assumptions are made about the input key. * * Returns the hash value of the key. The result depends on endianness. */ static inline u32 jhash(const void *key, u32 length, u32 initval) { u32 a, b, c; const u8 *k = key; /* Set up the internal state */ a = b = c = JHASH_INITVAL + length + initval; /* All but the last block: affect some 32 bits of (a,b,c) */ while (length > 12) { a += get_unaligned((u32 *)k); b += get_unaligned((u32 *)(k + 4)); c += get_unaligned((u32 *)(k + 8)); __jhash_mix(a, b, c); length -= 12; k += 12; } /* Last block: affect all 32 bits of (c) */ switch (length) { case 12: c += (u32)k[11]<<24; fallthrough; case 11: c += (u32)k[10]<<16; fallthrough; case 10: c += (u32)k[9]<<8; fallthrough; case 9: c += k[8]; fallthrough; case 8: b += (u32)k[7]<<24; fallthrough; case 7: b += (u32)k[6]<<16; fallthrough; case 6: b += (u32)k[5]<<8; fallthrough; case 5: b += k[4]; fallthrough; case 4: a += (u32)k[3]<<24; fallthrough; case 3: a += (u32)k[2]<<16; fallthrough; case 2: a += (u32)k[1]<<8; fallthrough; case 1: a += k[0]; __jhash_final(a, b, c); break; case 0: /* Nothing left to add */ break; } return c; } /* jhash2 - hash an array of u32's * @k: the key which must be an array of u32's * @length: the number of u32's in the key * @initval: the previous hash, or an arbitrary value * * Returns the hash value of the key. */ static inline u32 jhash2(const u32 *k, u32 length, u32 initval) { u32 a, b, c; /* Set up the internal state */ a = b = c = JHASH_INITVAL + (length<<2) + initval; /* Handle most of the key */ while (length > 3) { a += k[0]; b += k[1]; c += k[2]; __jhash_mix(a, b, c); length -= 3; k += 3; } /* Handle the last 3 u32's */ switch (length) { case 3: c += k[2]; fallthrough; case 2: b += k[1]; fallthrough; case 1: a += k[0]; __jhash_final(a, b, c); break; case 0: /* Nothing left to add */ break; } return c; } /* __jhash_nwords - hash exactly 3, 2 or 1 word(s) */ static inline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval) { a += initval; b += initval; c += initval; __jhash_final(a, b, c); return c; } static inline u32 jhash_3words(u32 a, u32 b, u32 c, u32 initval) { return __jhash_nwords(a, b, c, initval + JHASH_INITVAL + (3 << 2)); } static inline u32 jhash_2words(u32 a, u32 b, u32 initval) { return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2)); } static inline u32 jhash_1word(u32 a, u32 initval) { return __jhash_nwords(a, 0, 0, initval + JHASH_INITVAL + (1 << 2)); } #endif /* _LINUX_JHASH_H */ |
| 6 1 5 6 5 1 6 10 9 10 8 4 3 3 2 3 3 2 2 2 3 2 2 1 1 2 3 3 2 2 1 1 3 2 1 1 1 1 1 1 1 1 4 3 2 1 1 4 438 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 | // SPDX-License-Identifier: GPL-2.0-or-later /* GTP according to GSM TS 09.60 / 3GPP TS 29.060 * * (C) 2012-2014 by sysmocom - s.f.m.c. GmbH * (C) 2016 by Pablo Neira Ayuso <pablo@netfilter.org> * * Author: Harald Welte <hwelte@sysmocom.de> * Pablo Neira Ayuso <pablo@netfilter.org> * Andreas Schultz <aschultz@travelping.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/udp.h> #include <linux/rculist.h> #include <linux/jhash.h> #include <linux/if_tunnel.h> #include <linux/net.h> #include <linux/file.h> #include <linux/gtp.h> #include <net/flow.h> #include <net/inet_dscp.h> #include <net/net_namespace.h> #include <net/protocol.h> #include <net/inet_sock.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/udp.h> #include <net/udp_tunnel.h> #include <net/icmp.h> #include <net/xfrm.h> #include <net/genetlink.h> #include <net/netns/generic.h> #include <net/gtp.h> /* An active session for the subscriber. */ struct pdp_ctx { struct hlist_node hlist_tid; struct hlist_node hlist_addr; union { struct { u64 tid; u16 flow; } v0; struct { u32 i_tei; u32 o_tei; } v1; } u; u8 gtp_version; u16 af; union { struct in_addr addr; struct in6_addr addr6; } ms; union { struct in_addr addr; struct in6_addr addr6; } peer; struct sock *sk; struct net_device *dev; atomic_t tx_seq; struct rcu_head rcu_head; }; /* One instance of the GTP device. */ struct gtp_dev { struct list_head list; struct sock *sk0; struct sock *sk1u; u8 sk_created; struct net_device *dev; struct net *net; unsigned int role; unsigned int hash_size; struct hlist_head *tid_hash; struct hlist_head *addr_hash; u8 restart_count; }; struct echo_info { u16 af; u8 gtp_version; union { struct in_addr addr; } ms; union { struct in_addr addr; } peer; }; static unsigned int gtp_net_id __read_mostly; struct gtp_net { struct list_head gtp_dev_list; }; static u32 gtp_h_initval; static struct genl_family gtp_genl_family; enum gtp_multicast_groups { GTP_GENL_MCGRP, }; static const struct genl_multicast_group gtp_genl_mcgrps[] = { [GTP_GENL_MCGRP] = { .name = GTP_GENL_MCGRP_NAME }, }; static void pdp_context_delete(struct pdp_ctx *pctx); static inline u32 gtp0_hashfn(u64 tid) { u32 *tid32 = (u32 *) &tid; return jhash_2words(tid32[0], tid32[1], gtp_h_initval); } static inline u32 gtp1u_hashfn(u32 tid) { return jhash_1word(tid, gtp_h_initval); } static inline u32 ipv4_hashfn(__be32 ip) { return jhash_1word((__force u32)ip, gtp_h_initval); } static u32 ipv6_hashfn(const struct in6_addr *ip6) { return jhash_2words((__force u32)ip6->s6_addr32[0], (__force u32)ip6->s6_addr32[1], gtp_h_initval); } /* Resolve a PDP context structure based on the 64bit TID. */ static struct pdp_ctx *gtp0_pdp_find(struct gtp_dev *gtp, u64 tid, u16 family) { struct hlist_head *head; struct pdp_ctx *pdp; head = >p->tid_hash[gtp0_hashfn(tid) % gtp->hash_size]; hlist_for_each_entry_rcu(pdp, head, hlist_tid) { if (pdp->af == family && pdp->gtp_version == GTP_V0 && pdp->u.v0.tid == tid) return pdp; } return NULL; } /* Resolve a PDP context structure based on the 32bit TEI. */ static struct pdp_ctx *gtp1_pdp_find(struct gtp_dev *gtp, u32 tid, u16 family) { struct hlist_head *head; struct pdp_ctx *pdp; head = >p->tid_hash[gtp1u_hashfn(tid) % gtp->hash_size]; hlist_for_each_entry_rcu(pdp, head, hlist_tid) { if (pdp->af == family && pdp->gtp_version == GTP_V1 && pdp->u.v1.i_tei == tid) return pdp; } return NULL; } /* Resolve a PDP context based on IPv4 address of MS. */ static struct pdp_ctx *ipv4_pdp_find(struct gtp_dev *gtp, __be32 ms_addr) { struct hlist_head *head; struct pdp_ctx *pdp; head = >p->addr_hash[ipv4_hashfn(ms_addr) % gtp->hash_size]; hlist_for_each_entry_rcu(pdp, head, hlist_addr) { if (pdp->af == AF_INET && pdp->ms.addr.s_addr == ms_addr) return pdp; } return NULL; } /* 3GPP TS 29.060: PDN Connection: the association between a MS represented by * [...] one IPv6 *prefix* and a PDN represented by an APN. * * Then, 3GPP TS 29.061, Section 11.2.1.3 says: The size of the prefix shall be * according to the maximum prefix length for a global IPv6 address as * specified in the IPv6 Addressing Architecture, see RFC 4291. * * Finally, RFC 4291 section 2.5.4 states: All Global Unicast addresses other * than those that start with binary 000 have a 64-bit interface ID field * (i.e., n + m = 64). */ static bool ipv6_pdp_addr_equal(const struct in6_addr *a, const struct in6_addr *b) { return a->s6_addr32[0] == b->s6_addr32[0] && a->s6_addr32[1] == b->s6_addr32[1]; } static struct pdp_ctx *ipv6_pdp_find(struct gtp_dev *gtp, const struct in6_addr *ms_addr) { struct hlist_head *head; struct pdp_ctx *pdp; head = >p->addr_hash[ipv6_hashfn(ms_addr) % gtp->hash_size]; hlist_for_each_entry_rcu(pdp, head, hlist_addr) { if (pdp->af == AF_INET6 && ipv6_pdp_addr_equal(&pdp->ms.addr6, ms_addr)) return pdp; } return NULL; } static bool gtp_check_ms_ipv4(struct sk_buff *skb, struct pdp_ctx *pctx, unsigned int hdrlen, unsigned int role) { struct iphdr *iph; if (!pskb_may_pull(skb, hdrlen + sizeof(struct iphdr))) return false; iph = (struct iphdr *)(skb->data + hdrlen); if (role == GTP_ROLE_SGSN) return iph->daddr == pctx->ms.addr.s_addr; else return iph->saddr == pctx->ms.addr.s_addr; } static bool gtp_check_ms_ipv6(struct sk_buff *skb, struct pdp_ctx *pctx, unsigned int hdrlen, unsigned int role) { struct ipv6hdr *ip6h; int ret; if (!pskb_may_pull(skb, hdrlen + sizeof(struct ipv6hdr))) return false; ip6h = (struct ipv6hdr *)(skb->data + hdrlen); if ((ipv6_addr_type(&ip6h->saddr) & IPV6_ADDR_LINKLOCAL) || (ipv6_addr_type(&ip6h->daddr) & IPV6_ADDR_LINKLOCAL)) return false; if (role == GTP_ROLE_SGSN) { ret = ipv6_pdp_addr_equal(&ip6h->daddr, &pctx->ms.addr6); } else { ret = ipv6_pdp_addr_equal(&ip6h->saddr, &pctx->ms.addr6); } return ret; } /* Check if the inner IP address in this packet is assigned to any * existing mobile subscriber. */ static bool gtp_check_ms(struct sk_buff *skb, struct pdp_ctx *pctx, unsigned int hdrlen, unsigned int role, __u16 inner_proto) { switch (inner_proto) { case ETH_P_IP: return gtp_check_ms_ipv4(skb, pctx, hdrlen, role); case ETH_P_IPV6: return gtp_check_ms_ipv6(skb, pctx, hdrlen, role); } return false; } static int gtp_inner_proto(struct sk_buff *skb, unsigned int hdrlen, __u16 *inner_proto) { __u8 *ip_version, _ip_version; ip_version = skb_header_pointer(skb, hdrlen, sizeof(*ip_version), &_ip_version); if (!ip_version) return -1; switch (*ip_version & 0xf0) { case 0x40: *inner_proto = ETH_P_IP; break; case 0x60: *inner_proto = ETH_P_IPV6; break; default: return -1; } return 0; } static int gtp_rx(struct pdp_ctx *pctx, struct sk_buff *skb, unsigned int hdrlen, unsigned int role, __u16 inner_proto) { if (!gtp_check_ms(skb, pctx, hdrlen, role, inner_proto)) { netdev_dbg(pctx->dev, "No PDP ctx for this MS\n"); return 1; } /* Get rid of the GTP + UDP headers. */ if (iptunnel_pull_header(skb, hdrlen, htons(inner_proto), !net_eq(sock_net(pctx->sk), dev_net(pctx->dev)))) { pctx->dev->stats.rx_length_errors++; goto err; } netdev_dbg(pctx->dev, "forwarding packet from GGSN to uplink\n"); /* Now that the UDP and the GTP header have been removed, set up the * new network header. This is required by the upper layer to * calculate the transport header. */ skb_reset_network_header(skb); skb_reset_mac_header(skb); skb->dev = pctx->dev; dev_sw_netstats_rx_add(pctx->dev, skb->len); __netif_rx(skb); return 0; err: pctx->dev->stats.rx_dropped++; return -1; } static struct rtable *ip4_route_output_gtp(struct flowi4 *fl4, const struct sock *sk, __be32 daddr, __be32 saddr) { memset(fl4, 0, sizeof(*fl4)); fl4->flowi4_oif = sk->sk_bound_dev_if; fl4->daddr = daddr; fl4->saddr = saddr; fl4->flowi4_dscp = inet_sk_dscp(inet_sk(sk)); fl4->flowi4_scope = ip_sock_rt_scope(sk); fl4->flowi4_proto = sk->sk_protocol; return ip_route_output_key(sock_net(sk), fl4); } static struct rt6_info *ip6_route_output_gtp(struct net *net, struct flowi6 *fl6, const struct sock *sk, const struct in6_addr *daddr, struct in6_addr *saddr) { struct dst_entry *dst; memset(fl6, 0, sizeof(*fl6)); fl6->flowi6_oif = sk->sk_bound_dev_if; fl6->daddr = *daddr; fl6->saddr = *saddr; fl6->flowi6_proto = sk->sk_protocol; dst = ip6_dst_lookup_flow(net, sk, fl6, NULL); if (IS_ERR(dst)) return ERR_PTR(-ENETUNREACH); return (struct rt6_info *)dst; } /* GSM TS 09.60. 7.3 * In all Path Management messages: * - TID: is not used and shall be set to 0. * - Flow Label is not used and shall be set to 0 * In signalling messages: * - number: this field is not yet used in signalling messages. * It shall be set to 255 by the sender and shall be ignored * by the receiver * Returns true if the echo req was correct, false otherwise. */ static bool gtp0_validate_echo_hdr(struct gtp0_header *gtp0) { return !(gtp0->tid || (gtp0->flags ^ 0x1e) || gtp0->number != 0xff || gtp0->flow); } /* msg_type has to be GTP_ECHO_REQ or GTP_ECHO_RSP */ static void gtp0_build_echo_msg(struct gtp0_header *hdr, __u8 msg_type) { int len_pkt, len_hdr; hdr->flags = 0x1e; /* v0, GTP-non-prime. */ hdr->type = msg_type; /* GSM TS 09.60. 7.3 In all Path Management Flow Label and TID * are not used and shall be set to 0. */ hdr->flow = 0; hdr->tid = 0; hdr->number = 0xff; hdr->spare[0] = 0xff; hdr->spare[1] = 0xff; hdr->spare[2] = 0xff; len_pkt = sizeof(struct gtp0_packet); len_hdr = sizeof(struct gtp0_header); if (msg_type == GTP_ECHO_RSP) hdr->length = htons(len_pkt - len_hdr); else hdr->length = 0; } static int gtp0_send_echo_resp_ip(struct gtp_dev *gtp, struct sk_buff *skb) { struct iphdr *iph = ip_hdr(skb); struct flowi4 fl4; struct rtable *rt; /* find route to the sender, * src address becomes dst address and vice versa. */ rt = ip4_route_output_gtp(&fl4, gtp->sk0, iph->saddr, iph->daddr); if (IS_ERR(rt)) { netdev_dbg(gtp->dev, "no route for echo response from %pI4\n", &iph->saddr); return -1; } udp_tunnel_xmit_skb(rt, gtp->sk0, skb, fl4.saddr, fl4.daddr, iph->tos, ip4_dst_hoplimit(&rt->dst), 0, htons(GTP0_PORT), htons(GTP0_PORT), !net_eq(sock_net(gtp->sk1u), dev_net(gtp->dev)), false, 0); return 0; } static int gtp0_send_echo_resp(struct gtp_dev *gtp, struct sk_buff *skb) { struct gtp0_packet *gtp_pkt; struct gtp0_header *gtp0; __be16 seq; gtp0 = (struct gtp0_header *)(skb->data + sizeof(struct udphdr)); if (!gtp0_validate_echo_hdr(gtp0)) return -1; seq = gtp0->seq; /* pull GTP and UDP headers */ skb_pull_data(skb, sizeof(struct gtp0_header) + sizeof(struct udphdr)); gtp_pkt = skb_push(skb, sizeof(struct gtp0_packet)); memset(gtp_pkt, 0, sizeof(struct gtp0_packet)); gtp0_build_echo_msg(>p_pkt->gtp0_h, GTP_ECHO_RSP); /* GSM TS 09.60. 7.3 The Sequence Number in a signalling response * message shall be copied from the signalling request message * that the GSN is replying to. */ gtp_pkt->gtp0_h.seq = seq; gtp_pkt->ie.tag = GTPIE_RECOVERY; gtp_pkt->ie.val = gtp->restart_count; switch (gtp->sk0->sk_family) { case AF_INET: if (gtp0_send_echo_resp_ip(gtp, skb) < 0) return -1; break; case AF_INET6: return -1; } return 0; } static int gtp_genl_fill_echo(struct sk_buff *skb, u32 snd_portid, u32 snd_seq, int flags, u32 type, struct echo_info echo) { void *genlh; genlh = genlmsg_put(skb, snd_portid, snd_seq, >p_genl_family, flags, type); if (!genlh) goto failure; if (nla_put_u32(skb, GTPA_VERSION, echo.gtp_version) || nla_put_be32(skb, GTPA_PEER_ADDRESS, echo.peer.addr.s_addr) || nla_put_be32(skb, GTPA_MS_ADDRESS, echo.ms.addr.s_addr)) goto failure; genlmsg_end(skb, genlh); return 0; failure: genlmsg_cancel(skb, genlh); return -EMSGSIZE; } static void gtp0_handle_echo_resp_ip(struct sk_buff *skb, struct echo_info *echo) { struct iphdr *iph = ip_hdr(skb); echo->ms.addr.s_addr = iph->daddr; echo->peer.addr.s_addr = iph->saddr; echo->gtp_version = GTP_V0; } static int gtp0_handle_echo_resp(struct gtp_dev *gtp, struct sk_buff *skb) { struct gtp0_header *gtp0; struct echo_info echo; struct sk_buff *msg; int ret; gtp0 = (struct gtp0_header *)(skb->data + sizeof(struct udphdr)); if (!gtp0_validate_echo_hdr(gtp0)) return -1; switch (gtp->sk0->sk_family) { case AF_INET: gtp0_handle_echo_resp_ip(skb, &echo); break; case AF_INET6: return -1; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!msg) return -ENOMEM; ret = gtp_genl_fill_echo(msg, 0, 0, 0, GTP_CMD_ECHOREQ, echo); if (ret < 0) { nlmsg_free(msg); return ret; } return genlmsg_multicast_netns(>p_genl_family, dev_net(gtp->dev), msg, 0, GTP_GENL_MCGRP, GFP_ATOMIC); } static int gtp_proto_to_family(__u16 proto) { switch (proto) { case ETH_P_IP: return AF_INET; case ETH_P_IPV6: return AF_INET6; default: WARN_ON_ONCE(1); break; } return AF_UNSPEC; } /* 1 means pass up to the stack, -1 means drop and 0 means decapsulated. */ static int gtp0_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb) { unsigned int hdrlen = sizeof(struct udphdr) + sizeof(struct gtp0_header); struct gtp0_header *gtp0; struct pdp_ctx *pctx; __u16 inner_proto; if (!pskb_may_pull(skb, hdrlen)) return -1; gtp0 = (struct gtp0_header *)(skb->data + sizeof(struct udphdr)); if ((gtp0->flags >> 5) != GTP_V0) return 1; /* If the sockets were created in kernel, it means that * there is no daemon running in userspace which would * handle echo request. */ if (gtp0->type == GTP_ECHO_REQ && gtp->sk_created) return gtp0_send_echo_resp(gtp, skb); if (gtp0->type == GTP_ECHO_RSP && gtp->sk_created) return gtp0_handle_echo_resp(gtp, skb); if (gtp0->type != GTP_TPDU) return 1; if (gtp_inner_proto(skb, hdrlen, &inner_proto) < 0) { netdev_dbg(gtp->dev, "GTP packet does not encapsulate an IP packet\n"); return -1; } pctx = gtp0_pdp_find(gtp, be64_to_cpu(gtp0->tid), gtp_proto_to_family(inner_proto)); if (!pctx) { netdev_dbg(gtp->dev, "No PDP ctx to decap skb=%p\n", skb); return 1; } return gtp_rx(pctx, skb, hdrlen, gtp->role, inner_proto); } /* msg_type has to be GTP_ECHO_REQ or GTP_ECHO_RSP */ static void gtp1u_build_echo_msg(struct gtp1_header_long *hdr, __u8 msg_type) { int len_pkt, len_hdr; /* S flag must be set to 1 */ hdr->flags = 0x32; /* v1, GTP-non-prime. */ hdr->type = msg_type; /* 3GPP TS 29.281 5.1 - TEID has to be set to 0 */ hdr->tid = 0; /* seq, npdu and next should be counted to the length of the GTP packet * that's why size of gtp1_header should be subtracted, * not size of gtp1_header_long. */ len_hdr = sizeof(struct gtp1_header); if (msg_type == GTP_ECHO_RSP) { len_pkt = sizeof(struct gtp1u_packet); hdr->length = htons(len_pkt - len_hdr); } else { /* GTP_ECHO_REQ does not carry GTP Information Element, * the why gtp1_header_long is used here. */ len_pkt = sizeof(struct gtp1_header_long); hdr->length = htons(len_pkt - len_hdr); } } static int gtp1u_send_echo_resp(struct gtp_dev *gtp, struct sk_buff *skb) { struct gtp1_header_long *gtp1u; struct gtp1u_packet *gtp_pkt; struct rtable *rt; struct flowi4 fl4; struct iphdr *iph; gtp1u = (struct gtp1_header_long *)(skb->data + sizeof(struct udphdr)); /* 3GPP TS 29.281 5.1 - For the Echo Request, Echo Response, * Error Indication and Supported Extension Headers Notification * messages, the S flag shall be set to 1 and TEID shall be set to 0. */ if (!(gtp1u->flags & GTP1_F_SEQ) || gtp1u->tid) return -1; /* pull GTP and UDP headers */ skb_pull_data(skb, sizeof(struct gtp1_header_long) + sizeof(struct udphdr)); gtp_pkt = skb_push(skb, sizeof(struct gtp1u_packet)); memset(gtp_pkt, 0, sizeof(struct gtp1u_packet)); gtp1u_build_echo_msg(>p_pkt->gtp1u_h, GTP_ECHO_RSP); /* 3GPP TS 29.281 7.7.2 - The Restart Counter value in the * Recovery information element shall not be used, i.e. it shall * be set to zero by the sender and shall be ignored by the receiver. * The Recovery information element is mandatory due to backwards * compatibility reasons. */ gtp_pkt->ie.tag = GTPIE_RECOVERY; gtp_pkt->ie.val = 0; iph = ip_hdr(skb); /* find route to the sender, * src address becomes dst address and vice versa. */ rt = ip4_route_output_gtp(&fl4, gtp->sk1u, iph->saddr, iph->daddr); if (IS_ERR(rt)) { netdev_dbg(gtp->dev, "no route for echo response from %pI4\n", &iph->saddr); return -1; } udp_tunnel_xmit_skb(rt, gtp->sk1u, skb, fl4.saddr, fl4.daddr, iph->tos, ip4_dst_hoplimit(&rt->dst), 0, htons(GTP1U_PORT), htons(GTP1U_PORT), !net_eq(sock_net(gtp->sk1u), dev_net(gtp->dev)), false, 0); return 0; } static int gtp1u_handle_echo_resp(struct gtp_dev *gtp, struct sk_buff *skb) { struct gtp1_header_long *gtp1u; struct echo_info echo; struct sk_buff *msg; struct iphdr *iph; int ret; gtp1u = (struct gtp1_header_long *)(skb->data + sizeof(struct udphdr)); /* 3GPP TS 29.281 5.1 - For the Echo Request, Echo Response, * Error Indication and Supported Extension Headers Notification * messages, the S flag shall be set to 1 and TEID shall be set to 0. */ if (!(gtp1u->flags & GTP1_F_SEQ) || gtp1u->tid) return -1; iph = ip_hdr(skb); echo.ms.addr.s_addr = iph->daddr; echo.peer.addr.s_addr = iph->saddr; echo.gtp_version = GTP_V1; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!msg) return -ENOMEM; ret = gtp_genl_fill_echo(msg, 0, 0, 0, GTP_CMD_ECHOREQ, echo); if (ret < 0) { nlmsg_free(msg); return ret; } return genlmsg_multicast_netns(>p_genl_family, dev_net(gtp->dev), msg, 0, GTP_GENL_MCGRP, GFP_ATOMIC); } static int gtp_parse_exthdrs(struct sk_buff *skb, unsigned int *hdrlen) { struct gtp_ext_hdr *gtp_exthdr, _gtp_exthdr; unsigned int offset = *hdrlen; __u8 *next_type, _next_type; /* From 29.060: "The Extension Header Length field specifies the length * of the particular Extension header in 4 octets units." * * This length field includes length field size itself (1 byte), * payload (variable length) and next type (1 byte). The extension * header is aligned to to 4 bytes. */ do { gtp_exthdr = skb_header_pointer(skb, offset, sizeof(*gtp_exthdr), &_gtp_exthdr); if (!gtp_exthdr || !gtp_exthdr->len) return -1; offset += gtp_exthdr->len * 4; /* From 29.060: "If no such Header follows, then the value of * the Next Extension Header Type shall be 0." */ next_type = skb_header_pointer(skb, offset - 1, sizeof(_next_type), &_next_type); if (!next_type) return -1; } while (*next_type != 0); *hdrlen = offset; return 0; } static int gtp1u_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb) { unsigned int hdrlen = sizeof(struct udphdr) + sizeof(struct gtp1_header); struct gtp1_header *gtp1; struct pdp_ctx *pctx; __u16 inner_proto; if (!pskb_may_pull(skb, hdrlen)) return -1; gtp1 = (struct gtp1_header *)(skb->data + sizeof(struct udphdr)); if ((gtp1->flags >> 5) != GTP_V1) return 1; /* If the sockets were created in kernel, it means that * there is no daemon running in userspace which would * handle echo request. */ if (gtp1->type == GTP_ECHO_REQ && gtp->sk_created) return gtp1u_send_echo_resp(gtp, skb); if (gtp1->type == GTP_ECHO_RSP && gtp->sk_created) return gtp1u_handle_echo_resp(gtp, skb); if (gtp1->type != GTP_TPDU) return 1; /* From 29.060: "This field shall be present if and only if any one or * more of the S, PN and E flags are set.". * * If any of the bit is set, then the remaining ones also have to be * set. */ if (gtp1->flags & GTP1_F_MASK) hdrlen += 4; /* Make sure the header is larger enough, including extensions. */ if (!pskb_may_pull(skb, hdrlen)) return -1; if (gtp_inner_proto(skb, hdrlen, &inner_proto) < 0) { netdev_dbg(gtp->dev, "GTP packet does not encapsulate an IP packet\n"); return -1; } gtp1 = (struct gtp1_header *)(skb->data + sizeof(struct udphdr)); pctx = gtp1_pdp_find(gtp, ntohl(gtp1->tid), gtp_proto_to_family(inner_proto)); if (!pctx) { netdev_dbg(gtp->dev, "No PDP ctx to decap skb=%p\n", skb); return 1; } if (gtp1->flags & GTP1_F_EXTHDR && gtp_parse_exthdrs(skb, &hdrlen) < 0) return -1; return gtp_rx(pctx, skb, hdrlen, gtp->role, inner_proto); } static void __gtp_encap_destroy(struct sock *sk) { struct gtp_dev *gtp; lock_sock(sk); gtp = sk->sk_user_data; if (gtp) { if (gtp->sk0 == sk) gtp->sk0 = NULL; else gtp->sk1u = NULL; WRITE_ONCE(udp_sk(sk)->encap_type, 0); rcu_assign_sk_user_data(sk, NULL); release_sock(sk); sock_put(sk); return; } release_sock(sk); } static void gtp_encap_destroy(struct sock *sk) { rtnl_lock(); __gtp_encap_destroy(sk); rtnl_unlock(); } static void gtp_encap_disable_sock(struct sock *sk) { if (!sk) return; __gtp_encap_destroy(sk); } static void gtp_encap_disable(struct gtp_dev *gtp) { if (gtp->sk_created) { udp_tunnel_sock_release(gtp->sk0->sk_socket); udp_tunnel_sock_release(gtp->sk1u->sk_socket); gtp->sk_created = false; gtp->sk0 = NULL; gtp->sk1u = NULL; } else { gtp_encap_disable_sock(gtp->sk0); gtp_encap_disable_sock(gtp->sk1u); } } /* UDP encapsulation receive handler. See net/ipv4/udp.c. * Return codes: 0: success, <0: error, >0: pass up to userspace UDP socket. */ static int gtp_encap_recv(struct sock *sk, struct sk_buff *skb) { struct gtp_dev *gtp; int ret = 0; gtp = rcu_dereference_sk_user_data(sk); if (!gtp) return 1; netdev_dbg(gtp->dev, "encap_recv sk=%p\n", sk); switch (READ_ONCE(udp_sk(sk)->encap_type)) { case UDP_ENCAP_GTP0: netdev_dbg(gtp->dev, "received GTP0 packet\n"); ret = gtp0_udp_encap_recv(gtp, skb); break; case UDP_ENCAP_GTP1U: netdev_dbg(gtp->dev, "received GTP1U packet\n"); ret = gtp1u_udp_encap_recv(gtp, skb); break; default: ret = -1; /* Shouldn't happen. */ } switch (ret) { case 1: netdev_dbg(gtp->dev, "pass up to the process\n"); break; case 0: break; case -1: netdev_dbg(gtp->dev, "GTP packet has been dropped\n"); kfree_skb(skb); ret = 0; break; } return ret; } static void gtp_dev_uninit(struct net_device *dev) { struct gtp_dev *gtp = netdev_priv(dev); gtp_encap_disable(gtp); } static inline void gtp0_push_header(struct sk_buff *skb, struct pdp_ctx *pctx) { int payload_len = skb->len; struct gtp0_header *gtp0; gtp0 = skb_push(skb, sizeof(*gtp0)); gtp0->flags = 0x1e; /* v0, GTP-non-prime. */ gtp0->type = GTP_TPDU; gtp0->length = htons(payload_len); gtp0->seq = htons((atomic_inc_return(&pctx->tx_seq) - 1) % 0xffff); gtp0->flow = htons(pctx->u.v0.flow); gtp0->number = 0xff; gtp0->spare[0] = gtp0->spare[1] = gtp0->spare[2] = 0xff; gtp0->tid = cpu_to_be64(pctx->u.v0.tid); } static inline void gtp1_push_header(struct sk_buff *skb, struct pdp_ctx *pctx) { int payload_len = skb->len; struct gtp1_header *gtp1; gtp1 = skb_push(skb, sizeof(*gtp1)); /* Bits 8 7 6 5 4 3 2 1 * +--+--+--+--+--+--+--+--+ * |version |PT| 0| E| S|PN| * +--+--+--+--+--+--+--+--+ * 0 0 1 1 1 0 0 0 */ gtp1->flags = 0x30; /* v1, GTP-non-prime. */ gtp1->type = GTP_TPDU; gtp1->length = htons(payload_len); gtp1->tid = htonl(pctx->u.v1.o_tei); /* TODO: Support for extension header, sequence number and N-PDU. * Update the length field if any of them is available. */ } struct gtp_pktinfo { struct sock *sk; union { struct flowi4 fl4; struct flowi6 fl6; }; union { struct rtable *rt; struct rt6_info *rt6; }; struct pdp_ctx *pctx; struct net_device *dev; __u8 tos; __be16 gtph_port; }; static void gtp_push_header(struct sk_buff *skb, struct gtp_pktinfo *pktinfo) { switch (pktinfo->pctx->gtp_version) { case GTP_V0: pktinfo->gtph_port = htons(GTP0_PORT); gtp0_push_header(skb, pktinfo->pctx); break; case GTP_V1: pktinfo->gtph_port = htons(GTP1U_PORT); gtp1_push_header(skb, pktinfo->pctx); break; } } static inline void gtp_set_pktinfo_ipv4(struct gtp_pktinfo *pktinfo, struct sock *sk, __u8 tos, struct pdp_ctx *pctx, struct rtable *rt, struct flowi4 *fl4, struct net_device *dev) { pktinfo->sk = sk; pktinfo->tos = tos; pktinfo->pctx = pctx; pktinfo->rt = rt; pktinfo->fl4 = *fl4; pktinfo->dev = dev; } static void gtp_set_pktinfo_ipv6(struct gtp_pktinfo *pktinfo, struct sock *sk, __u8 tos, struct pdp_ctx *pctx, struct rt6_info *rt6, struct flowi6 *fl6, struct net_device *dev) { pktinfo->sk = sk; pktinfo->tos = tos; pktinfo->pctx = pctx; pktinfo->rt6 = rt6; pktinfo->fl6 = *fl6; pktinfo->dev = dev; } static int gtp_build_skb_outer_ip4(struct sk_buff *skb, struct net_device *dev, struct gtp_pktinfo *pktinfo, struct pdp_ctx *pctx, __u8 tos, __be16 frag_off) { struct rtable *rt; struct flowi4 fl4; __be16 df; int mtu; rt = ip4_route_output_gtp(&fl4, pctx->sk, pctx->peer.addr.s_addr, inet_sk(pctx->sk)->inet_saddr); if (IS_ERR(rt)) { netdev_dbg(dev, "no route to SSGN %pI4\n", &pctx->peer.addr.s_addr); dev->stats.tx_carrier_errors++; goto err; } if (rt->dst.dev == dev) { netdev_dbg(dev, "circular route to SSGN %pI4\n", &pctx->peer.addr.s_addr); dev->stats.collisions++; goto err_rt; } /* This is similar to tnl_update_pmtu(). */ df = frag_off; if (df) { mtu = dst_mtu(&rt->dst) - dev->hard_header_len - sizeof(struct iphdr) - sizeof(struct udphdr); switch (pctx->gtp_version) { case GTP_V0: mtu -= sizeof(struct gtp0_header); break; case GTP_V1: mtu -= sizeof(struct gtp1_header); break; } } else { mtu = dst_mtu(&rt->dst); } skb_dst_update_pmtu_no_confirm(skb, mtu); if (frag_off & htons(IP_DF) && ((!skb_is_gso(skb) && skb->len > mtu) || (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu)))) { netdev_dbg(dev, "packet too big, fragmentation needed\n"); icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); goto err_rt; } gtp_set_pktinfo_ipv4(pktinfo, pctx->sk, tos, pctx, rt, &fl4, dev); gtp_push_header(skb, pktinfo); return 0; err_rt: ip_rt_put(rt); err: return -EBADMSG; } static int gtp_build_skb_outer_ip6(struct net *net, struct sk_buff *skb, struct net_device *dev, struct gtp_pktinfo *pktinfo, struct pdp_ctx *pctx, __u8 tos) { struct dst_entry *dst; struct rt6_info *rt; struct flowi6 fl6; int mtu; rt = ip6_route_output_gtp(net, &fl6, pctx->sk, &pctx->peer.addr6, &inet6_sk(pctx->sk)->saddr); if (IS_ERR(rt)) { netdev_dbg(dev, "no route to SSGN %pI6\n", &pctx->peer.addr6); dev->stats.tx_carrier_errors++; goto err; } dst = &rt->dst; if (rt->dst.dev == dev) { netdev_dbg(dev, "circular route to SSGN %pI6\n", &pctx->peer.addr6); dev->stats.collisions++; goto err_rt; } mtu = dst_mtu(&rt->dst) - dev->hard_header_len - sizeof(struct ipv6hdr) - sizeof(struct udphdr); switch (pctx->gtp_version) { case GTP_V0: mtu -= sizeof(struct gtp0_header); break; case GTP_V1: mtu -= sizeof(struct gtp1_header); break; } skb_dst_update_pmtu_no_confirm(skb, mtu); if ((!skb_is_gso(skb) && skb->len > mtu) || (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))) { netdev_dbg(dev, "packet too big, fragmentation needed\n"); icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); goto err_rt; } gtp_set_pktinfo_ipv6(pktinfo, pctx->sk, tos, pctx, rt, &fl6, dev); gtp_push_header(skb, pktinfo); return 0; err_rt: dst_release(dst); err: return -EBADMSG; } static int gtp_build_skb_ip4(struct sk_buff *skb, struct net_device *dev, struct gtp_pktinfo *pktinfo) { struct gtp_dev *gtp = netdev_priv(dev); struct net *net = gtp->net; struct pdp_ctx *pctx; struct iphdr *iph; int ret; /* Read the IP destination address and resolve the PDP context. * Prepend PDP header with TEI/TID from PDP ctx. */ iph = ip_hdr(skb); if (gtp->role == GTP_ROLE_SGSN) pctx = ipv4_pdp_find(gtp, iph->saddr); else pctx = ipv4_pdp_find(gtp, iph->daddr); if (!pctx) { netdev_dbg(dev, "no PDP ctx found for %pI4, skip\n", &iph->daddr); return -ENOENT; } netdev_dbg(dev, "found PDP context %p\n", pctx); switch (pctx->sk->sk_family) { case AF_INET: ret = gtp_build_skb_outer_ip4(skb, dev, pktinfo, pctx, iph->tos, iph->frag_off); break; case AF_INET6: ret = gtp_build_skb_outer_ip6(net, skb, dev, pktinfo, pctx, iph->tos); break; default: ret = -1; WARN_ON_ONCE(1); break; } if (ret < 0) return ret; netdev_dbg(dev, "gtp -> IP src: %pI4 dst: %pI4\n", &iph->saddr, &iph->daddr); return 0; } static int gtp_build_skb_ip6(struct sk_buff *skb, struct net_device *dev, struct gtp_pktinfo *pktinfo) { struct gtp_dev *gtp = netdev_priv(dev); struct net *net = gtp->net; struct pdp_ctx *pctx; struct ipv6hdr *ip6h; __u8 tos; int ret; /* Read the IP destination address and resolve the PDP context. * Prepend PDP header with TEI/TID from PDP ctx. */ ip6h = ipv6_hdr(skb); if (gtp->role == GTP_ROLE_SGSN) pctx = ipv6_pdp_find(gtp, &ip6h->saddr); else pctx = ipv6_pdp_find(gtp, &ip6h->daddr); if (!pctx) { netdev_dbg(dev, "no PDP ctx found for %pI6, skip\n", &ip6h->daddr); return -ENOENT; } netdev_dbg(dev, "found PDP context %p\n", pctx); tos = ipv6_get_dsfield(ip6h); switch (pctx->sk->sk_family) { case AF_INET: ret = gtp_build_skb_outer_ip4(skb, dev, pktinfo, pctx, tos, 0); break; case AF_INET6: ret = gtp_build_skb_outer_ip6(net, skb, dev, pktinfo, pctx, tos); break; default: ret = -1; WARN_ON_ONCE(1); break; } if (ret < 0) return ret; netdev_dbg(dev, "gtp -> IP src: %pI6 dst: %pI6\n", &ip6h->saddr, &ip6h->daddr); return 0; } static netdev_tx_t gtp_dev_xmit(struct sk_buff *skb, struct net_device *dev) { unsigned int proto = ntohs(skb->protocol); struct gtp_pktinfo pktinfo; int err; /* Ensure there is sufficient headroom. */ if (skb_cow_head(skb, dev->needed_headroom)) goto tx_err; if (!pskb_inet_may_pull(skb)) goto tx_err; skb_reset_inner_headers(skb); /* PDP context lookups in gtp_build_skb_*() need rcu read-side lock. */ rcu_read_lock(); switch (proto) { case ETH_P_IP: err = gtp_build_skb_ip4(skb, dev, &pktinfo); break; case ETH_P_IPV6: err = gtp_build_skb_ip6(skb, dev, &pktinfo); break; default: err = -EOPNOTSUPP; break; } rcu_read_unlock(); if (err < 0) goto tx_err; switch (pktinfo.pctx->sk->sk_family) { case AF_INET: udp_tunnel_xmit_skb(pktinfo.rt, pktinfo.sk, skb, pktinfo.fl4.saddr, pktinfo.fl4.daddr, pktinfo.tos, ip4_dst_hoplimit(&pktinfo.rt->dst), 0, pktinfo.gtph_port, pktinfo.gtph_port, !net_eq(sock_net(pktinfo.pctx->sk), dev_net(dev)), false, 0); break; case AF_INET6: #if IS_ENABLED(CONFIG_IPV6) udp_tunnel6_xmit_skb(&pktinfo.rt6->dst, pktinfo.sk, skb, dev, &pktinfo.fl6.saddr, &pktinfo.fl6.daddr, pktinfo.tos, ip6_dst_hoplimit(&pktinfo.rt->dst), 0, pktinfo.gtph_port, pktinfo.gtph_port, false, 0); #else goto tx_err; #endif break; } return NETDEV_TX_OK; tx_err: dev->stats.tx_errors++; dev_kfree_skb(skb); return NETDEV_TX_OK; } static const struct net_device_ops gtp_netdev_ops = { .ndo_uninit = gtp_dev_uninit, .ndo_start_xmit = gtp_dev_xmit, }; static const struct device_type gtp_type = { .name = "gtp", }; #define GTP_TH_MAXLEN (sizeof(struct udphdr) + sizeof(struct gtp0_header)) #define GTP_IPV4_MAXLEN (sizeof(struct iphdr) + GTP_TH_MAXLEN) static void gtp_link_setup(struct net_device *dev) { struct gtp_dev *gtp = netdev_priv(dev); dev->netdev_ops = >p_netdev_ops; dev->needs_free_netdev = true; SET_NETDEV_DEVTYPE(dev, >p_type); dev->hard_header_len = 0; dev->addr_len = 0; dev->mtu = ETH_DATA_LEN - GTP_IPV4_MAXLEN; /* Zero header length. */ dev->type = ARPHRD_NONE; dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->priv_flags |= IFF_NO_QUEUE; dev->lltx = true; netif_keep_dst(dev); dev->needed_headroom = LL_MAX_HEADER + GTP_IPV4_MAXLEN; gtp->dev = dev; } static int gtp_hashtable_new(struct gtp_dev *gtp, int hsize); static int gtp_encap_enable(struct gtp_dev *gtp, struct nlattr *data[]); static void gtp_destructor(struct net_device *dev) { struct gtp_dev *gtp = netdev_priv(dev); kfree(gtp->addr_hash); kfree(gtp->tid_hash); } static int gtp_sock_udp_config(struct udp_port_cfg *udp_conf, const struct nlattr *nla, int family) { udp_conf->family = family; switch (udp_conf->family) { case AF_INET: udp_conf->local_ip.s_addr = nla_get_be32(nla); break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: udp_conf->local_ip6 = nla_get_in6_addr(nla); break; #endif default: return -EOPNOTSUPP; } return 0; } static struct sock *gtp_create_sock(int type, struct gtp_dev *gtp, const struct nlattr *nla, int family) { struct udp_tunnel_sock_cfg tuncfg = {}; struct udp_port_cfg udp_conf = {}; struct net *net = gtp->net; struct socket *sock; int err; if (nla) { err = gtp_sock_udp_config(&udp_conf, nla, family); if (err < 0) return ERR_PTR(err); } else { udp_conf.local_ip.s_addr = htonl(INADDR_ANY); udp_conf.family = AF_INET; } if (type == UDP_ENCAP_GTP0) udp_conf.local_udp_port = htons(GTP0_PORT); else if (type == UDP_ENCAP_GTP1U) udp_conf.local_udp_port = htons(GTP1U_PORT); else return ERR_PTR(-EINVAL); err = udp_sock_create(net, &udp_conf, &sock); if (err) return ERR_PTR(err); tuncfg.sk_user_data = gtp; tuncfg.encap_type = type; tuncfg.encap_rcv = gtp_encap_recv; tuncfg.encap_destroy = NULL; setup_udp_tunnel_sock(net, sock, &tuncfg); return sock->sk; } static int gtp_create_sockets(struct gtp_dev *gtp, const struct nlattr *nla, int family) { struct sock *sk1u; struct sock *sk0; sk0 = gtp_create_sock(UDP_ENCAP_GTP0, gtp, nla, family); if (IS_ERR(sk0)) return PTR_ERR(sk0); sk1u = gtp_create_sock(UDP_ENCAP_GTP1U, gtp, nla, family); if (IS_ERR(sk1u)) { udp_tunnel_sock_release(sk0->sk_socket); return PTR_ERR(sk1u); } gtp->sk_created = true; gtp->sk0 = sk0; gtp->sk1u = sk1u; return 0; } #define GTP_TH_MAXLEN (sizeof(struct udphdr) + sizeof(struct gtp0_header)) #define GTP_IPV6_MAXLEN (sizeof(struct ipv6hdr) + GTP_TH_MAXLEN) static int gtp_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct net *link_net = rtnl_newlink_link_net(params); struct nlattr **data = params->data; unsigned int role = GTP_ROLE_GGSN; struct gtp_dev *gtp; struct gtp_net *gn; int hashsize, err; #if !IS_ENABLED(CONFIG_IPV6) if (data[IFLA_GTP_LOCAL6]) return -EAFNOSUPPORT; #endif gtp = netdev_priv(dev); if (!data[IFLA_GTP_PDP_HASHSIZE]) { hashsize = 1024; } else { hashsize = nla_get_u32(data[IFLA_GTP_PDP_HASHSIZE]); if (!hashsize) hashsize = 1024; } if (data[IFLA_GTP_ROLE]) { role = nla_get_u32(data[IFLA_GTP_ROLE]); if (role > GTP_ROLE_SGSN) return -EINVAL; } gtp->role = role; gtp->restart_count = nla_get_u8_default(data[IFLA_GTP_RESTART_COUNT], 0); gtp->net = link_net; err = gtp_hashtable_new(gtp, hashsize); if (err < 0) return err; if (data[IFLA_GTP_CREATE_SOCKETS]) { if (data[IFLA_GTP_LOCAL6]) err = gtp_create_sockets(gtp, data[IFLA_GTP_LOCAL6], AF_INET6); else err = gtp_create_sockets(gtp, data[IFLA_GTP_LOCAL], AF_INET); } else { err = gtp_encap_enable(gtp, data); } if (err < 0) goto out_hashtable; if ((gtp->sk0 && gtp->sk0->sk_family == AF_INET6) || (gtp->sk1u && gtp->sk1u->sk_family == AF_INET6)) { dev->mtu = ETH_DATA_LEN - GTP_IPV6_MAXLEN; dev->needed_headroom = LL_MAX_HEADER + GTP_IPV6_MAXLEN; } err = register_netdevice(dev); if (err < 0) { netdev_dbg(dev, "failed to register new netdev %d\n", err); goto out_encap; } gn = net_generic(link_net, gtp_net_id); list_add(>p->list, &gn->gtp_dev_list); dev->priv_destructor = gtp_destructor; netdev_dbg(dev, "registered new GTP interface\n"); return 0; out_encap: gtp_encap_disable(gtp); out_hashtable: kfree(gtp->addr_hash); kfree(gtp->tid_hash); return err; } static void gtp_dellink(struct net_device *dev, struct list_head *head) { struct gtp_dev *gtp = netdev_priv(dev); struct hlist_node *next; struct pdp_ctx *pctx; int i; for (i = 0; i < gtp->hash_size; i++) hlist_for_each_entry_safe(pctx, next, >p->tid_hash[i], hlist_tid) pdp_context_delete(pctx); list_del(>p->list); unregister_netdevice_queue(dev, head); } static const struct nla_policy gtp_policy[IFLA_GTP_MAX + 1] = { [IFLA_GTP_FD0] = { .type = NLA_U32 }, [IFLA_GTP_FD1] = { .type = NLA_U32 }, [IFLA_GTP_PDP_HASHSIZE] = { .type = NLA_U32 }, [IFLA_GTP_ROLE] = { .type = NLA_U32 }, [IFLA_GTP_CREATE_SOCKETS] = { .type = NLA_U8 }, [IFLA_GTP_RESTART_COUNT] = { .type = NLA_U8 }, [IFLA_GTP_LOCAL] = { .type = NLA_U32 }, [IFLA_GTP_LOCAL6] = { .len = sizeof(struct in6_addr) }, }; static int gtp_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (!data) return -EINVAL; return 0; } static size_t gtp_get_size(const struct net_device *dev) { return nla_total_size(sizeof(__u32)) + /* IFLA_GTP_PDP_HASHSIZE */ nla_total_size(sizeof(__u32)) + /* IFLA_GTP_ROLE */ nla_total_size(sizeof(__u8)); /* IFLA_GTP_RESTART_COUNT */ } static int gtp_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct gtp_dev *gtp = netdev_priv(dev); if (nla_put_u32(skb, IFLA_GTP_PDP_HASHSIZE, gtp->hash_size)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_GTP_ROLE, gtp->role)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_GTP_RESTART_COUNT, gtp->restart_count)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static struct rtnl_link_ops gtp_link_ops __read_mostly = { .kind = "gtp", .maxtype = IFLA_GTP_MAX, .policy = gtp_policy, .priv_size = sizeof(struct gtp_dev), .setup = gtp_link_setup, .validate = gtp_validate, .newlink = gtp_newlink, .dellink = gtp_dellink, .get_size = gtp_get_size, .fill_info = gtp_fill_info, }; static int gtp_hashtable_new(struct gtp_dev *gtp, int hsize) { int i; gtp->addr_hash = kmalloc_objs(struct hlist_head, hsize, GFP_KERNEL | __GFP_NOWARN); if (gtp->addr_hash == NULL) return -ENOMEM; gtp->tid_hash = kmalloc_objs(struct hlist_head, hsize, GFP_KERNEL | __GFP_NOWARN); if (gtp->tid_hash == NULL) goto err1; gtp->hash_size = hsize; for (i = 0; i < hsize; i++) { INIT_HLIST_HEAD(>p->addr_hash[i]); INIT_HLIST_HEAD(>p->tid_hash[i]); } return 0; err1: kfree(gtp->addr_hash); return -ENOMEM; } static struct sock *gtp_encap_enable_socket(int fd, int type, struct gtp_dev *gtp) { struct udp_tunnel_sock_cfg tuncfg = {NULL}; struct socket *sock; struct sock *sk; int err; pr_debug("enable gtp on %d, %d\n", fd, type); sock = sockfd_lookup(fd, &err); if (!sock) { pr_debug("gtp socket fd=%d not found\n", fd); return ERR_PTR(err); } sk = sock->sk; if (sk->sk_protocol != IPPROTO_UDP || sk->sk_type != SOCK_DGRAM || (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)) { pr_debug("socket fd=%d not UDP\n", fd); sk = ERR_PTR(-EINVAL); goto out_sock; } if (sk->sk_family == AF_INET6 && !sk->sk_ipv6only) { sk = ERR_PTR(-EADDRNOTAVAIL); goto out_sock; } lock_sock(sk); if (sk->sk_user_data) { sk = ERR_PTR(-EBUSY); goto out_rel_sock; } sock_hold(sk); tuncfg.sk_user_data = gtp; tuncfg.encap_type = type; tuncfg.encap_rcv = gtp_encap_recv; tuncfg.encap_destroy = gtp_encap_destroy; setup_udp_tunnel_sock(sock_net(sock->sk), sock, &tuncfg); out_rel_sock: release_sock(sock->sk); out_sock: sockfd_put(sock); return sk; } static int gtp_encap_enable(struct gtp_dev *gtp, struct nlattr *data[]) { struct sock *sk1u = NULL; struct sock *sk0 = NULL; if (!data[IFLA_GTP_FD0] && !data[IFLA_GTP_FD1]) return -EINVAL; if (data[IFLA_GTP_FD0]) { int fd0 = nla_get_u32(data[IFLA_GTP_FD0]); if (fd0 >= 0) { sk0 = gtp_encap_enable_socket(fd0, UDP_ENCAP_GTP0, gtp); if (IS_ERR(sk0)) return PTR_ERR(sk0); } } if (data[IFLA_GTP_FD1]) { int fd1 = nla_get_u32(data[IFLA_GTP_FD1]); if (fd1 >= 0) { sk1u = gtp_encap_enable_socket(fd1, UDP_ENCAP_GTP1U, gtp); if (IS_ERR(sk1u)) { gtp_encap_disable_sock(sk0); return PTR_ERR(sk1u); } } } gtp->sk0 = sk0; gtp->sk1u = sk1u; if (sk0 && sk1u && sk0->sk_family != sk1u->sk_family) { gtp_encap_disable_sock(sk0); gtp_encap_disable_sock(sk1u); return -EINVAL; } return 0; } static struct gtp_dev *gtp_find_dev(struct net *src_net, struct nlattr *nla[]) { struct gtp_dev *gtp = NULL; struct net_device *dev; struct net *net; /* Examine the link attributes and figure out which network namespace * we are talking about. */ if (nla[GTPA_NET_NS_FD]) net = get_net_ns_by_fd(nla_get_u32(nla[GTPA_NET_NS_FD])); else net = get_net(src_net); if (IS_ERR(net)) return NULL; /* Check if there's an existing gtpX device to configure */ dev = dev_get_by_index_rcu(net, nla_get_u32(nla[GTPA_LINK])); if (dev && dev->netdev_ops == >p_netdev_ops) gtp = netdev_priv(dev); put_net(net); return gtp; } static void gtp_pdp_fill(struct pdp_ctx *pctx, struct genl_info *info) { pctx->gtp_version = nla_get_u32(info->attrs[GTPA_VERSION]); switch (pctx->gtp_version) { case GTP_V0: /* According to TS 09.60, sections 7.5.1 and 7.5.2, the flow * label needs to be the same for uplink and downlink packets, * so let's annotate this. */ pctx->u.v0.tid = nla_get_u64(info->attrs[GTPA_TID]); pctx->u.v0.flow = nla_get_u16(info->attrs[GTPA_FLOW]); break; case GTP_V1: pctx->u.v1.i_tei = nla_get_u32(info->attrs[GTPA_I_TEI]); pctx->u.v1.o_tei = nla_get_u32(info->attrs[GTPA_O_TEI]); break; default: break; } } static void ip_pdp_peer_fill(struct pdp_ctx *pctx, struct genl_info *info) { if (info->attrs[GTPA_PEER_ADDRESS]) { pctx->peer.addr.s_addr = nla_get_be32(info->attrs[GTPA_PEER_ADDRESS]); } else if (info->attrs[GTPA_PEER_ADDR6]) { pctx->peer.addr6 = nla_get_in6_addr(info->attrs[GTPA_PEER_ADDR6]); } } static void ipv4_pdp_fill(struct pdp_ctx *pctx, struct genl_info *info) { ip_pdp_peer_fill(pctx, info); pctx->ms.addr.s_addr = nla_get_be32(info->attrs[GTPA_MS_ADDRESS]); gtp_pdp_fill(pctx, info); } static bool ipv6_pdp_fill(struct pdp_ctx *pctx, struct genl_info *info) { ip_pdp_peer_fill(pctx, info); pctx->ms.addr6 = nla_get_in6_addr(info->attrs[GTPA_MS_ADDR6]); if (pctx->ms.addr6.s6_addr32[2] || pctx->ms.addr6.s6_addr32[3]) return false; gtp_pdp_fill(pctx, info); return true; } static struct pdp_ctx *gtp_pdp_add(struct gtp_dev *gtp, struct sock *sk, struct genl_info *info) { struct pdp_ctx *pctx, *pctx_tid = NULL; struct net_device *dev = gtp->dev; u32 hash_ms, hash_tid = 0; struct in6_addr ms_addr6; unsigned int version; bool found = false; __be32 ms_addr; int family; version = nla_get_u32(info->attrs[GTPA_VERSION]); family = nla_get_u8_default(info->attrs[GTPA_FAMILY], AF_INET); #if !IS_ENABLED(CONFIG_IPV6) if (family == AF_INET6) return ERR_PTR(-EAFNOSUPPORT); #endif if (!info->attrs[GTPA_PEER_ADDRESS] && !info->attrs[GTPA_PEER_ADDR6]) return ERR_PTR(-EINVAL); if ((info->attrs[GTPA_PEER_ADDRESS] && sk->sk_family == AF_INET6) || (info->attrs[GTPA_PEER_ADDR6] && sk->sk_family == AF_INET)) return ERR_PTR(-EAFNOSUPPORT); switch (family) { case AF_INET: if (!info->attrs[GTPA_MS_ADDRESS] || info->attrs[GTPA_MS_ADDR6]) return ERR_PTR(-EINVAL); ms_addr = nla_get_be32(info->attrs[GTPA_MS_ADDRESS]); hash_ms = ipv4_hashfn(ms_addr) % gtp->hash_size; pctx = ipv4_pdp_find(gtp, ms_addr); break; case AF_INET6: if (!info->attrs[GTPA_MS_ADDR6] || info->attrs[GTPA_MS_ADDRESS]) return ERR_PTR(-EINVAL); ms_addr6 = nla_get_in6_addr(info->attrs[GTPA_MS_ADDR6]); hash_ms = ipv6_hashfn(&ms_addr6) % gtp->hash_size; pctx = ipv6_pdp_find(gtp, &ms_addr6); break; default: return ERR_PTR(-EAFNOSUPPORT); } if (pctx) found = true; if (version == GTP_V0) pctx_tid = gtp0_pdp_find(gtp, nla_get_u64(info->attrs[GTPA_TID]), family); else if (version == GTP_V1) pctx_tid = gtp1_pdp_find(gtp, nla_get_u32(info->attrs[GTPA_I_TEI]), family); if (pctx_tid) found = true; if (found) { if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) return ERR_PTR(-EEXIST); if (info->nlhdr->nlmsg_flags & NLM_F_REPLACE) return ERR_PTR(-EOPNOTSUPP); if (pctx && pctx_tid) return ERR_PTR(-EEXIST); if (!pctx) pctx = pctx_tid; switch (pctx->af) { case AF_INET: ipv4_pdp_fill(pctx, info); break; case AF_INET6: if (!ipv6_pdp_fill(pctx, info)) return ERR_PTR(-EADDRNOTAVAIL); break; } if (pctx->gtp_version == GTP_V0) netdev_dbg(dev, "GTPv0-U: update tunnel id = %llx (pdp %p)\n", pctx->u.v0.tid, pctx); else if (pctx->gtp_version == GTP_V1) netdev_dbg(dev, "GTPv1-U: update tunnel id = %x/%x (pdp %p)\n", pctx->u.v1.i_tei, pctx->u.v1.o_tei, pctx); return pctx; } pctx = kmalloc_obj(*pctx, GFP_ATOMIC); if (pctx == NULL) return ERR_PTR(-ENOMEM); sock_hold(sk); pctx->sk = sk; pctx->dev = gtp->dev; pctx->af = family; switch (pctx->af) { case AF_INET: if (!info->attrs[GTPA_MS_ADDRESS]) { sock_put(sk); kfree(pctx); return ERR_PTR(-EINVAL); } ipv4_pdp_fill(pctx, info); break; case AF_INET6: if (!info->attrs[GTPA_MS_ADDR6]) { sock_put(sk); kfree(pctx); return ERR_PTR(-EINVAL); } if (!ipv6_pdp_fill(pctx, info)) { sock_put(sk); kfree(pctx); return ERR_PTR(-EADDRNOTAVAIL); } break; } atomic_set(&pctx->tx_seq, 0); switch (pctx->gtp_version) { case GTP_V0: /* TS 09.60: "The flow label identifies unambiguously a GTP * flow.". We use the tid for this instead, I cannot find a * situation in which this doesn't unambiguosly identify the * PDP context. */ hash_tid = gtp0_hashfn(pctx->u.v0.tid) % gtp->hash_size; break; case GTP_V1: hash_tid = gtp1u_hashfn(pctx->u.v1.i_tei) % gtp->hash_size; break; } hlist_add_head_rcu(&pctx->hlist_addr, >p->addr_hash[hash_ms]); hlist_add_head_rcu(&pctx->hlist_tid, >p->tid_hash[hash_tid]); switch (pctx->gtp_version) { case GTP_V0: netdev_dbg(dev, "GTPv0-U: new PDP ctx id=%llx ssgn=%pI4 ms=%pI4 (pdp=%p)\n", pctx->u.v0.tid, &pctx->peer.addr, &pctx->ms.addr, pctx); break; case GTP_V1: netdev_dbg(dev, "GTPv1-U: new PDP ctx id=%x/%x ssgn=%pI4 ms=%pI4 (pdp=%p)\n", pctx->u.v1.i_tei, pctx->u.v1.o_tei, &pctx->peer.addr, &pctx->ms.addr, pctx); break; } return pctx; } static void pdp_context_free(struct rcu_head *head) { struct pdp_ctx *pctx = container_of(head, struct pdp_ctx, rcu_head); sock_put(pctx->sk); kfree(pctx); } static void pdp_context_delete(struct pdp_ctx *pctx) { hlist_del_rcu(&pctx->hlist_tid); hlist_del_rcu(&pctx->hlist_addr); call_rcu(&pctx->rcu_head, pdp_context_free); } static int gtp_tunnel_notify(struct pdp_ctx *pctx, u8 cmd, gfp_t allocation); static int gtp_genl_new_pdp(struct sk_buff *skb, struct genl_info *info) { unsigned int version; struct pdp_ctx *pctx; struct gtp_dev *gtp; struct sock *sk; int err; if (!info->attrs[GTPA_VERSION] || !info->attrs[GTPA_LINK]) return -EINVAL; version = nla_get_u32(info->attrs[GTPA_VERSION]); switch (version) { case GTP_V0: if (!info->attrs[GTPA_TID] || !info->attrs[GTPA_FLOW]) return -EINVAL; break; case GTP_V1: if (!info->attrs[GTPA_I_TEI] || !info->attrs[GTPA_O_TEI]) return -EINVAL; break; default: return -EINVAL; } rtnl_lock(); gtp = gtp_find_dev(sock_net(skb->sk), info->attrs); if (!gtp) { err = -ENODEV; goto out_unlock; } if (version == GTP_V0) sk = gtp->sk0; else if (version == GTP_V1) sk = gtp->sk1u; else sk = NULL; if (!sk) { err = -ENODEV; goto out_unlock; } pctx = gtp_pdp_add(gtp, sk, info); if (IS_ERR(pctx)) { err = PTR_ERR(pctx); } else { gtp_tunnel_notify(pctx, GTP_CMD_NEWPDP, GFP_KERNEL); err = 0; } out_unlock: rtnl_unlock(); return err; } static struct pdp_ctx *gtp_find_pdp_by_link(struct net *net, struct nlattr *nla[]) { struct gtp_dev *gtp; int family; family = nla_get_u8_default(nla[GTPA_FAMILY], AF_INET); gtp = gtp_find_dev(net, nla); if (!gtp) return ERR_PTR(-ENODEV); if (nla[GTPA_MS_ADDRESS]) { __be32 ip = nla_get_be32(nla[GTPA_MS_ADDRESS]); if (family != AF_INET) return ERR_PTR(-EINVAL); return ipv4_pdp_find(gtp, ip); } else if (nla[GTPA_MS_ADDR6]) { struct in6_addr addr = nla_get_in6_addr(nla[GTPA_MS_ADDR6]); if (family != AF_INET6) return ERR_PTR(-EINVAL); if (addr.s6_addr32[2] || addr.s6_addr32[3]) return ERR_PTR(-EADDRNOTAVAIL); return ipv6_pdp_find(gtp, &addr); } else if (nla[GTPA_VERSION]) { u32 gtp_version = nla_get_u32(nla[GTPA_VERSION]); if (gtp_version == GTP_V0 && nla[GTPA_TID]) { return gtp0_pdp_find(gtp, nla_get_u64(nla[GTPA_TID]), family); } else if (gtp_version == GTP_V1 && nla[GTPA_I_TEI]) { return gtp1_pdp_find(gtp, nla_get_u32(nla[GTPA_I_TEI]), family); } } return ERR_PTR(-EINVAL); } static struct pdp_ctx *gtp_find_pdp(struct net *net, struct nlattr *nla[]) { struct pdp_ctx *pctx; if (nla[GTPA_LINK]) pctx = gtp_find_pdp_by_link(net, nla); else pctx = ERR_PTR(-EINVAL); if (!pctx) pctx = ERR_PTR(-ENOENT); return pctx; } static int gtp_genl_del_pdp(struct sk_buff *skb, struct genl_info *info) { struct pdp_ctx *pctx; int err = 0; if (!info->attrs[GTPA_VERSION]) return -EINVAL; rcu_read_lock(); pctx = gtp_find_pdp(sock_net(skb->sk), info->attrs); if (IS_ERR(pctx)) { err = PTR_ERR(pctx); goto out_unlock; } if (pctx->gtp_version == GTP_V0) netdev_dbg(pctx->dev, "GTPv0-U: deleting tunnel id = %llx (pdp %p)\n", pctx->u.v0.tid, pctx); else if (pctx->gtp_version == GTP_V1) netdev_dbg(pctx->dev, "GTPv1-U: deleting tunnel id = %x/%x (pdp %p)\n", pctx->u.v1.i_tei, pctx->u.v1.o_tei, pctx); gtp_tunnel_notify(pctx, GTP_CMD_DELPDP, GFP_ATOMIC); pdp_context_delete(pctx); out_unlock: rcu_read_unlock(); return err; } static int gtp_genl_fill_info(struct sk_buff *skb, u32 snd_portid, u32 snd_seq, int flags, u32 type, struct pdp_ctx *pctx) { void *genlh; genlh = genlmsg_put(skb, snd_portid, snd_seq, >p_genl_family, flags, type); if (genlh == NULL) goto nlmsg_failure; if (nla_put_u32(skb, GTPA_VERSION, pctx->gtp_version) || nla_put_u32(skb, GTPA_LINK, pctx->dev->ifindex) || nla_put_u8(skb, GTPA_FAMILY, pctx->af)) goto nla_put_failure; switch (pctx->af) { case AF_INET: if (nla_put_be32(skb, GTPA_MS_ADDRESS, pctx->ms.addr.s_addr)) goto nla_put_failure; break; case AF_INET6: if (nla_put_in6_addr(skb, GTPA_MS_ADDR6, &pctx->ms.addr6)) goto nla_put_failure; break; } switch (pctx->sk->sk_family) { case AF_INET: if (nla_put_be32(skb, GTPA_PEER_ADDRESS, pctx->peer.addr.s_addr)) goto nla_put_failure; break; case AF_INET6: if (nla_put_in6_addr(skb, GTPA_PEER_ADDR6, &pctx->peer.addr6)) goto nla_put_failure; break; } switch (pctx->gtp_version) { case GTP_V0: if (nla_put_u64_64bit(skb, GTPA_TID, pctx->u.v0.tid, GTPA_PAD) || nla_put_u16(skb, GTPA_FLOW, pctx->u.v0.flow)) goto nla_put_failure; break; case GTP_V1: if (nla_put_u32(skb, GTPA_I_TEI, pctx->u.v1.i_tei) || nla_put_u32(skb, GTPA_O_TEI, pctx->u.v1.o_tei)) goto nla_put_failure; break; } genlmsg_end(skb, genlh); return 0; nlmsg_failure: nla_put_failure: genlmsg_cancel(skb, genlh); return -EMSGSIZE; } static int gtp_tunnel_notify(struct pdp_ctx *pctx, u8 cmd, gfp_t allocation) { struct sk_buff *msg; int ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, allocation); if (!msg) return -ENOMEM; ret = gtp_genl_fill_info(msg, 0, 0, 0, cmd, pctx); if (ret < 0) { nlmsg_free(msg); return ret; } ret = genlmsg_multicast_netns(>p_genl_family, dev_net(pctx->dev), msg, 0, GTP_GENL_MCGRP, GFP_ATOMIC); return ret; } static int gtp_genl_get_pdp(struct sk_buff *skb, struct genl_info *info) { struct pdp_ctx *pctx = NULL; struct sk_buff *skb2; int err; if (!info->attrs[GTPA_VERSION]) return -EINVAL; rcu_read_lock(); pctx = gtp_find_pdp(sock_net(skb->sk), info->attrs); if (IS_ERR(pctx)) { err = PTR_ERR(pctx); goto err_unlock; } skb2 = genlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC); if (skb2 == NULL) { err = -ENOMEM; goto err_unlock; } err = gtp_genl_fill_info(skb2, NETLINK_CB(skb).portid, info->snd_seq, 0, info->nlhdr->nlmsg_type, pctx); if (err < 0) goto err_unlock_free; rcu_read_unlock(); return genlmsg_unicast(genl_info_net(info), skb2, info->snd_portid); err_unlock_free: kfree_skb(skb2); err_unlock: rcu_read_unlock(); return err; } static int gtp_genl_dump_pdp(struct sk_buff *skb, struct netlink_callback *cb) { struct gtp_dev *last_gtp = (struct gtp_dev *)cb->args[2], *gtp; int i, j, bucket = cb->args[0], skip = cb->args[1]; struct net *net = sock_net(skb->sk); struct net_device *dev; struct pdp_ctx *pctx; if (cb->args[4]) return 0; rcu_read_lock(); for_each_netdev_rcu(net, dev) { if (dev->rtnl_link_ops != >p_link_ops) continue; gtp = netdev_priv(dev); if (last_gtp && last_gtp != gtp) continue; else last_gtp = NULL; for (i = bucket; i < gtp->hash_size; i++) { j = 0; hlist_for_each_entry_rcu(pctx, >p->tid_hash[i], hlist_tid) { if (j >= skip && gtp_genl_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh->nlmsg_type, pctx)) { cb->args[0] = i; cb->args[1] = j; cb->args[2] = (unsigned long)gtp; goto out; } j++; } skip = 0; } bucket = 0; } cb->args[4] = 1; out: rcu_read_unlock(); return skb->len; } static int gtp_genl_send_echo_req(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *skb_to_send; __be32 src_ip, dst_ip; unsigned int version; struct gtp_dev *gtp; struct flowi4 fl4; struct rtable *rt; struct sock *sk; __be16 port; int len; if (!info->attrs[GTPA_VERSION] || !info->attrs[GTPA_LINK] || !info->attrs[GTPA_PEER_ADDRESS] || !info->attrs[GTPA_MS_ADDRESS]) return -EINVAL; version = nla_get_u32(info->attrs[GTPA_VERSION]); dst_ip = nla_get_be32(info->attrs[GTPA_PEER_ADDRESS]); src_ip = nla_get_be32(info->attrs[GTPA_MS_ADDRESS]); gtp = gtp_find_dev(sock_net(skb->sk), info->attrs); if (!gtp) return -ENODEV; if (!gtp->sk_created) return -EOPNOTSUPP; if (!(gtp->dev->flags & IFF_UP)) return -ENETDOWN; if (version == GTP_V0) { struct gtp0_header *gtp0_h; len = LL_RESERVED_SPACE(gtp->dev) + sizeof(struct gtp0_header) + sizeof(struct iphdr) + sizeof(struct udphdr); skb_to_send = netdev_alloc_skb_ip_align(gtp->dev, len); if (!skb_to_send) return -ENOMEM; sk = gtp->sk0; port = htons(GTP0_PORT); gtp0_h = skb_push(skb_to_send, sizeof(struct gtp0_header)); memset(gtp0_h, 0, sizeof(struct gtp0_header)); gtp0_build_echo_msg(gtp0_h, GTP_ECHO_REQ); } else if (version == GTP_V1) { struct gtp1_header_long *gtp1u_h; len = LL_RESERVED_SPACE(gtp->dev) + sizeof(struct gtp1_header_long) + sizeof(struct iphdr) + sizeof(struct udphdr); skb_to_send = netdev_alloc_skb_ip_align(gtp->dev, len); if (!skb_to_send) return -ENOMEM; sk = gtp->sk1u; port = htons(GTP1U_PORT); gtp1u_h = skb_push(skb_to_send, sizeof(struct gtp1_header_long)); memset(gtp1u_h, 0, sizeof(struct gtp1_header_long)); gtp1u_build_echo_msg(gtp1u_h, GTP_ECHO_REQ); } else { return -ENODEV; } rt = ip4_route_output_gtp(&fl4, sk, dst_ip, src_ip); if (IS_ERR(rt)) { netdev_dbg(gtp->dev, "no route for echo request to %pI4\n", &dst_ip); kfree_skb(skb_to_send); return -ENODEV; } local_bh_disable(); udp_tunnel_xmit_skb(rt, sk, skb_to_send, fl4.saddr, fl4.daddr, inet_dscp_to_dsfield(fl4.flowi4_dscp), ip4_dst_hoplimit(&rt->dst), 0, port, port, !net_eq(sock_net(sk), dev_net(gtp->dev)), false, 0); local_bh_enable(); return 0; } static const struct nla_policy gtp_genl_policy[GTPA_MAX + 1] = { [GTPA_LINK] = { .type = NLA_U32, }, [GTPA_VERSION] = { .type = NLA_U32, }, [GTPA_TID] = { .type = NLA_U64, }, [GTPA_PEER_ADDRESS] = { .type = NLA_U32, }, [GTPA_MS_ADDRESS] = { .type = NLA_U32, }, [GTPA_FLOW] = { .type = NLA_U16, }, [GTPA_NET_NS_FD] = { .type = NLA_U32, }, [GTPA_I_TEI] = { .type = NLA_U32, }, [GTPA_O_TEI] = { .type = NLA_U32, }, [GTPA_PEER_ADDR6] = { .len = sizeof(struct in6_addr), }, [GTPA_MS_ADDR6] = { .len = sizeof(struct in6_addr), }, [GTPA_FAMILY] = { .type = NLA_U8, }, }; static const struct genl_small_ops gtp_genl_ops[] = { { .cmd = GTP_CMD_NEWPDP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = gtp_genl_new_pdp, .flags = GENL_ADMIN_PERM, }, { .cmd = GTP_CMD_DELPDP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = gtp_genl_del_pdp, .flags = GENL_ADMIN_PERM, }, { .cmd = GTP_CMD_GETPDP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = gtp_genl_get_pdp, .dumpit = gtp_genl_dump_pdp, .flags = GENL_ADMIN_PERM, }, { .cmd = GTP_CMD_ECHOREQ, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = gtp_genl_send_echo_req, .flags = GENL_ADMIN_PERM, }, }; static struct genl_family gtp_genl_family __ro_after_init = { .name = "gtp", .version = 0, .hdrsize = 0, .maxattr = GTPA_MAX, .policy = gtp_genl_policy, .netnsok = true, .module = THIS_MODULE, .small_ops = gtp_genl_ops, .n_small_ops = ARRAY_SIZE(gtp_genl_ops), .resv_start_op = GTP_CMD_ECHOREQ + 1, .mcgrps = gtp_genl_mcgrps, .n_mcgrps = ARRAY_SIZE(gtp_genl_mcgrps), }; static int __net_init gtp_net_init(struct net *net) { struct gtp_net *gn = net_generic(net, gtp_net_id); INIT_LIST_HEAD(&gn->gtp_dev_list); return 0; } static void __net_exit gtp_net_exit_rtnl(struct net *net, struct list_head *dev_to_kill) { struct gtp_net *gn = net_generic(net, gtp_net_id); struct gtp_dev *gtp, *gtp_next; list_for_each_entry_safe(gtp, gtp_next, &gn->gtp_dev_list, list) gtp_dellink(gtp->dev, dev_to_kill); } static struct pernet_operations gtp_net_ops = { .init = gtp_net_init, .exit_rtnl = gtp_net_exit_rtnl, .id = >p_net_id, .size = sizeof(struct gtp_net), }; static int __init gtp_init(void) { int err; get_random_bytes(>p_h_initval, sizeof(gtp_h_initval)); err = register_pernet_subsys(>p_net_ops); if (err < 0) goto error_out; err = rtnl_link_register(>p_link_ops); if (err < 0) goto unreg_pernet_subsys; err = genl_register_family(>p_genl_family); if (err < 0) goto unreg_rtnl_link; pr_info("GTP module loaded (pdp ctx size %zd bytes)\n", sizeof(struct pdp_ctx)); return 0; unreg_rtnl_link: rtnl_link_unregister(>p_link_ops); unreg_pernet_subsys: unregister_pernet_subsys(>p_net_ops); error_out: pr_err("error loading GTP module loaded\n"); return err; } late_initcall(gtp_init); static void __exit gtp_fini(void) { genl_unregister_family(>p_genl_family); rtnl_link_unregister(>p_link_ops); unregister_pernet_subsys(>p_net_ops); pr_info("GTP module unloaded\n"); } module_exit(gtp_fini); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <hwelte@sysmocom.de>"); MODULE_DESCRIPTION("Interface driver for GTP encapsulated traffic"); MODULE_ALIAS_RTNL_LINK("gtp"); MODULE_ALIAS_GENL_FAMILY("gtp"); |
| 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 | /* SPDX-License-Identifier: GPL-2.0 */ /****************************************************************************** * x86_emulate.h * * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. * * Copyright (c) 2005 Keir Fraser * * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 */ #ifndef _ASM_X86_KVM_X86_EMULATE_H #define _ASM_X86_KVM_X86_EMULATE_H #include <asm/desc_defs.h> #include "fpu.h" struct x86_emulate_ctxt; enum x86_intercept; enum x86_intercept_stage; struct x86_exception { u8 vector; bool error_code_valid; u16 error_code; bool nested_page_fault; u64 address; /* cr2 or nested page fault gpa */ u8 async_page_fault; unsigned long exit_qualification; }; /* * This struct is used to carry enough information from the instruction * decoder to main KVM so that a decision can be made whether the * instruction needs to be intercepted or not. */ struct x86_instruction_info { u8 intercept; /* which intercept */ u8 rep_prefix; /* rep prefix? */ u8 modrm_mod; /* mod part of modrm */ u8 modrm_reg; /* index of register used */ u8 modrm_rm; /* rm part of modrm */ u64 src_val; /* value of source operand */ u64 dst_val; /* value of destination operand */ u8 src_bytes; /* size of source operand */ u8 dst_bytes; /* size of destination operand */ u8 src_type; /* type of source operand */ u8 dst_type; /* type of destination operand */ u8 ad_bytes; /* size of src/dst address */ u64 rip; /* rip of the instruction */ u64 next_rip; /* rip following the instruction */ }; /* * x86_emulate_ops: * * These operations represent the instruction emulator's interface to memory. * There are two categories of operation: those that act on ordinary memory * regions (*_std), and those that act on memory regions known to require * special treatment or emulation (*_emulated). * * The emulator assumes that an instruction accesses only one 'emulated memory' * location, that this location is the given linear faulting address (cr2), and * that this is one of the instruction's data operands. Instruction fetches and * stack operations are assumed never to access emulated memory. The emulator * automatically deduces which operand of a string-move operation is accessing * emulated memory, and assumes that the other operand accesses normal memory. * * NOTES: * 1. The emulator isn't very smart about emulated vs. standard memory. * 'Emulated memory' access addresses should be checked for sanity. * 'Normal memory' accesses may fault, and the caller must arrange to * detect and handle reentrancy into the emulator via recursive faults. * Accesses may be unaligned and may cross page boundaries. * 2. If the access fails (cannot emulate, or a standard access faults) then * it is up to the memop to propagate the fault to the guest VM via * some out-of-band mechanism, unknown to the emulator. The memop signals * failure by returning X86EMUL_PROPAGATE_FAULT to the emulator, which will * then immediately bail. * 3. Valid access sizes are 1, 2, 4 and 8 bytes. On x86/32 systems only * cmpxchg8b_emulated need support 8-byte accesses. * 4. The emulator cannot handle 64-bit mode emulation on an x86/32 system. */ /* Access completed successfully: continue emulation as normal. */ #define X86EMUL_CONTINUE 0 /* Access is unhandleable: bail from emulation and return error to caller. */ #define X86EMUL_UNHANDLEABLE 1 /* Terminate emulation but return success to the caller. */ #define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */ #define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */ #define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */ #define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */ #define X86EMUL_INTERCEPTED 6 /* Intercepted by nested VMCB/VMCS */ /* Emulation during event vectoring is unhandleable. */ #define X86EMUL_UNHANDLEABLE_VECTORING 7 /* x86-specific emulation flags */ #define X86EMUL_F_WRITE BIT(0) #define X86EMUL_F_FETCH BIT(1) #define X86EMUL_F_IMPLICIT BIT(2) #define X86EMUL_F_INVLPG BIT(3) #define X86EMUL_F_MSR BIT(4) #define X86EMUL_F_DT_LOAD BIT(5) struct x86_emulate_ops { void (*vm_bugged)(struct x86_emulate_ctxt *ctxt); /* * read_gpr: read a general purpose register (rax - r15) * * @reg: gpr number. */ ulong (*read_gpr)(struct x86_emulate_ctxt *ctxt, unsigned reg); /* * write_gpr: write a general purpose register (rax - r15) * * @reg: gpr number. * @val: value to write. */ void (*write_gpr)(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val); /* * read_std: Read bytes of standard (non-emulated/special) memory. * Used for descriptor reading. * @addr: [IN ] Linear address from which to read. * @val: [OUT] Value read from memory, zero-extended to 'u_long'. * @bytes: [IN ] Number of bytes to read from memory. * @system:[IN ] Whether the access is forced to be at CPL0. */ int (*read_std)(struct x86_emulate_ctxt *ctxt, unsigned long addr, void *val, unsigned int bytes, struct x86_exception *fault, bool system); /* * write_std: Write bytes of standard (non-emulated/special) memory. * Used for descriptor writing. * @addr: [IN ] Linear address to which to write. * @val: [OUT] Value write to memory, zero-extended to 'u_long'. * @bytes: [IN ] Number of bytes to write to memory. * @system:[IN ] Whether the access is forced to be at CPL0. */ int (*write_std)(struct x86_emulate_ctxt *ctxt, unsigned long addr, void *val, unsigned int bytes, struct x86_exception *fault, bool system); /* * fetch: Read bytes of standard (non-emulated/special) memory. * Used for instruction fetch. * @addr: [IN ] Linear address from which to read. * @val: [OUT] Value read from memory, zero-extended to 'u_long'. * @bytes: [IN ] Number of bytes to read from memory. */ int (*fetch)(struct x86_emulate_ctxt *ctxt, unsigned long addr, void *val, unsigned int bytes, struct x86_exception *fault); /* * read_emulated: Read bytes from emulated/special memory area. * @addr: [IN ] Linear address from which to read. * @val: [OUT] Value read from memory, zero-extended to 'u_long'. * @bytes: [IN ] Number of bytes to read from memory. */ int (*read_emulated)(struct x86_emulate_ctxt *ctxt, unsigned long addr, void *val, unsigned int bytes, struct x86_exception *fault); /* * write_emulated: Write bytes to emulated/special memory area. * @addr: [IN ] Linear address to which to write. * @val: [IN ] Value to write to memory (low-order bytes used as * required). * @bytes: [IN ] Number of bytes to write to memory. */ int (*write_emulated)(struct x86_emulate_ctxt *ctxt, unsigned long addr, const void *val, unsigned int bytes, struct x86_exception *fault); /* * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an * emulated/special memory area. * @addr: [IN ] Linear address to access. * @old: [IN ] Value expected to be current at @addr. * @new: [IN ] Value to write to @addr. * @bytes: [IN ] Number of bytes to access using CMPXCHG. */ int (*cmpxchg_emulated)(struct x86_emulate_ctxt *ctxt, unsigned long addr, const void *old, const void *new, unsigned int bytes, struct x86_exception *fault); void (*invlpg)(struct x86_emulate_ctxt *ctxt, ulong addr); int (*pio_in_emulated)(struct x86_emulate_ctxt *ctxt, int size, unsigned short port, void *val, unsigned int count); int (*pio_out_emulated)(struct x86_emulate_ctxt *ctxt, int size, unsigned short port, const void *val, unsigned int count); bool (*get_segment)(struct x86_emulate_ctxt *ctxt, u16 *selector, struct desc_struct *desc, u32 *base3, int seg); void (*set_segment)(struct x86_emulate_ctxt *ctxt, u16 selector, struct desc_struct *desc, u32 base3, int seg); unsigned long (*get_cached_segment_base)(struct x86_emulate_ctxt *ctxt, int seg); void (*get_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); void (*get_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); void (*set_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr); int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val); int (*cpl)(struct x86_emulate_ctxt *ctxt); ulong (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr); int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); int (*set_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data); int (*get_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata); int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata); int (*check_rdpmc_early)(struct x86_emulate_ctxt *ctxt, u32 pmc); int (*read_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata); void (*halt)(struct x86_emulate_ctxt *ctxt); void (*wbinvd)(struct x86_emulate_ctxt *ctxt); int (*fix_hypercall)(struct x86_emulate_ctxt *ctxt); int (*intercept)(struct x86_emulate_ctxt *ctxt, struct x86_instruction_info *info, enum x86_intercept_stage stage); bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool exact_only); bool (*guest_has_movbe)(struct x86_emulate_ctxt *ctxt); bool (*guest_has_fxsr)(struct x86_emulate_ctxt *ctxt); bool (*guest_has_rdpid)(struct x86_emulate_ctxt *ctxt); bool (*guest_cpuid_is_intel_compatible)(struct x86_emulate_ctxt *ctxt); void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked); bool (*is_smm)(struct x86_emulate_ctxt *ctxt); int (*leave_smm)(struct x86_emulate_ctxt *ctxt); void (*triple_fault)(struct x86_emulate_ctxt *ctxt); int (*get_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 *xcr); int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr); gva_t (*get_untagged_addr)(struct x86_emulate_ctxt *ctxt, gva_t addr, unsigned int flags); bool (*is_canonical_addr)(struct x86_emulate_ctxt *ctxt, gva_t addr, unsigned int flags); bool (*page_address_valid)(struct x86_emulate_ctxt *ctxt, gpa_t gpa); }; /* Type, address-of, and value of an instruction's operand. */ struct operand { enum { OP_REG, OP_MEM, OP_MEM_STR, OP_IMM, OP_XMM, OP_YMM, OP_MM, OP_NONE } type; unsigned int bytes; unsigned int count; union { unsigned long orig_val; u64 orig_val64; }; union { unsigned long *reg; struct segmented_address { ulong ea; unsigned seg; } mem; unsigned xmm; unsigned mm; } addr; union { unsigned long val; u64 val64; char valptr[sizeof(avx256_t)]; sse128_t vec_val; avx256_t vec_val2; u64 mm_val; void *data; } __aligned(32); }; #define X86_MAX_INSTRUCTION_LENGTH 15 struct fetch_cache { u8 data[X86_MAX_INSTRUCTION_LENGTH]; u8 *ptr; u8 *end; }; struct read_cache { u8 data[1024]; unsigned long pos; unsigned long end; }; /* Execution mode, passed to the emulator. */ enum x86emul_mode { X86EMUL_MODE_REAL, /* Real mode. */ X86EMUL_MODE_VM86, /* Virtual 8086 mode. */ X86EMUL_MODE_PROT16, /* 16-bit protected mode. */ X86EMUL_MODE_PROT32, /* 32-bit protected mode. */ X86EMUL_MODE_PROT64, /* 64-bit (long) mode. */ }; /* * fastop functions are declared as taking a never-defined fastop parameter, * so they can't be called from C directly. */ struct fastop; typedef void (*fastop_t)(struct fastop *); /* * The emulator's _regs array tracks only the GPRs, i.e. excludes RIP. RIP is * tracked/accessed via _eip, and except for RIP relative addressing, which * also uses _eip, RIP cannot be a register operand nor can it be an operand in * a ModRM or SIB byte. */ #ifdef CONFIG_X86_64 #define NR_EMULATOR_GPRS 16 #else #define NR_EMULATOR_GPRS 8 #endif /* * Distinguish between no prefix, REX, or in the future REX2. */ enum rex_type { REX_NONE, REX_PREFIX, }; struct x86_emulate_ctxt { void *vcpu; const struct x86_emulate_ops *ops; /* Register state before/after emulation. */ unsigned long eflags; unsigned long eip; /* eip before instruction emulation */ /* Emulated execution mode, represented by an X86EMUL_MODE value. */ enum x86emul_mode mode; /* interruptibility state, as a result of execution of STI or MOV SS */ int interruptibility; bool perm_ok; /* do not check permissions if true */ bool tf; /* TF value before instruction (after for syscall/sysret) */ bool have_exception; struct x86_exception exception; /* GPA available */ bool gpa_available; gpa_t gpa_val; /* * decode cache */ /* current opcode length in bytes */ u8 opcode_len; u8 b; u8 intercept; bool op_prefix; u8 op_bytes; u8 ad_bytes; union { int (*execute)(struct x86_emulate_ctxt *ctxt); fastop_t fop; }; int (*check_perm)(struct x86_emulate_ctxt *ctxt); bool rip_relative; enum rex_type rex_prefix; u8 rex_bits; u8 lock_prefix; u8 rep_prefix; /* bitmaps of registers in _regs[] that can be read */ u16 regs_valid; /* bitmaps of registers in _regs[] that have been written */ u16 regs_dirty; /* modrm */ u8 modrm; u8 modrm_mod; u8 modrm_reg; u8 modrm_rm; u8 modrm_seg; u8 seg_override; u64 d; unsigned long _eip; /* Here begins the usercopy section. */ struct operand src; struct operand src2; struct operand dst; struct operand memop; unsigned long _regs[NR_EMULATOR_GPRS]; struct operand *memopp; struct fetch_cache fetch; struct read_cache io_read; struct read_cache mem_read; bool is_branch; }; #define KVM_EMULATOR_BUG_ON(cond, ctxt) \ ({ \ int __ret = (cond); \ \ if (WARN_ON_ONCE(__ret)) \ ctxt->ops->vm_bugged(ctxt); \ unlikely(__ret); \ }) /* Repeat String Operation Prefix */ #define REPE_PREFIX 0xf3 #define REPNE_PREFIX 0xf2 /* CPUID vendors */ #define X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx 0x68747541 #define X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx 0x444d4163 #define X86EMUL_CPUID_VENDOR_AuthenticAMD_edx 0x69746e65 #define X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx 0x69444d41 #define X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx 0x21726574 #define X86EMUL_CPUID_VENDOR_AMDisbetterI_edx 0x74656273 #define X86EMUL_CPUID_VENDOR_HygonGenuine_ebx 0x6f677948 #define X86EMUL_CPUID_VENDOR_HygonGenuine_ecx 0x656e6975 #define X86EMUL_CPUID_VENDOR_HygonGenuine_edx 0x6e65476e #define X86EMUL_CPUID_VENDOR_GenuineIntel_ebx 0x756e6547 #define X86EMUL_CPUID_VENDOR_GenuineIntel_ecx 0x6c65746e #define X86EMUL_CPUID_VENDOR_GenuineIntel_edx 0x49656e69 #define X86EMUL_CPUID_VENDOR_CentaurHauls_ebx 0x746e6543 #define X86EMUL_CPUID_VENDOR_CentaurHauls_ecx 0x736c7561 #define X86EMUL_CPUID_VENDOR_CentaurHauls_edx 0x48727561 static inline bool is_guest_vendor_intel(u32 ebx, u32 ecx, u32 edx) { return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx; } static inline bool is_guest_vendor_amd(u32 ebx, u32 ecx, u32 edx) { return (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx && ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx && edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) || (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx && ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx && edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx); } static inline bool is_guest_vendor_hygon(u32 ebx, u32 ecx, u32 edx) { return ebx == X86EMUL_CPUID_VENDOR_HygonGenuine_ebx && ecx == X86EMUL_CPUID_VENDOR_HygonGenuine_ecx && edx == X86EMUL_CPUID_VENDOR_HygonGenuine_edx; } enum x86_intercept_stage { X86_ICTP_NONE = 0, /* Allow zero-init to not match anything */ X86_ICPT_PRE_EXCEPT, X86_ICPT_POST_EXCEPT, X86_ICPT_POST_MEMACCESS, }; enum x86_intercept { x86_intercept_none, x86_intercept_cr_read, x86_intercept_cr_write, x86_intercept_clts, x86_intercept_lmsw, x86_intercept_smsw, x86_intercept_dr_read, x86_intercept_dr_write, x86_intercept_lidt, x86_intercept_sidt, x86_intercept_lgdt, x86_intercept_sgdt, x86_intercept_lldt, x86_intercept_sldt, x86_intercept_ltr, x86_intercept_str, x86_intercept_rdtsc, x86_intercept_rdpmc, x86_intercept_pushf, x86_intercept_popf, x86_intercept_cpuid, x86_intercept_rsm, x86_intercept_iret, x86_intercept_intn, x86_intercept_invd, x86_intercept_pause, x86_intercept_hlt, x86_intercept_invlpg, x86_intercept_invlpga, x86_intercept_vmrun, x86_intercept_vmload, x86_intercept_vmsave, x86_intercept_vmmcall, x86_intercept_stgi, x86_intercept_clgi, x86_intercept_skinit, x86_intercept_rdtscp, x86_intercept_rdpid, x86_intercept_icebp, x86_intercept_wbinvd, x86_intercept_monitor, x86_intercept_mwait, x86_intercept_rdmsr, x86_intercept_wrmsr, x86_intercept_in, x86_intercept_ins, x86_intercept_out, x86_intercept_outs, x86_intercept_xsetbv, nr_x86_intercepts }; /* Host execution mode. */ #if defined(CONFIG_X86_32) #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32 #elif defined(CONFIG_X86_64) #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 #endif int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type); bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt); #define EMULATION_FAILED -1 #define EMULATION_OK 0 #define EMULATION_RESTART 1 #define EMULATION_INTERCEPTED 2 void init_decode_cache(struct x86_emulate_ctxt *ctxt); int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, bool check_intercepts); int emulator_task_switch(struct x86_emulate_ctxt *ctxt, u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code); int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq); void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt); void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt); bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt); static inline ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr) { if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt)) nr &= NR_EMULATOR_GPRS - 1; if (!(ctxt->regs_valid & (1 << nr))) { ctxt->regs_valid |= 1 << nr; ctxt->_regs[nr] = ctxt->ops->read_gpr(ctxt, nr); } return ctxt->_regs[nr]; } static inline ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr) { if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt)) nr &= NR_EMULATOR_GPRS - 1; BUILD_BUG_ON(sizeof(ctxt->regs_dirty) * BITS_PER_BYTE < NR_EMULATOR_GPRS); BUILD_BUG_ON(sizeof(ctxt->regs_valid) * BITS_PER_BYTE < NR_EMULATOR_GPRS); ctxt->regs_valid |= 1 << nr; ctxt->regs_dirty |= 1 << nr; return &ctxt->_regs[nr]; } static inline ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr) { reg_read(ctxt, nr); return reg_write(ctxt, nr); } #endif /* _ASM_X86_KVM_X86_EMULATE_H */ |
| 1 1 1 1 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 | /* * linux/fs/hfs/super.c * * Copyright (C) 1995-1997 Paul H. Hargrove * (C) 2003 Ardis Technologies <roman@ardistech.com> * This file may be distributed under the terms of the GNU General Public License. * * This file contains hfs_read_super(), some of the super_ops and * init_hfs_fs() and exit_hfs_fs(). The remaining super_ops are in * inode.c since they deal with inodes. * * Based on the minix file system code, (C) 1991, 1992 by Linus Torvalds */ #include <linux/module.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/mount.h> #include <linux/init.h> #include <linux/nls.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/vfs.h> #include "hfs_fs.h" #include "btree.h" static struct kmem_cache *hfs_inode_cachep; MODULE_DESCRIPTION("Apple Macintosh file system support"); MODULE_LICENSE("GPL"); static int hfs_sync_fs(struct super_block *sb, int wait) { is_hfs_cnid_counts_valid(sb); hfs_mdb_commit(sb); return 0; } /* * hfs_put_super() * * This is the put_super() entry in the super_operations structure for * HFS filesystems. The purpose is to release the resources * associated with the superblock sb. */ static void hfs_put_super(struct super_block *sb) { cancel_delayed_work_sync(&HFS_SB(sb)->mdb_work); hfs_mdb_close(sb); /* release the MDB's resources */ hfs_mdb_put(sb); } static void flush_mdb(struct work_struct *work) { struct hfs_sb_info *sbi; struct super_block *sb; sbi = container_of(work, struct hfs_sb_info, mdb_work.work); sb = sbi->sb; spin_lock(&sbi->work_lock); sbi->work_queued = 0; spin_unlock(&sbi->work_lock); is_hfs_cnid_counts_valid(sb); hfs_mdb_commit(sb); } void hfs_mark_mdb_dirty(struct super_block *sb) { struct hfs_sb_info *sbi = HFS_SB(sb); unsigned long delay; if (sb_rdonly(sb)) return; spin_lock(&sbi->work_lock); if (!sbi->work_queued) { delay = msecs_to_jiffies(dirty_writeback_interval * 10); queue_delayed_work(system_long_wq, &sbi->mdb_work, delay); sbi->work_queued = 1; } spin_unlock(&sbi->work_lock); } /* * hfs_statfs() * * This is the statfs() entry in the super_operations structure for * HFS filesystems. The purpose is to return various data about the * filesystem. * * changed f_files/f_ffree to reflect the fs_ablock/free_ablocks. */ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; u64 id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = HFS_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = (u32)HFS_SB(sb)->fs_ablocks * HFS_SB(sb)->fs_div; buf->f_bfree = (u32)HFS_SB(sb)->free_ablocks * HFS_SB(sb)->fs_div; buf->f_bavail = buf->f_bfree; buf->f_files = HFS_SB(sb)->fs_ablocks; buf->f_ffree = HFS_SB(sb)->free_ablocks; buf->f_fsid = u64_to_fsid(id); buf->f_namelen = HFS_NAMELEN; return 0; } static int hfs_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; sync_filesystem(sb); fc->sb_flags |= SB_NODIRATIME; if ((bool)(fc->sb_flags & SB_RDONLY) == sb_rdonly(sb)) return 0; if (!(fc->sb_flags & SB_RDONLY)) { if (!(HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) { pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended. leaving read-only.\n"); sb->s_flags |= SB_RDONLY; fc->sb_flags |= SB_RDONLY; } else if (HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_SLOCK)) { pr_warn("filesystem is marked locked, leaving read-only.\n"); sb->s_flags |= SB_RDONLY; fc->sb_flags |= SB_RDONLY; } } return 0; } static int hfs_show_options(struct seq_file *seq, struct dentry *root) { struct hfs_sb_info *sbi = HFS_SB(root->d_sb); if (sbi->s_creator != cpu_to_be32(0x3f3f3f3f)) seq_show_option_n(seq, "creator", (char *)&sbi->s_creator, 4); if (sbi->s_type != cpu_to_be32(0x3f3f3f3f)) seq_show_option_n(seq, "type", (char *)&sbi->s_type, 4); seq_printf(seq, ",uid=%u,gid=%u", from_kuid_munged(&init_user_ns, sbi->s_uid), from_kgid_munged(&init_user_ns, sbi->s_gid)); if (sbi->s_file_umask != 0133) seq_printf(seq, ",file_umask=%o", sbi->s_file_umask); if (sbi->s_dir_umask != 0022) seq_printf(seq, ",dir_umask=%o", sbi->s_dir_umask); if (sbi->part >= 0) seq_printf(seq, ",part=%u", sbi->part); if (sbi->session >= 0) seq_printf(seq, ",session=%u", sbi->session); if (sbi->nls_disk) seq_printf(seq, ",codepage=%s", sbi->nls_disk->charset); if (sbi->nls_io) seq_printf(seq, ",iocharset=%s", sbi->nls_io->charset); if (sbi->s_quiet) seq_printf(seq, ",quiet"); return 0; } static struct inode *hfs_alloc_inode(struct super_block *sb) { struct hfs_inode_info *i; i = alloc_inode_sb(sb, hfs_inode_cachep, GFP_KERNEL); return i ? &i->vfs_inode : NULL; } static void hfs_free_inode(struct inode *inode) { kmem_cache_free(hfs_inode_cachep, HFS_I(inode)); } static const struct super_operations hfs_super_operations = { .alloc_inode = hfs_alloc_inode, .free_inode = hfs_free_inode, .write_inode = hfs_write_inode, .evict_inode = hfs_evict_inode, .put_super = hfs_put_super, .sync_fs = hfs_sync_fs, .statfs = hfs_statfs, .show_options = hfs_show_options, }; enum { opt_uid, opt_gid, opt_umask, opt_file_umask, opt_dir_umask, opt_part, opt_session, opt_type, opt_creator, opt_quiet, opt_codepage, opt_iocharset, }; static const struct fs_parameter_spec hfs_param_spec[] = { fsparam_u32 ("uid", opt_uid), fsparam_u32 ("gid", opt_gid), fsparam_u32oct ("umask", opt_umask), fsparam_u32oct ("file_umask", opt_file_umask), fsparam_u32oct ("dir_umask", opt_dir_umask), fsparam_u32 ("part", opt_part), fsparam_u32 ("session", opt_session), fsparam_string ("type", opt_type), fsparam_string ("creator", opt_creator), fsparam_flag ("quiet", opt_quiet), fsparam_string ("codepage", opt_codepage), fsparam_string ("iocharset", opt_iocharset), {} }; /* * hfs_parse_param() * * This function is called by the vfs to parse the mount options. */ static int hfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct hfs_sb_info *hsb = fc->s_fs_info; struct fs_parse_result result; int opt; /* hfs does not honor any fs-specific options on remount */ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) return 0; opt = fs_parse(fc, hfs_param_spec, param, &result); if (opt < 0) return opt; switch (opt) { case opt_uid: hsb->s_uid = result.uid; break; case opt_gid: hsb->s_gid = result.gid; break; case opt_umask: hsb->s_file_umask = (umode_t)result.uint_32; hsb->s_dir_umask = (umode_t)result.uint_32; break; case opt_file_umask: hsb->s_file_umask = (umode_t)result.uint_32; break; case opt_dir_umask: hsb->s_dir_umask = (umode_t)result.uint_32; break; case opt_part: hsb->part = result.uint_32; break; case opt_session: hsb->session = result.uint_32; break; case opt_type: if (strlen(param->string) != 4) { pr_err("type requires a 4 character value\n"); return -EINVAL; } memcpy(&hsb->s_type, param->string, 4); break; case opt_creator: if (strlen(param->string) != 4) { pr_err("creator requires a 4 character value\n"); return -EINVAL; } memcpy(&hsb->s_creator, param->string, 4); break; case opt_quiet: hsb->s_quiet = 1; break; case opt_codepage: if (hsb->nls_disk) { pr_err("unable to change codepage\n"); return -EINVAL; } hsb->nls_disk = load_nls(param->string); if (!hsb->nls_disk) { pr_err("unable to load codepage \"%s\"\n", param->string); return -EINVAL; } break; case opt_iocharset: if (hsb->nls_io) { pr_err("unable to change iocharset\n"); return -EINVAL; } hsb->nls_io = load_nls(param->string); if (!hsb->nls_io) { pr_err("unable to load iocharset \"%s\"\n", param->string); return -EINVAL; } break; default: return -EINVAL; } return 0; } /* * hfs_read_super() * * This is the function that is responsible for mounting an HFS * filesystem. It performs all the tasks necessary to get enough data * from the disk to read the root inode. This includes parsing the * mount options, dealing with Macintosh partitions, reading the * superblock and the allocation bitmap blocks, calling * hfs_btree_init() to get the necessary data about the extents and * catalog B-trees and, finally, reading the root inode into memory. */ static int hfs_fill_super(struct super_block *sb, struct fs_context *fc) { struct hfs_sb_info *sbi = HFS_SB(sb); struct hfs_find_data fd; hfs_cat_rec rec; struct inode *root_inode; int silent = fc->sb_flags & SB_SILENT; int res; atomic64_set(&sbi->file_count, 0); atomic64_set(&sbi->folder_count, 0); atomic64_set(&sbi->next_id, 0); /* load_nls_default does not fail */ if (sbi->nls_disk && !sbi->nls_io) sbi->nls_io = load_nls_default(); sbi->s_dir_umask &= 0777; sbi->s_file_umask &= 0577; spin_lock_init(&sbi->work_lock); INIT_DELAYED_WORK(&sbi->mdb_work, flush_mdb); sbi->sb = sb; sb->s_op = &hfs_super_operations; sb->s_xattr = hfs_xattr_handlers; sb->s_flags |= SB_NODIRATIME; mutex_init(&sbi->bitmap_lock); res = hfs_mdb_get(sb); if (res) { if (!silent) pr_warn("can't find a HFS filesystem on dev %s\n", hfs_mdb_name(sb)); res = -EINVAL; goto bail; } /* try to get the root inode */ res = hfs_find_init(HFS_SB(sb)->cat_tree, &fd); if (res) goto bail_no_root; res = hfs_cat_find_brec(sb, HFS_ROOT_CNID, &fd); if (!res) { if (fd.entrylength != sizeof(rec.dir)) { res = -EIO; goto bail_hfs_find; } hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, fd.entrylength); if (rec.type != HFS_CDR_DIR) res = -EIO; } if (res) goto bail_hfs_find; res = -EINVAL; root_inode = hfs_iget(sb, &fd.search_key->cat, &rec); hfs_find_exit(&fd); if (!root_inode) goto bail_no_root; set_default_d_op(sb, &hfs_dentry_operations); res = -ENOMEM; sb->s_root = d_make_root(root_inode); if (!sb->s_root) goto bail_no_root; /* everything's okay */ return 0; bail_hfs_find: hfs_find_exit(&fd); bail_no_root: pr_err("get root inode failed\n"); bail: hfs_mdb_put(sb); return res; } static int hfs_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, hfs_fill_super); } static void hfs_free_fc(struct fs_context *fc) { kfree(fc->s_fs_info); } static const struct fs_context_operations hfs_context_ops = { .parse_param = hfs_parse_param, .get_tree = hfs_get_tree, .reconfigure = hfs_reconfigure, .free = hfs_free_fc, }; static int hfs_init_fs_context(struct fs_context *fc) { struct hfs_sb_info *hsb; hsb = kzalloc_obj(struct hfs_sb_info); if (!hsb) return -ENOMEM; fc->s_fs_info = hsb; fc->ops = &hfs_context_ops; if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE) { /* initialize options with defaults */ hsb->s_uid = current_uid(); hsb->s_gid = current_gid(); hsb->s_file_umask = 0133; hsb->s_dir_umask = 0022; hsb->s_type = cpu_to_be32(0x3f3f3f3f); /* == '????' */ hsb->s_creator = cpu_to_be32(0x3f3f3f3f); /* == '????' */ hsb->s_quiet = 0; hsb->part = -1; hsb->session = -1; } return 0; } static void hfs_kill_super(struct super_block *sb) { struct hfs_sb_info *hsb = HFS_SB(sb); kill_block_super(sb); kfree(hsb); } static struct file_system_type hfs_fs_type = { .owner = THIS_MODULE, .name = "hfs", .kill_sb = hfs_kill_super, .fs_flags = FS_REQUIRES_DEV, .init_fs_context = hfs_init_fs_context, }; MODULE_ALIAS_FS("hfs"); static void hfs_init_once(void *p) { struct hfs_inode_info *i = p; inode_init_once(&i->vfs_inode); } static int __init init_hfs_fs(void) { int err; hfs_inode_cachep = kmem_cache_create("hfs_inode_cache", sizeof(struct hfs_inode_info), 0, SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, hfs_init_once); if (!hfs_inode_cachep) return -ENOMEM; err = register_filesystem(&hfs_fs_type); if (err) kmem_cache_destroy(hfs_inode_cachep); return err; } static void __exit exit_hfs_fs(void) { unregister_filesystem(&hfs_fs_type); /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(hfs_inode_cachep); } module_init(init_hfs_fs) module_exit(exit_hfs_fs) |
| 2 2 32 786 786 4 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/etherdevice.h> #include <linux/if_macvlan.h> #include <linux/if_tap.h> #include <linux/if_vlan.h> #include <linux/interrupt.h> #include <linux/nsproxy.h> #include <linux/compat.h> #include <linux/if_tun.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/cache.h> #include <linux/sched/signal.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/wait.h> #include <linux/cdev.h> #include <linux/idr.h> #include <linux/fs.h> #include <linux/uio.h> #include <net/net_namespace.h> #include <net/rtnetlink.h> #include <net/sock.h> #include <linux/virtio_net.h> #include <linux/skb_array.h> struct macvtap_dev { struct macvlan_dev vlan; struct tap_dev tap; }; /* * Variables for dealing with macvtaps device numbers. */ static dev_t macvtap_major; static const struct ns_common *macvtap_net_namespace(const struct device *d) { const struct net_device *dev = to_net_dev(d->parent); return to_ns_common(dev_net(dev)); } static struct class macvtap_class = { .name = "macvtap", .ns_type = &net_ns_type_operations, .namespace = macvtap_net_namespace, }; static struct cdev macvtap_cdev; #define TUN_OFFLOADS (NETIF_F_HW_CSUM | NETIF_F_TSO_ECN | NETIF_F_TSO | \ NETIF_F_TSO6) static void macvtap_count_tx_dropped(struct tap_dev *tap) { struct macvtap_dev *vlantap = container_of(tap, struct macvtap_dev, tap); struct macvlan_dev *vlan = &vlantap->vlan; this_cpu_inc(vlan->pcpu_stats->tx_dropped); } static void macvtap_count_rx_dropped(struct tap_dev *tap) { struct macvtap_dev *vlantap = container_of(tap, struct macvtap_dev, tap); struct macvlan_dev *vlan = &vlantap->vlan; macvlan_count_rx(vlan, 0, 0, 0); } static void macvtap_update_features(struct tap_dev *tap, netdev_features_t features) { struct macvtap_dev *vlantap = container_of(tap, struct macvtap_dev, tap); struct macvlan_dev *vlan = &vlantap->vlan; vlan->set_features = features; netdev_update_features(vlan->dev); } static int macvtap_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct macvtap_dev *vlantap = netdev_priv(dev); int err; INIT_LIST_HEAD(&vlantap->tap.queue_list); /* Since macvlan supports all offloads by default, make * tap support all offloads also. */ vlantap->tap.tap_features = TUN_OFFLOADS; /* Register callbacks for rx/tx drops accounting and updating * net_device features */ vlantap->tap.count_tx_dropped = macvtap_count_tx_dropped; vlantap->tap.count_rx_dropped = macvtap_count_rx_dropped; vlantap->tap.update_features = macvtap_update_features; err = netdev_rx_handler_register(dev, tap_handle_frame, &vlantap->tap); if (err) return err; /* Don't put anything that may fail after macvlan_common_newlink * because we can't undo what it does. */ err = macvlan_common_newlink(dev, params, extack); if (err) { netdev_rx_handler_unregister(dev); return err; } vlantap->tap.dev = vlantap->vlan.dev; return 0; } static void macvtap_dellink(struct net_device *dev, struct list_head *head) { struct macvtap_dev *vlantap = netdev_priv(dev); netdev_rx_handler_unregister(dev); tap_del_queues(&vlantap->tap); macvlan_dellink(dev, head); } static void macvtap_setup(struct net_device *dev) { macvlan_common_setup(dev); dev->tx_queue_len = TUN_READQ_SIZE; } static struct net *macvtap_link_net(const struct net_device *dev) { return dev_net(macvlan_dev_real_dev(dev)); } static struct rtnl_link_ops macvtap_link_ops __read_mostly = { .kind = "macvtap", .setup = macvtap_setup, .newlink = macvtap_newlink, .dellink = macvtap_dellink, .get_link_net = macvtap_link_net, .priv_size = sizeof(struct macvtap_dev), }; static int macvtap_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct macvtap_dev *vlantap; struct device *classdev; dev_t devt; int err; char tap_name[IFNAMSIZ]; if (dev->rtnl_link_ops != &macvtap_link_ops) return NOTIFY_DONE; snprintf(tap_name, IFNAMSIZ, "tap%d", dev->ifindex); vlantap = netdev_priv(dev); switch (event) { case NETDEV_REGISTER: /* Create the device node here after the network device has * been registered but before register_netdevice has * finished running. */ err = tap_get_minor(macvtap_major, &vlantap->tap); if (err) return notifier_from_errno(err); devt = MKDEV(MAJOR(macvtap_major), vlantap->tap.minor); classdev = device_create(&macvtap_class, &dev->dev, devt, dev, "%s", tap_name); if (IS_ERR(classdev)) { tap_free_minor(macvtap_major, &vlantap->tap); return notifier_from_errno(PTR_ERR(classdev)); } err = sysfs_create_link(&dev->dev.kobj, &classdev->kobj, tap_name); if (err) return notifier_from_errno(err); break; case NETDEV_UNREGISTER: /* vlan->minor == 0 if NETDEV_REGISTER above failed */ if (vlantap->tap.minor == 0) break; sysfs_remove_link(&dev->dev.kobj, tap_name); devt = MKDEV(MAJOR(macvtap_major), vlantap->tap.minor); device_destroy(&macvtap_class, devt); tap_free_minor(macvtap_major, &vlantap->tap); break; case NETDEV_CHANGE_TX_QUEUE_LEN: if (tap_queue_resize(&vlantap->tap)) return NOTIFY_BAD; break; } return NOTIFY_DONE; } static struct notifier_block macvtap_notifier_block __read_mostly = { .notifier_call = macvtap_device_event, }; static int __init macvtap_init(void) { int err; err = tap_create_cdev(&macvtap_cdev, &macvtap_major, "macvtap", THIS_MODULE); if (err) goto out1; err = class_register(&macvtap_class); if (err) goto out2; err = register_netdevice_notifier(&macvtap_notifier_block); if (err) goto out3; err = macvlan_link_register(&macvtap_link_ops); if (err) goto out4; return 0; out4: unregister_netdevice_notifier(&macvtap_notifier_block); out3: class_unregister(&macvtap_class); out2: tap_destroy_cdev(macvtap_major, &macvtap_cdev); out1: return err; } module_init(macvtap_init); static void __exit macvtap_exit(void) { rtnl_link_unregister(&macvtap_link_ops); unregister_netdevice_notifier(&macvtap_notifier_block); class_unregister(&macvtap_class); tap_destroy_cdev(macvtap_major, &macvtap_cdev); } module_exit(macvtap_exit); MODULE_ALIAS_RTNL_LINK("macvtap"); MODULE_DESCRIPTION("MAC-VLAN based tap driver"); MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>"); MODULE_LICENSE("GPL"); |
| 11 11 11 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 | // SPDX-License-Identifier: GPL-2.0 /* * fs/partitions/sgi.c * * Code extracted from drivers/block/genhd.c */ #include "check.h" #define SGI_LABEL_MAGIC 0x0be5a941 enum { LINUX_RAID_PARTITION = 0xfd, /* autodetect RAID partition */ }; struct sgi_disklabel { __be32 magic_mushroom; /* Big fat spliff... */ __be16 root_part_num; /* Root partition number */ __be16 swap_part_num; /* Swap partition number */ s8 boot_file[16]; /* Name of boot file for ARCS */ u8 _unused0[48]; /* Device parameter useless crapola.. */ struct sgi_volume { s8 name[8]; /* Name of volume */ __be32 block_num; /* Logical block number */ __be32 num_bytes; /* How big, in bytes */ } volume[15]; struct sgi_partition { __be32 num_blocks; /* Size in logical blocks */ __be32 first_block; /* First logical block */ __be32 type; /* Type of this partition */ } partitions[16]; __be32 csum; /* Disk label checksum */ __be32 _unused1; /* Padding */ }; int sgi_partition(struct parsed_partitions *state) { int i, csum; __be32 magic; int slot = 1; unsigned int start, blocks; __be32 *ui, cs; Sector sect; struct sgi_disklabel *label; struct sgi_partition *p; label = read_part_sector(state, 0, §); if (!label) return -1; p = &label->partitions[0]; magic = label->magic_mushroom; if(be32_to_cpu(magic) != SGI_LABEL_MAGIC) { put_dev_sector(sect); return 0; } ui = ((__be32 *) (label + 1)) - 1; for(csum = 0; ui >= ((__be32 *) label);) { cs = *ui--; csum += be32_to_cpu(cs); } if(csum) { printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n", state->disk->disk_name); put_dev_sector(sect); return 0; } /* All SGI disk labels have 16 partitions, disks under Linux only * have 15 minor's. Luckily there are always a few zero length * partitions which we don't care about so we never overflow the * current_minor. */ for(i = 0; i < 16; i++, p++) { blocks = be32_to_cpu(p->num_blocks); start = be32_to_cpu(p->first_block); if (blocks) { put_partition(state, slot, start, blocks); if (be32_to_cpu(p->type) == LINUX_RAID_PARTITION) state->parts[slot].flags = ADDPART_FLAG_RAID; } slot++; } seq_buf_puts(&state->pp_buf, "\n"); put_dev_sector(sect); return 1; } |
| 26 17 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef INTERNAL_IO_SLIST_H #define INTERNAL_IO_SLIST_H #include <linux/io_uring_types.h> #define __wq_list_for_each(pos, head) \ for (pos = (head)->first; pos; pos = (pos)->next) #define wq_list_for_each(pos, prv, head) \ for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next) #define wq_list_empty(list) (READ_ONCE((list)->first) == NULL) #define INIT_WQ_LIST(list) do { \ (list)->first = NULL; \ } while (0) static inline void wq_list_add_after(struct io_wq_work_node *node, struct io_wq_work_node *pos, struct io_wq_work_list *list) { struct io_wq_work_node *next = pos->next; pos->next = node; node->next = next; if (!next) list->last = node; } static inline void wq_list_add_tail(struct io_wq_work_node *node, struct io_wq_work_list *list) { node->next = NULL; if (!list->first) { list->last = node; WRITE_ONCE(list->first, node); } else { list->last->next = node; list->last = node; } } static inline void wq_list_cut(struct io_wq_work_list *list, struct io_wq_work_node *last, struct io_wq_work_node *prev) { /* first in the list, if prev==NULL */ if (!prev) WRITE_ONCE(list->first, last->next); else prev->next = last->next; if (last == list->last) list->last = prev; last->next = NULL; } static inline void wq_stack_add_head(struct io_wq_work_node *node, struct io_wq_work_node *stack) { node->next = stack->next; stack->next = node; } static inline void wq_list_del(struct io_wq_work_list *list, struct io_wq_work_node *node, struct io_wq_work_node *prev) { wq_list_cut(list, node, prev); } static inline struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack) { struct io_wq_work_node *node = stack->next; stack->next = node->next; return node; } static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) { if (!work->list.next) return NULL; return container_of(work->list.next, struct io_wq_work, list); } #endif // INTERNAL_IO_SLIST_H |
| 1 1 1 2 1 1 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) International Business Machines Corp., 2000-2004 * Portions Copyright (C) Christoph Hellwig, 2001-2002 */ #include <linux/fs.h> #include <linux/module.h> #include <linux/completion.h> #include <linux/vfs.h> #include <linux/quotaops.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/moduleparam.h> #include <linux/kthread.h> #include <linux/posix_acl.h> #include <linux/buffer_head.h> #include <linux/exportfs.h> #include <linux/crc32.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/seq_file.h> #include <linux/blkdev.h> #include "jfs_incore.h" #include "jfs_filsys.h" #include "jfs_inode.h" #include "jfs_metapage.h" #include "jfs_superblock.h" #include "jfs_dmap.h" #include "jfs_imap.h" #include "jfs_acl.h" #include "jfs_debug.h" #include "jfs_xattr.h" #include "jfs_dinode.h" MODULE_DESCRIPTION("The Journaled Filesystem (JFS)"); MODULE_AUTHOR("Steve Best/Dave Kleikamp/Barry Arndt, IBM"); MODULE_LICENSE("GPL"); static struct kmem_cache *jfs_inode_cachep; static const struct super_operations jfs_super_operations; static const struct export_operations jfs_export_operations; static struct file_system_type jfs_fs_type; #define MAX_COMMIT_THREADS 64 static int commit_threads; module_param(commit_threads, int, 0); MODULE_PARM_DESC(commit_threads, "Number of commit threads"); static struct task_struct *jfsCommitThread[MAX_COMMIT_THREADS]; struct task_struct *jfsIOthread; struct task_struct *jfsSyncThread; #ifdef CONFIG_JFS_DEBUG int jfsloglevel = JFS_LOGLEVEL_WARN; module_param(jfsloglevel, int, 0644); MODULE_PARM_DESC(jfsloglevel, "Specify JFS loglevel (0, 1 or 2)"); #endif static void jfs_handle_error(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); if (sb_rdonly(sb)) return; updateSuper(sb, FM_DIRTY); if (sbi->flag & JFS_ERR_PANIC) panic("JFS (device %s): panic forced after error\n", sb->s_id); else if (sbi->flag & JFS_ERR_REMOUNT_RO) { jfs_err("ERROR: (device %s): remounting filesystem as read-only", sb->s_id); sb->s_flags |= SB_RDONLY; } /* nothing is done for continue beyond marking the superblock dirty */ } void jfs_error(struct super_block *sb, const char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; pr_err("ERROR: (device %s): %ps: %pV\n", sb->s_id, __builtin_return_address(0), &vaf); va_end(args); jfs_handle_error(sb); } static struct inode *jfs_alloc_inode(struct super_block *sb) { struct jfs_inode_info *jfs_inode; jfs_inode = alloc_inode_sb(sb, jfs_inode_cachep, GFP_NOFS); if (!jfs_inode) return NULL; #ifdef CONFIG_QUOTA memset(&jfs_inode->i_dquot, 0, sizeof(jfs_inode->i_dquot)); #endif return &jfs_inode->vfs_inode; } static void jfs_free_inode(struct inode *inode) { kmem_cache_free(jfs_inode_cachep, JFS_IP(inode)); } static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct jfs_sb_info *sbi = JFS_SBI(dentry->d_sb); s64 maxinodes; struct inomap *imap = JFS_IP(sbi->ipimap)->i_imap; jfs_info("In jfs_statfs"); buf->f_type = JFS_SUPER_MAGIC; buf->f_bsize = sbi->bsize; buf->f_blocks = sbi->bmap->db_mapsize; buf->f_bfree = sbi->bmap->db_nfree; buf->f_bavail = sbi->bmap->db_nfree; /* * If we really return the number of allocated & free inodes, some * applications will fail because they won't see enough free inodes. * We'll try to calculate some guess as to how many inodes we can * really allocate * * buf->f_files = atomic_read(&imap->im_numinos); * buf->f_ffree = atomic_read(&imap->im_numfree); */ maxinodes = min((s64) atomic_read(&imap->im_numinos) + ((sbi->bmap->db_nfree >> imap->im_l2nbperiext) << L2INOSPEREXT), (s64) 0xffffffffLL); buf->f_files = maxinodes; buf->f_ffree = maxinodes - (atomic_read(&imap->im_numinos) - atomic_read(&imap->im_numfree)); buf->f_fsid.val[0] = crc32_le(0, (char *)&sbi->uuid, sizeof(sbi->uuid)/2); buf->f_fsid.val[1] = crc32_le(0, (char *)&sbi->uuid + sizeof(sbi->uuid)/2, sizeof(sbi->uuid)/2); buf->f_namelen = JFS_NAME_MAX; return 0; } #ifdef CONFIG_QUOTA static int jfs_quota_off(struct super_block *sb, int type); static int jfs_quota_on(struct super_block *sb, int type, int format_id, const struct path *path); static void jfs_quota_off_umount(struct super_block *sb) { int type; for (type = 0; type < MAXQUOTAS; type++) jfs_quota_off(sb, type); } static const struct quotactl_ops jfs_quotactl_ops = { .quota_on = jfs_quota_on, .quota_off = jfs_quota_off, .quota_sync = dquot_quota_sync, .get_state = dquot_get_state, .set_info = dquot_set_dqinfo, .get_dqblk = dquot_get_dqblk, .set_dqblk = dquot_set_dqblk, .get_nextdqblk = dquot_get_next_dqblk, }; #else static inline void jfs_quota_off_umount(struct super_block *sb) { } #endif static void jfs_put_super(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); int rc; jfs_info("In jfs_put_super"); jfs_quota_off_umount(sb); rc = jfs_umount(sb); if (rc) jfs_err("jfs_umount failed with return code %d", rc); unload_nls(sbi->nls_tab); truncate_inode_pages(sbi->direct_inode->i_mapping, 0); iput(sbi->direct_inode); kfree(sbi); } enum { Opt_integrity, Opt_nointegrity, Opt_iocharset, Opt_resize, Opt_resize_nosize, Opt_errors, Opt_ignore, Opt_err, Opt_quota, Opt_usrquota, Opt_grpquota, Opt_uid, Opt_gid, Opt_umask, Opt_discard, Opt_nodiscard, Opt_discard_minblk }; static const struct constant_table jfs_param_errors[] = { {"continue", JFS_ERR_CONTINUE}, {"remount-ro", JFS_ERR_REMOUNT_RO}, {"panic", JFS_ERR_PANIC}, {} }; static const struct fs_parameter_spec jfs_param_spec[] = { fsparam_flag_no ("integrity", Opt_integrity), fsparam_string ("iocharset", Opt_iocharset), fsparam_u64 ("resize", Opt_resize), fsparam_flag ("resize", Opt_resize_nosize), fsparam_enum ("errors", Opt_errors, jfs_param_errors), fsparam_flag ("quota", Opt_quota), fsparam_flag ("noquota", Opt_ignore), fsparam_flag ("usrquota", Opt_usrquota), fsparam_flag ("grpquota", Opt_grpquota), fsparam_uid ("uid", Opt_uid), fsparam_gid ("gid", Opt_gid), fsparam_u32oct ("umask", Opt_umask), fsparam_flag ("discard", Opt_discard), fsparam_u32 ("discard", Opt_discard_minblk), fsparam_flag ("nodiscard", Opt_nodiscard), {} }; struct jfs_context { int flag; kuid_t uid; kgid_t gid; uint umask; uint minblks_trim; void *nls_map; bool resize; s64 newLVSize; }; static int jfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct jfs_context *ctx = fc->fs_private; int reconfigure = (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE); struct fs_parse_result result; struct nls_table *nls_map; int opt; opt = fs_parse(fc, jfs_param_spec, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_integrity: if (result.negated) ctx->flag |= JFS_NOINTEGRITY; else ctx->flag &= ~JFS_NOINTEGRITY; break; case Opt_ignore: /* Silently ignore the quota options */ /* Don't do anything ;-) */ break; case Opt_iocharset: if (ctx->nls_map && ctx->nls_map != (void *) -1) { unload_nls(ctx->nls_map); ctx->nls_map = NULL; } if (!strcmp(param->string, "none")) ctx->nls_map = NULL; else { nls_map = load_nls(param->string); if (!nls_map) { pr_err("JFS: charset not found\n"); return -EINVAL; } ctx->nls_map = nls_map; } break; case Opt_resize: if (!reconfigure) return -EINVAL; ctx->resize = true; ctx->newLVSize = result.uint_64; break; case Opt_resize_nosize: if (!reconfigure) return -EINVAL; ctx->resize = true; break; case Opt_errors: ctx->flag &= ~JFS_ERR_MASK; ctx->flag |= result.uint_32; break; #ifdef CONFIG_QUOTA case Opt_quota: case Opt_usrquota: ctx->flag |= JFS_USRQUOTA; break; case Opt_grpquota: ctx->flag |= JFS_GRPQUOTA; break; #else case Opt_usrquota: case Opt_grpquota: case Opt_quota: pr_err("JFS: quota operations not supported\n"); break; #endif case Opt_uid: ctx->uid = result.uid; break; case Opt_gid: ctx->gid = result.gid; break; case Opt_umask: if (result.uint_32 & ~0777) { pr_err("JFS: Invalid value of umask\n"); return -EINVAL; } ctx->umask = result.uint_32; break; case Opt_discard: /* if set to 1, even copying files will cause * trimming :O * -> user has more control over the online trimming */ ctx->minblks_trim = 64; ctx->flag |= JFS_DISCARD; break; case Opt_nodiscard: ctx->flag &= ~JFS_DISCARD; break; case Opt_discard_minblk: ctx->minblks_trim = result.uint_32; ctx->flag |= JFS_DISCARD; break; default: return -EINVAL; } return 0; } static int jfs_reconfigure(struct fs_context *fc) { struct jfs_context *ctx = fc->fs_private; struct super_block *sb = fc->root->d_sb; int readonly = fc->sb_flags & SB_RDONLY; int rc = 0; int flag = ctx->flag; int ret; sync_filesystem(sb); /* Transfer results of parsing to the sbi */ JFS_SBI(sb)->flag = ctx->flag; JFS_SBI(sb)->uid = ctx->uid; JFS_SBI(sb)->gid = ctx->gid; JFS_SBI(sb)->umask = ctx->umask; JFS_SBI(sb)->minblks_trim = ctx->minblks_trim; if (ctx->nls_map != (void *) -1) { unload_nls(JFS_SBI(sb)->nls_tab); JFS_SBI(sb)->nls_tab = ctx->nls_map; } ctx->nls_map = NULL; if (ctx->resize) { if (sb_rdonly(sb)) { pr_err("JFS: resize requires volume to be mounted read-write\n"); return -EROFS; } if (!ctx->newLVSize) { ctx->newLVSize = sb_bdev_nr_blocks(sb); if (ctx->newLVSize == 0) pr_err("JFS: Cannot determine volume size\n"); } rc = jfs_extendfs(sb, ctx->newLVSize, 0); if (rc) return rc; } if (sb_rdonly(sb) && !readonly) { /* * Invalidate any previously read metadata. fsck may have * changed the on-disk data since we mounted r/o */ truncate_inode_pages(JFS_SBI(sb)->direct_inode->i_mapping, 0); JFS_SBI(sb)->flag = flag; ret = jfs_mount_rw(sb, 1); /* mark the fs r/w for quota activity */ sb->s_flags &= ~SB_RDONLY; dquot_resume(sb, -1); return ret; } if (!sb_rdonly(sb) && readonly) { rc = dquot_suspend(sb, -1); if (rc < 0) return rc; rc = jfs_umount_rw(sb); JFS_SBI(sb)->flag = flag; return rc; } if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY)) { if (!sb_rdonly(sb)) { rc = jfs_umount_rw(sb); if (rc) return rc; JFS_SBI(sb)->flag = flag; ret = jfs_mount_rw(sb, 1); return ret; } } JFS_SBI(sb)->flag = flag; return 0; } static int jfs_fill_super(struct super_block *sb, struct fs_context *fc) { struct jfs_context *ctx = fc->fs_private; int silent = fc->sb_flags & SB_SILENT; struct jfs_sb_info *sbi; struct inode *inode; int rc; int ret = -EINVAL; jfs_info("In jfs_read_super: s_flags=0x%lx", sb->s_flags); sbi = kzalloc_obj(struct jfs_sb_info); if (!sbi) return -ENOMEM; sb->s_fs_info = sbi; sb->s_max_links = JFS_LINK_MAX; sb->s_time_min = 0; sb->s_time_max = U32_MAX; sbi->sb = sb; /* Transfer results of parsing to the sbi */ sbi->flag = ctx->flag; sbi->uid = ctx->uid; sbi->gid = ctx->gid; sbi->umask = ctx->umask; if (ctx->nls_map != (void *) -1) { unload_nls(sbi->nls_tab); sbi->nls_tab = ctx->nls_map; } ctx->nls_map = NULL; if (sbi->flag & JFS_DISCARD) { if (!bdev_max_discard_sectors(sb->s_bdev)) { pr_err("JFS: discard option not supported on device\n"); sbi->flag &= ~JFS_DISCARD; } else { sbi->minblks_trim = ctx->minblks_trim; } } #ifdef CONFIG_JFS_POSIX_ACL sb->s_flags |= SB_POSIXACL; #endif if (ctx->resize) { pr_err("resize option for remount only\n"); goto out_unload; } /* * Initialize blocksize to 4K. */ sb_set_blocksize(sb, PSIZE); /* * Set method vectors. */ sb->s_op = &jfs_super_operations; sb->s_export_op = &jfs_export_operations; sb->s_xattr = jfs_xattr_handlers; #ifdef CONFIG_QUOTA sb->dq_op = &dquot_operations; sb->s_qcop = &jfs_quotactl_ops; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; #endif /* * Initialize direct-mapping inode/address-space */ inode = new_inode(sb); if (inode == NULL) { ret = -ENOMEM; goto out_unload; } inode->i_size = bdev_nr_bytes(sb->s_bdev); inode->i_mapping->a_ops = &jfs_metapage_aops; inode_fake_hash(inode); mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); sbi->direct_inode = inode; rc = jfs_mount(sb); if (rc) { if (!silent) jfs_err("jfs_mount failed w/return code = %d", rc); goto out_mount_failed; } if (sb_rdonly(sb)) sbi->log = NULL; else { rc = jfs_mount_rw(sb, 0); if (rc) { if (!silent) { jfs_err("jfs_mount_rw failed, return code = %d", rc); } goto out_no_rw; } } sb->s_magic = JFS_SUPER_MAGIC; if (sbi->mntflag & JFS_OS2) set_default_d_op(sb, &jfs_ci_dentry_operations); inode = jfs_iget(sb, ROOT_I); if (IS_ERR(inode)) { ret = PTR_ERR(inode); goto out_no_rw; } sb->s_root = d_make_root(inode); if (!sb->s_root) goto out_no_root; /* logical blocks are represented by 40 bits in pxd_t, etc. * and page cache is indexed by long */ sb->s_maxbytes = min(((loff_t)sb->s_blocksize) << 40, MAX_LFS_FILESIZE); sb->s_time_gran = 1; return 0; out_no_root: jfs_err("jfs_read_super: get root dentry failed"); out_no_rw: rc = jfs_umount(sb); if (rc) jfs_err("jfs_umount failed with return code %d", rc); out_mount_failed: filemap_write_and_wait(sbi->direct_inode->i_mapping); truncate_inode_pages(sbi->direct_inode->i_mapping, 0); make_bad_inode(sbi->direct_inode); iput(sbi->direct_inode); sbi->direct_inode = NULL; out_unload: unload_nls(sbi->nls_tab); kfree(sbi); return ret; } static int jfs_freeze(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); struct jfs_log *log = sbi->log; int rc = 0; if (!sb_rdonly(sb)) { txQuiesce(sb); rc = lmLogShutdown(log); if (rc) { jfs_error(sb, "lmLogShutdown failed\n"); /* let operations fail rather than hang */ txResume(sb); return rc; } rc = updateSuper(sb, FM_CLEAN); if (rc) { jfs_err("jfs_freeze: updateSuper failed"); /* * Don't fail here. Everything succeeded except * marking the superblock clean, so there's really * no harm in leaving it frozen for now. */ } } return 0; } static int jfs_unfreeze(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); struct jfs_log *log = sbi->log; int rc = 0; if (!sb_rdonly(sb)) { rc = updateSuper(sb, FM_MOUNT); if (rc) { jfs_error(sb, "updateSuper failed\n"); goto out; } rc = lmLogInit(log); if (rc) jfs_error(sb, "lmLogInit failed\n"); out: txResume(sb); } return rc; } static int jfs_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, jfs_fill_super); } static int jfs_sync_fs(struct super_block *sb, int wait) { struct jfs_log *log = JFS_SBI(sb)->log; /* log == NULL indicates read-only mount */ if (log) { /* * Write quota structures to quota file, sync_blockdev() will * write them to disk later */ dquot_writeback_dquots(sb, -1); jfs_flush_journal(log, wait); jfs_syncpt(log, 0); } return 0; } static int jfs_show_options(struct seq_file *seq, struct dentry *root) { struct jfs_sb_info *sbi = JFS_SBI(root->d_sb); if (uid_valid(sbi->uid)) seq_printf(seq, ",uid=%d", from_kuid(&init_user_ns, sbi->uid)); if (gid_valid(sbi->gid)) seq_printf(seq, ",gid=%d", from_kgid(&init_user_ns, sbi->gid)); if (sbi->umask != -1) seq_printf(seq, ",umask=%03o", sbi->umask); if (sbi->flag & JFS_NOINTEGRITY) seq_puts(seq, ",nointegrity"); if (sbi->flag & JFS_DISCARD) seq_printf(seq, ",discard=%u", sbi->minblks_trim); if (sbi->nls_tab) seq_printf(seq, ",iocharset=%s", sbi->nls_tab->charset); if (sbi->flag & JFS_ERR_CONTINUE) seq_printf(seq, ",errors=continue"); if (sbi->flag & JFS_ERR_PANIC) seq_printf(seq, ",errors=panic"); #ifdef CONFIG_QUOTA if (sbi->flag & JFS_USRQUOTA) seq_puts(seq, ",usrquota"); if (sbi->flag & JFS_GRPQUOTA) seq_puts(seq, ",grpquota"); #endif return 0; } #ifdef CONFIG_QUOTA /* Read data from quotafile - avoid pagecache and such because we cannot afford * acquiring the locks... As quota files are never truncated and quota code * itself serializes the operations (and no one else should touch the files) * we don't have to be afraid of races */ static ssize_t jfs_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off) { struct inode *inode = sb_dqopt(sb)->files[type]; sector_t blk = off >> sb->s_blocksize_bits; int err = 0; int offset = off & (sb->s_blocksize - 1); int tocopy; size_t toread; struct buffer_head tmp_bh; struct buffer_head *bh; loff_t i_size = i_size_read(inode); if (off > i_size) return 0; if (off+len > i_size) len = i_size-off; toread = len; while (toread > 0) { tocopy = min_t(size_t, sb->s_blocksize - offset, toread); tmp_bh.b_state = 0; tmp_bh.b_size = i_blocksize(inode); err = jfs_get_block(inode, blk, &tmp_bh, 0); if (err) return err; if (!buffer_mapped(&tmp_bh)) /* A hole? */ memset(data, 0, tocopy); else { bh = sb_bread(sb, tmp_bh.b_blocknr); if (!bh) return -EIO; memcpy(data, bh->b_data+offset, tocopy); brelse(bh); } offset = 0; toread -= tocopy; data += tocopy; blk++; } return len; } /* Write to quotafile */ static ssize_t jfs_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off) { struct inode *inode = sb_dqopt(sb)->files[type]; sector_t blk = off >> sb->s_blocksize_bits; int err = 0; int offset = off & (sb->s_blocksize - 1); int tocopy; size_t towrite = len; struct buffer_head tmp_bh; struct buffer_head *bh; inode_lock(inode); while (towrite > 0) { tocopy = min_t(size_t, sb->s_blocksize - offset, towrite); tmp_bh.b_state = 0; tmp_bh.b_size = i_blocksize(inode); err = jfs_get_block(inode, blk, &tmp_bh, 1); if (err) goto out; if (offset || tocopy != sb->s_blocksize) bh = sb_bread(sb, tmp_bh.b_blocknr); else bh = sb_getblk(sb, tmp_bh.b_blocknr); if (!bh) { err = -EIO; goto out; } lock_buffer(bh); memcpy(bh->b_data+offset, data, tocopy); flush_dcache_folio(bh->b_folio); set_buffer_uptodate(bh); mark_buffer_dirty(bh); unlock_buffer(bh); brelse(bh); offset = 0; towrite -= tocopy; data += tocopy; blk++; } out: if (len == towrite) { inode_unlock(inode); return err; } if (inode->i_size < off+len-towrite) i_size_write(inode, off+len-towrite); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); inode_unlock(inode); return len - towrite; } static struct dquot __rcu **jfs_get_dquots(struct inode *inode) { return JFS_IP(inode)->i_dquot; } static int jfs_quota_on(struct super_block *sb, int type, int format_id, const struct path *path) { int err; struct inode *inode; err = dquot_quota_on(sb, type, format_id, path); if (err) return err; inode = d_inode(path->dentry); inode_lock(inode); JFS_IP(inode)->mode2 |= JFS_NOATIME_FL | JFS_IMMUTABLE_FL; inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, S_NOATIME | S_IMMUTABLE); inode_unlock(inode); mark_inode_dirty(inode); return 0; } static int jfs_quota_off(struct super_block *sb, int type) { struct inode *inode = sb_dqopt(sb)->files[type]; int err; if (!inode || !igrab(inode)) goto out; err = dquot_quota_off(sb, type); if (err) goto out_put; inode_lock(inode); JFS_IP(inode)->mode2 &= ~(JFS_NOATIME_FL | JFS_IMMUTABLE_FL); inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); inode_unlock(inode); mark_inode_dirty(inode); out_put: iput(inode); return err; out: return dquot_quota_off(sb, type); } #endif static const struct super_operations jfs_super_operations = { .alloc_inode = jfs_alloc_inode, .free_inode = jfs_free_inode, .dirty_inode = jfs_dirty_inode, .write_inode = jfs_write_inode, .evict_inode = jfs_evict_inode, .put_super = jfs_put_super, .sync_fs = jfs_sync_fs, .freeze_fs = jfs_freeze, .unfreeze_fs = jfs_unfreeze, .statfs = jfs_statfs, .show_options = jfs_show_options, #ifdef CONFIG_QUOTA .quota_read = jfs_quota_read, .quota_write = jfs_quota_write, .get_dquots = jfs_get_dquots, #endif }; static const struct export_operations jfs_export_operations = { .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = jfs_fh_to_dentry, .fh_to_parent = jfs_fh_to_parent, .get_parent = jfs_get_parent, }; static void jfs_init_options(struct fs_context *fc, struct jfs_context *ctx) { if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { struct super_block *sb = fc->root->d_sb; /* Copy over current option values and mount flags */ ctx->uid = JFS_SBI(sb)->uid; ctx->gid = JFS_SBI(sb)->gid; ctx->umask = JFS_SBI(sb)->umask; ctx->nls_map = (void *)-1; ctx->minblks_trim = JFS_SBI(sb)->minblks_trim; ctx->flag = JFS_SBI(sb)->flag; } else { /* * Initialize the mount flag and determine the default * error handler */ ctx->flag = JFS_ERR_REMOUNT_RO; ctx->uid = INVALID_UID; ctx->gid = INVALID_GID; ctx->umask = -1; ctx->nls_map = (void *)-1; } } static void jfs_free_fc(struct fs_context *fc) { struct jfs_context *ctx = fc->fs_private; if (ctx->nls_map != (void *) -1) unload_nls(ctx->nls_map); kfree(ctx); } static const struct fs_context_operations jfs_context_ops = { .parse_param = jfs_parse_param, .get_tree = jfs_get_tree, .reconfigure = jfs_reconfigure, .free = jfs_free_fc, }; static int jfs_init_fs_context(struct fs_context *fc) { struct jfs_context *ctx; ctx = kzalloc_obj(*ctx); if (!ctx) return -ENOMEM; jfs_init_options(fc, ctx); fc->fs_private = ctx; fc->ops = &jfs_context_ops; return 0; } static struct file_system_type jfs_fs_type = { .owner = THIS_MODULE, .name = "jfs", .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, .init_fs_context = jfs_init_fs_context, .parameters = jfs_param_spec, }; MODULE_ALIAS_FS("jfs"); static void init_once(void *foo) { struct jfs_inode_info *jfs_ip = (struct jfs_inode_info *) foo; memset(jfs_ip, 0, sizeof(struct jfs_inode_info)); INIT_LIST_HEAD(&jfs_ip->anon_inode_list); init_rwsem(&jfs_ip->rdwrlock); mutex_init(&jfs_ip->commit_mutex); init_rwsem(&jfs_ip->xattr_sem); spin_lock_init(&jfs_ip->ag_lock); jfs_ip->active_ag = -1; inode_init_once(&jfs_ip->vfs_inode); } static int __init init_jfs_fs(void) { int i; int rc; jfs_inode_cachep = kmem_cache_create_usercopy("jfs_ip", sizeof(struct jfs_inode_info), 0, SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, offsetof(struct jfs_inode_info, i_inline_all), sizeof_field(struct jfs_inode_info, i_inline_all), init_once); if (jfs_inode_cachep == NULL) return -ENOMEM; /* * Metapage initialization */ rc = metapage_init(); if (rc) { jfs_err("metapage_init failed w/rc = %d", rc); goto free_slab; } /* * Transaction Manager initialization */ rc = txInit(); if (rc) { jfs_err("txInit failed w/rc = %d", rc); goto free_metapage; } /* * I/O completion thread (endio) */ jfsIOthread = kthread_run(jfsIOWait, NULL, "jfsIO"); if (IS_ERR(jfsIOthread)) { rc = PTR_ERR(jfsIOthread); jfs_err("init_jfs_fs: fork failed w/rc = %d", rc); goto end_txmngr; } if (commit_threads < 1) commit_threads = num_online_cpus(); if (commit_threads > MAX_COMMIT_THREADS) commit_threads = MAX_COMMIT_THREADS; for (i = 0; i < commit_threads; i++) { jfsCommitThread[i] = kthread_run(jfs_lazycommit, NULL, "jfsCommit"); if (IS_ERR(jfsCommitThread[i])) { rc = PTR_ERR(jfsCommitThread[i]); jfs_err("init_jfs_fs: fork failed w/rc = %d", rc); commit_threads = i; goto kill_committask; } } jfsSyncThread = kthread_run(jfs_sync, NULL, "jfsSync"); if (IS_ERR(jfsSyncThread)) { rc = PTR_ERR(jfsSyncThread); jfs_err("init_jfs_fs: fork failed w/rc = %d", rc); goto kill_committask; } #ifdef PROC_FS_JFS jfs_proc_init(); #endif rc = register_filesystem(&jfs_fs_type); if (!rc) return 0; #ifdef PROC_FS_JFS jfs_proc_clean(); #endif kthread_stop(jfsSyncThread); kill_committask: for (i = 0; i < commit_threads; i++) kthread_stop(jfsCommitThread[i]); kthread_stop(jfsIOthread); end_txmngr: txExit(); free_metapage: metapage_exit(); free_slab: kmem_cache_destroy(jfs_inode_cachep); return rc; } static void __exit exit_jfs_fs(void) { int i; jfs_info("exit_jfs_fs called"); txExit(); metapage_exit(); kthread_stop(jfsIOthread); for (i = 0; i < commit_threads; i++) kthread_stop(jfsCommitThread[i]); kthread_stop(jfsSyncThread); #ifdef PROC_FS_JFS jfs_proc_clean(); #endif unregister_filesystem(&jfs_fs_type); /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(jfs_inode_cachep); } module_init(init_jfs_fs) module_exit(exit_jfs_fs) |
| 1 1 1 1 1 1 9 9 9 8 8 8 8 8 8 8 8 1 9 8 8 8 11 11 11 11 11 11 11 11 2 2 11 11 10 8 8 8 8 8 8 8 8 8 8 8 11 11 11 2 2 11 11 2 11 11 11 17 1 1 1 1 11 2 11 2 2 2 2 2 2 2 2 1 1 1 2 2 3 3 1 1 3 3 3 1 1 4 4 1 1 4 4 1 1 1 2 2 2 2 2 2 7 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 | // SPDX-License-Identifier: GPL-2.0-only /* * VMware VMCI Driver * * Copyright (C) 2012 VMware, Inc. All rights reserved. */ #include <linux/vmw_vmci_defs.h> #include <linux/vmw_vmci_api.h> #include <linux/highmem.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/sched.h> #include <linux/cred.h> #include <linux/slab.h> #include "vmci_queue_pair.h" #include "vmci_datagram.h" #include "vmci_doorbell.h" #include "vmci_context.h" #include "vmci_driver.h" #include "vmci_event.h" /* Use a wide upper bound for the maximum contexts. */ #define VMCI_MAX_CONTEXTS 2000 /* * List of current VMCI contexts. Contexts can be added by * vmci_ctx_create() and removed via vmci_ctx_destroy(). * These, along with context lookup, are protected by the * list structure's lock. */ static struct { struct list_head head; spinlock_t lock; /* Spinlock for context list operations */ } ctx_list = { .head = LIST_HEAD_INIT(ctx_list.head), .lock = __SPIN_LOCK_UNLOCKED(ctx_list.lock), }; /* Used by contexts that did not set up notify flag pointers */ static bool ctx_dummy_notify; static void ctx_signal_notify(struct vmci_ctx *context) { *context->notify = true; } static void ctx_clear_notify(struct vmci_ctx *context) { *context->notify = false; } /* * If nothing requires the attention of the guest, clears both * notify flag and call. */ static void ctx_clear_notify_call(struct vmci_ctx *context) { if (context->pending_datagrams == 0 && vmci_handle_arr_get_size(context->pending_doorbell_array) == 0) ctx_clear_notify(context); } /* * Sets the context's notify flag iff datagrams are pending for this * context. Called from vmci_setup_notify(). */ void vmci_ctx_check_signal_notify(struct vmci_ctx *context) { spin_lock(&context->lock); if (context->pending_datagrams) ctx_signal_notify(context); spin_unlock(&context->lock); } /* * Allocates and initializes a VMCI context. */ struct vmci_ctx *vmci_ctx_create(u32 cid, u32 priv_flags, uintptr_t event_hnd, int user_version, const struct cred *cred) { struct vmci_ctx *context; int error; if (cid == VMCI_INVALID_ID) { pr_devel("Invalid context ID for VMCI context\n"); error = -EINVAL; goto err_out; } if (priv_flags & ~VMCI_PRIVILEGE_ALL_FLAGS) { pr_devel("Invalid flag (flags=0x%x) for VMCI context\n", priv_flags); error = -EINVAL; goto err_out; } if (user_version == 0) { pr_devel("Invalid suer_version %d\n", user_version); error = -EINVAL; goto err_out; } context = kzalloc_obj(*context); if (!context) { pr_warn("Failed to allocate memory for VMCI context\n"); error = -ENOMEM; goto err_out; } kref_init(&context->kref); spin_lock_init(&context->lock); INIT_LIST_HEAD(&context->list_item); INIT_LIST_HEAD(&context->datagram_queue); INIT_LIST_HEAD(&context->notifier_list); /* Initialize host-specific VMCI context. */ init_waitqueue_head(&context->host_context.wait_queue); context->queue_pair_array = vmci_handle_arr_create(0, VMCI_MAX_GUEST_QP_COUNT); if (!context->queue_pair_array) { error = -ENOMEM; goto err_free_ctx; } context->doorbell_array = vmci_handle_arr_create(0, VMCI_MAX_GUEST_DOORBELL_COUNT); if (!context->doorbell_array) { error = -ENOMEM; goto err_free_qp_array; } context->pending_doorbell_array = vmci_handle_arr_create(0, VMCI_MAX_GUEST_DOORBELL_COUNT); if (!context->pending_doorbell_array) { error = -ENOMEM; goto err_free_db_array; } context->user_version = user_version; context->priv_flags = priv_flags; if (cred) context->cred = get_cred(cred); context->notify = &ctx_dummy_notify; context->notify_page = NULL; /* * If we collide with an existing context we generate a new * and use it instead. The VMX will determine if regeneration * is okay. Since there isn't 4B - 16 VMs running on a given * host, the below loop will terminate. */ spin_lock(&ctx_list.lock); while (vmci_ctx_exists(cid)) { /* We reserve the lowest 16 ids for fixed contexts. */ cid = max(cid, VMCI_RESERVED_CID_LIMIT - 1) + 1; if (cid == VMCI_INVALID_ID) cid = VMCI_RESERVED_CID_LIMIT; } context->cid = cid; list_add_tail_rcu(&context->list_item, &ctx_list.head); spin_unlock(&ctx_list.lock); return context; err_free_db_array: vmci_handle_arr_destroy(context->doorbell_array); err_free_qp_array: vmci_handle_arr_destroy(context->queue_pair_array); err_free_ctx: kfree(context); err_out: return ERR_PTR(error); } /* * Destroy VMCI context. */ void vmci_ctx_destroy(struct vmci_ctx *context) { spin_lock(&ctx_list.lock); list_del_rcu(&context->list_item); spin_unlock(&ctx_list.lock); synchronize_rcu(); vmci_ctx_put(context); } /* * Fire notification for all contexts interested in given cid. */ static int ctx_fire_notification(u32 context_id, u32 priv_flags) { u32 i, array_size; struct vmci_ctx *sub_ctx; struct vmci_handle_arr *subscriber_array; struct vmci_handle context_handle = vmci_make_handle(context_id, VMCI_EVENT_HANDLER); /* * We create an array to hold the subscribers we find when * scanning through all contexts. */ subscriber_array = vmci_handle_arr_create(0, VMCI_MAX_CONTEXTS); if (subscriber_array == NULL) return VMCI_ERROR_NO_MEM; /* * Scan all contexts to find who is interested in being * notified about given contextID. */ rcu_read_lock(); list_for_each_entry_rcu(sub_ctx, &ctx_list.head, list_item) { struct vmci_handle_list *node; /* * We only deliver notifications of the removal of * contexts, if the two contexts are allowed to * interact. */ if (vmci_deny_interaction(priv_flags, sub_ctx->priv_flags)) continue; list_for_each_entry_rcu(node, &sub_ctx->notifier_list, node) { if (!vmci_handle_is_equal(node->handle, context_handle)) continue; vmci_handle_arr_append_entry(&subscriber_array, vmci_make_handle(sub_ctx->cid, VMCI_EVENT_HANDLER)); } } rcu_read_unlock(); /* Fire event to all subscribers. */ array_size = vmci_handle_arr_get_size(subscriber_array); for (i = 0; i < array_size; i++) { int result; struct vmci_event_ctx ev; ev.msg.hdr.dst = vmci_handle_arr_get_entry(subscriber_array, i); ev.msg.hdr.src = vmci_make_handle(VMCI_HYPERVISOR_CONTEXT_ID, VMCI_CONTEXT_RESOURCE_ID); ev.msg.hdr.payload_size = sizeof(ev) - sizeof(ev.msg.hdr); memset((char*)&ev + sizeof(ev.msg.hdr), 0, ev.msg.hdr.payload_size); ev.msg.event_data.event = VMCI_EVENT_CTX_REMOVED; ev.payload.context_id = context_id; result = vmci_datagram_dispatch(VMCI_HYPERVISOR_CONTEXT_ID, &ev.msg.hdr, false); if (result < VMCI_SUCCESS) { pr_devel("Failed to enqueue event datagram (type=%d) for context (ID=0x%x)\n", ev.msg.event_data.event, ev.msg.hdr.dst.context); /* We continue to enqueue on next subscriber. */ } } vmci_handle_arr_destroy(subscriber_array); return VMCI_SUCCESS; } /* * Queues a VMCI datagram for the appropriate target VM context. */ int vmci_ctx_enqueue_datagram(u32 cid, struct vmci_datagram *dg) { struct vmci_datagram_queue_entry *dq_entry; struct vmci_ctx *context; struct vmci_handle dg_src; size_t vmci_dg_size; vmci_dg_size = VMCI_DG_SIZE(dg); if (vmci_dg_size > VMCI_MAX_DG_SIZE) { pr_devel("Datagram too large (bytes=%zu)\n", vmci_dg_size); return VMCI_ERROR_INVALID_ARGS; } /* Get the target VM's VMCI context. */ context = vmci_ctx_get(cid); if (!context) { pr_devel("Invalid context (ID=0x%x)\n", cid); return VMCI_ERROR_INVALID_ARGS; } /* Allocate guest call entry and add it to the target VM's queue. */ dq_entry = kmalloc_obj(*dq_entry); if (dq_entry == NULL) { pr_warn("Failed to allocate memory for datagram\n"); vmci_ctx_put(context); return VMCI_ERROR_NO_MEM; } dq_entry->dg = dg; dq_entry->dg_size = vmci_dg_size; dg_src = dg->src; INIT_LIST_HEAD(&dq_entry->list_item); spin_lock(&context->lock); /* * We put a higher limit on datagrams from the hypervisor. If * the pending datagram is not from hypervisor, then we check * if enqueueing it would exceed the * VMCI_MAX_DATAGRAM_QUEUE_SIZE limit on the destination. If * the pending datagram is from hypervisor, we allow it to be * queued at the destination side provided we don't reach the * VMCI_MAX_DATAGRAM_AND_EVENT_QUEUE_SIZE limit. */ if (context->datagram_queue_size + vmci_dg_size >= VMCI_MAX_DATAGRAM_QUEUE_SIZE && (!vmci_handle_is_equal(dg_src, vmci_make_handle (VMCI_HYPERVISOR_CONTEXT_ID, VMCI_CONTEXT_RESOURCE_ID)) || context->datagram_queue_size + vmci_dg_size >= VMCI_MAX_DATAGRAM_AND_EVENT_QUEUE_SIZE)) { spin_unlock(&context->lock); vmci_ctx_put(context); kfree(dq_entry); pr_devel("Context (ID=0x%x) receive queue is full\n", cid); return VMCI_ERROR_NO_RESOURCES; } list_add(&dq_entry->list_item, &context->datagram_queue); context->pending_datagrams++; context->datagram_queue_size += vmci_dg_size; ctx_signal_notify(context); wake_up(&context->host_context.wait_queue); spin_unlock(&context->lock); vmci_ctx_put(context); return vmci_dg_size; } /* * Verifies whether a context with the specified context ID exists. * FIXME: utility is dubious as no decisions can be reliably made * using this data as context can appear and disappear at any time. */ bool vmci_ctx_exists(u32 cid) { struct vmci_ctx *context; bool exists = false; rcu_read_lock(); list_for_each_entry_rcu(context, &ctx_list.head, list_item) { if (context->cid == cid) { exists = true; break; } } rcu_read_unlock(); return exists; } /* * Retrieves VMCI context corresponding to the given cid. */ struct vmci_ctx *vmci_ctx_get(u32 cid) { struct vmci_ctx *c, *context = NULL; if (cid == VMCI_INVALID_ID) return NULL; rcu_read_lock(); list_for_each_entry_rcu(c, &ctx_list.head, list_item) { if (c->cid == cid) { /* * The context owner drops its own reference to the * context only after removing it from the list and * waiting for RCU grace period to expire. This * means that we are not about to increase the * reference count of something that is in the * process of being destroyed. */ context = c; kref_get(&context->kref); break; } } rcu_read_unlock(); return context; } /* * Deallocates all parts of a context data structure. This * function doesn't lock the context, because it assumes that * the caller was holding the last reference to context. */ static void ctx_free_ctx(struct kref *kref) { struct vmci_ctx *context = container_of(kref, struct vmci_ctx, kref); struct vmci_datagram_queue_entry *dq_entry, *dq_entry_tmp; struct vmci_handle temp_handle; struct vmci_handle_list *notifier, *tmp; /* * Fire event to all contexts interested in knowing this * context is dying. */ ctx_fire_notification(context->cid, context->priv_flags); /* * Cleanup all queue pair resources attached to context. If * the VM dies without cleaning up, this code will make sure * that no resources are leaked. */ temp_handle = vmci_handle_arr_get_entry(context->queue_pair_array, 0); while (!vmci_handle_is_equal(temp_handle, VMCI_INVALID_HANDLE)) { if (vmci_qp_broker_detach(temp_handle, context) < VMCI_SUCCESS) { /* * When vmci_qp_broker_detach() succeeds it * removes the handle from the array. If * detach fails, we must remove the handle * ourselves. */ vmci_handle_arr_remove_entry(context->queue_pair_array, temp_handle); } temp_handle = vmci_handle_arr_get_entry(context->queue_pair_array, 0); } /* * It is fine to destroy this without locking the callQueue, as * this is the only thread having a reference to the context. */ list_for_each_entry_safe(dq_entry, dq_entry_tmp, &context->datagram_queue, list_item) { WARN_ON(dq_entry->dg_size != VMCI_DG_SIZE(dq_entry->dg)); list_del(&dq_entry->list_item); kfree(dq_entry->dg); kfree(dq_entry); } list_for_each_entry_safe(notifier, tmp, &context->notifier_list, node) { list_del(¬ifier->node); kfree(notifier); } vmci_handle_arr_destroy(context->queue_pair_array); vmci_handle_arr_destroy(context->doorbell_array); vmci_handle_arr_destroy(context->pending_doorbell_array); vmci_ctx_unset_notify(context); if (context->cred) put_cred(context->cred); kfree(context); } /* * Drops reference to VMCI context. If this is the last reference to * the context it will be deallocated. A context is created with * a reference count of one, and on destroy, it is removed from * the context list before its reference count is decremented. Thus, * if we reach zero, we are sure that nobody else are about to increment * it (they need the entry in the context list for that), and so there * is no need for locking. */ void vmci_ctx_put(struct vmci_ctx *context) { kref_put(&context->kref, ctx_free_ctx); } /* * Dequeues the next datagram and returns it to caller. * The caller passes in a pointer to the max size datagram * it can handle and the datagram is only unqueued if the * size is less than max_size. If larger max_size is set to * the size of the datagram to give the caller a chance to * set up a larger buffer for the guestcall. */ int vmci_ctx_dequeue_datagram(struct vmci_ctx *context, size_t *max_size, struct vmci_datagram **dg) { struct vmci_datagram_queue_entry *dq_entry; struct list_head *list_item; int rv; /* Dequeue the next datagram entry. */ spin_lock(&context->lock); if (context->pending_datagrams == 0) { ctx_clear_notify_call(context); spin_unlock(&context->lock); pr_devel("No datagrams pending\n"); return VMCI_ERROR_NO_MORE_DATAGRAMS; } list_item = context->datagram_queue.next; dq_entry = list_entry(list_item, struct vmci_datagram_queue_entry, list_item); /* Check size of caller's buffer. */ if (*max_size < dq_entry->dg_size) { *max_size = dq_entry->dg_size; spin_unlock(&context->lock); pr_devel("Caller's buffer should be at least (size=%u bytes)\n", (u32) *max_size); return VMCI_ERROR_NO_MEM; } list_del(list_item); context->pending_datagrams--; context->datagram_queue_size -= dq_entry->dg_size; if (context->pending_datagrams == 0) { ctx_clear_notify_call(context); rv = VMCI_SUCCESS; } else { /* * Return the size of the next datagram. */ struct vmci_datagram_queue_entry *next_entry; list_item = context->datagram_queue.next; next_entry = list_entry(list_item, struct vmci_datagram_queue_entry, list_item); /* * The following size_t -> int truncation is fine as * the maximum size of a (routable) datagram is 68KB. */ rv = (int)next_entry->dg_size; } spin_unlock(&context->lock); /* Caller must free datagram. */ *dg = dq_entry->dg; dq_entry->dg = NULL; kfree(dq_entry); return rv; } /* * Reverts actions set up by vmci_setup_notify(). Unmaps and unlocks the * page mapped/locked by vmci_setup_notify(). */ void vmci_ctx_unset_notify(struct vmci_ctx *context) { struct page *notify_page; spin_lock(&context->lock); notify_page = context->notify_page; context->notify = &ctx_dummy_notify; context->notify_page = NULL; spin_unlock(&context->lock); if (notify_page) { kunmap(notify_page); put_page(notify_page); } } /* * Add remote_cid to list of contexts current contexts wants * notifications from/about. */ int vmci_ctx_add_notification(u32 context_id, u32 remote_cid) { struct vmci_ctx *context; struct vmci_handle_list *notifier, *n; int result; bool exists = false; context = vmci_ctx_get(context_id); if (!context) return VMCI_ERROR_NOT_FOUND; if (VMCI_CONTEXT_IS_VM(context_id) && VMCI_CONTEXT_IS_VM(remote_cid)) { pr_devel("Context removed notifications for other VMs not supported (src=0x%x, remote=0x%x)\n", context_id, remote_cid); result = VMCI_ERROR_DST_UNREACHABLE; goto out; } if (context->priv_flags & VMCI_PRIVILEGE_FLAG_RESTRICTED) { result = VMCI_ERROR_NO_ACCESS; goto out; } notifier = kmalloc_obj(struct vmci_handle_list); if (!notifier) { result = VMCI_ERROR_NO_MEM; goto out; } INIT_LIST_HEAD(¬ifier->node); notifier->handle = vmci_make_handle(remote_cid, VMCI_EVENT_HANDLER); spin_lock(&context->lock); if (context->n_notifiers < VMCI_MAX_CONTEXTS) { list_for_each_entry(n, &context->notifier_list, node) { if (vmci_handle_is_equal(n->handle, notifier->handle)) { exists = true; break; } } if (exists) { kfree(notifier); result = VMCI_ERROR_ALREADY_EXISTS; } else { list_add_tail_rcu(¬ifier->node, &context->notifier_list); context->n_notifiers++; result = VMCI_SUCCESS; } } else { kfree(notifier); result = VMCI_ERROR_NO_MEM; } spin_unlock(&context->lock); out: vmci_ctx_put(context); return result; } /* * Remove remote_cid from current context's list of contexts it is * interested in getting notifications from/about. */ int vmci_ctx_remove_notification(u32 context_id, u32 remote_cid) { struct vmci_ctx *context; struct vmci_handle_list *notifier = NULL, *iter, *tmp; struct vmci_handle handle; context = vmci_ctx_get(context_id); if (!context) return VMCI_ERROR_NOT_FOUND; handle = vmci_make_handle(remote_cid, VMCI_EVENT_HANDLER); spin_lock(&context->lock); list_for_each_entry_safe(iter, tmp, &context->notifier_list, node) { if (vmci_handle_is_equal(iter->handle, handle)) { list_del_rcu(&iter->node); context->n_notifiers--; notifier = iter; break; } } spin_unlock(&context->lock); if (notifier) kvfree_rcu_mightsleep(notifier); vmci_ctx_put(context); return notifier ? VMCI_SUCCESS : VMCI_ERROR_NOT_FOUND; } static int vmci_ctx_get_chkpt_notifiers(struct vmci_ctx *context, u32 *buf_size, void **pbuf) { u32 *notifiers; size_t data_size; struct vmci_handle_list *entry; int i = 0; if (context->n_notifiers == 0) { *buf_size = 0; *pbuf = NULL; return VMCI_SUCCESS; } data_size = context->n_notifiers * sizeof(*notifiers); if (*buf_size < data_size) { *buf_size = data_size; return VMCI_ERROR_MORE_DATA; } notifiers = kmalloc(data_size, GFP_ATOMIC); /* FIXME: want GFP_KERNEL */ if (!notifiers) return VMCI_ERROR_NO_MEM; list_for_each_entry(entry, &context->notifier_list, node) notifiers[i++] = entry->handle.context; *buf_size = data_size; *pbuf = notifiers; return VMCI_SUCCESS; } static int vmci_ctx_get_chkpt_doorbells(struct vmci_ctx *context, u32 *buf_size, void **pbuf) { struct dbell_cpt_state *dbells; u32 i, n_doorbells; n_doorbells = vmci_handle_arr_get_size(context->doorbell_array); if (n_doorbells > 0) { size_t data_size = n_doorbells * sizeof(*dbells); if (*buf_size < data_size) { *buf_size = data_size; return VMCI_ERROR_MORE_DATA; } dbells = kzalloc(data_size, GFP_ATOMIC); if (!dbells) return VMCI_ERROR_NO_MEM; for (i = 0; i < n_doorbells; i++) dbells[i].handle = vmci_handle_arr_get_entry( context->doorbell_array, i); *buf_size = data_size; *pbuf = dbells; } else { *buf_size = 0; *pbuf = NULL; } return VMCI_SUCCESS; } /* * Get current context's checkpoint state of given type. */ int vmci_ctx_get_chkpt_state(u32 context_id, u32 cpt_type, u32 *buf_size, void **pbuf) { struct vmci_ctx *context; int result; context = vmci_ctx_get(context_id); if (!context) return VMCI_ERROR_NOT_FOUND; spin_lock(&context->lock); switch (cpt_type) { case VMCI_NOTIFICATION_CPT_STATE: result = vmci_ctx_get_chkpt_notifiers(context, buf_size, pbuf); break; case VMCI_WELLKNOWN_CPT_STATE: /* * For compatibility with VMX'en with VM to VM communication, we * always return zero wellknown handles. */ *buf_size = 0; *pbuf = NULL; result = VMCI_SUCCESS; break; case VMCI_DOORBELL_CPT_STATE: result = vmci_ctx_get_chkpt_doorbells(context, buf_size, pbuf); break; default: pr_devel("Invalid cpt state (type=%d)\n", cpt_type); result = VMCI_ERROR_INVALID_ARGS; break; } spin_unlock(&context->lock); vmci_ctx_put(context); return result; } /* * Set current context's checkpoint state of given type. */ int vmci_ctx_set_chkpt_state(u32 context_id, u32 cpt_type, u32 buf_size, void *cpt_buf) { u32 i; u32 current_id; int result = VMCI_SUCCESS; u32 num_ids = buf_size / sizeof(u32); if (cpt_type == VMCI_WELLKNOWN_CPT_STATE && num_ids > 0) { /* * We would end up here if VMX with VM to VM communication * attempts to restore a checkpoint with wellknown handles. */ pr_warn("Attempt to restore checkpoint with obsolete wellknown handles\n"); return VMCI_ERROR_OBSOLETE; } if (cpt_type != VMCI_NOTIFICATION_CPT_STATE) { pr_devel("Invalid cpt state (type=%d)\n", cpt_type); return VMCI_ERROR_INVALID_ARGS; } for (i = 0; i < num_ids && result == VMCI_SUCCESS; i++) { current_id = ((u32 *)cpt_buf)[i]; result = vmci_ctx_add_notification(context_id, current_id); if (result != VMCI_SUCCESS) break; } if (result != VMCI_SUCCESS) pr_devel("Failed to set cpt state (type=%d) (error=%d)\n", cpt_type, result); return result; } /* * Retrieves the specified context's pending notifications in the * form of a handle array. The handle arrays returned are the * actual data - not a copy and should not be modified by the * caller. They must be released using * vmci_ctx_rcv_notifications_release. */ int vmci_ctx_rcv_notifications_get(u32 context_id, struct vmci_handle_arr **db_handle_array, struct vmci_handle_arr **qp_handle_array) { struct vmci_ctx *context; int result = VMCI_SUCCESS; context = vmci_ctx_get(context_id); if (context == NULL) return VMCI_ERROR_NOT_FOUND; spin_lock(&context->lock); *db_handle_array = context->pending_doorbell_array; context->pending_doorbell_array = vmci_handle_arr_create(0, VMCI_MAX_GUEST_DOORBELL_COUNT); if (!context->pending_doorbell_array) { context->pending_doorbell_array = *db_handle_array; *db_handle_array = NULL; result = VMCI_ERROR_NO_MEM; } *qp_handle_array = NULL; spin_unlock(&context->lock); vmci_ctx_put(context); return result; } /* * Releases handle arrays with pending notifications previously * retrieved using vmci_ctx_rcv_notifications_get. If the * notifications were not successfully handed over to the guest, * success must be false. */ void vmci_ctx_rcv_notifications_release(u32 context_id, struct vmci_handle_arr *db_handle_array, struct vmci_handle_arr *qp_handle_array, bool success) { struct vmci_ctx *context = vmci_ctx_get(context_id); spin_lock(&context->lock); if (!success) { struct vmci_handle handle; /* * New notifications may have been added while we were not * holding the context lock, so we transfer any new pending * doorbell notifications to the old array, and reinstate the * old array. */ handle = vmci_handle_arr_remove_tail( context->pending_doorbell_array); while (!vmci_handle_is_invalid(handle)) { if (!vmci_handle_arr_has_entry(db_handle_array, handle)) { vmci_handle_arr_append_entry( &db_handle_array, handle); } handle = vmci_handle_arr_remove_tail( context->pending_doorbell_array); } vmci_handle_arr_destroy(context->pending_doorbell_array); context->pending_doorbell_array = db_handle_array; db_handle_array = NULL; } else { ctx_clear_notify_call(context); } spin_unlock(&context->lock); vmci_ctx_put(context); if (db_handle_array) vmci_handle_arr_destroy(db_handle_array); if (qp_handle_array) vmci_handle_arr_destroy(qp_handle_array); } /* * Registers that a new doorbell handle has been allocated by the * context. Only doorbell handles registered can be notified. */ int vmci_ctx_dbell_create(u32 context_id, struct vmci_handle handle) { struct vmci_ctx *context; int result; if (context_id == VMCI_INVALID_ID || vmci_handle_is_invalid(handle)) return VMCI_ERROR_INVALID_ARGS; context = vmci_ctx_get(context_id); if (context == NULL) return VMCI_ERROR_NOT_FOUND; spin_lock(&context->lock); if (!vmci_handle_arr_has_entry(context->doorbell_array, handle)) result = vmci_handle_arr_append_entry(&context->doorbell_array, handle); else result = VMCI_ERROR_DUPLICATE_ENTRY; spin_unlock(&context->lock); vmci_ctx_put(context); return result; } /* * Unregisters a doorbell handle that was previously registered * with vmci_ctx_dbell_create. */ int vmci_ctx_dbell_destroy(u32 context_id, struct vmci_handle handle) { struct vmci_ctx *context; struct vmci_handle removed_handle; if (context_id == VMCI_INVALID_ID || vmci_handle_is_invalid(handle)) return VMCI_ERROR_INVALID_ARGS; context = vmci_ctx_get(context_id); if (context == NULL) return VMCI_ERROR_NOT_FOUND; spin_lock(&context->lock); removed_handle = vmci_handle_arr_remove_entry(context->doorbell_array, handle); vmci_handle_arr_remove_entry(context->pending_doorbell_array, handle); spin_unlock(&context->lock); vmci_ctx_put(context); return vmci_handle_is_invalid(removed_handle) ? VMCI_ERROR_NOT_FOUND : VMCI_SUCCESS; } /* * Registers a notification of a doorbell handle initiated by the * specified source context. The notification of doorbells are * subject to the same isolation rules as datagram delivery. To * allow host side senders of notifications a finer granularity * of sender rights than those assigned to the sending context * itself, the host context is required to specify a different * set of privilege flags that will override the privileges of * the source context. */ int vmci_ctx_notify_dbell(u32 src_cid, struct vmci_handle handle, u32 src_priv_flags) { struct vmci_ctx *dst_context; int result; if (vmci_handle_is_invalid(handle)) return VMCI_ERROR_INVALID_ARGS; /* Get the target VM's VMCI context. */ dst_context = vmci_ctx_get(handle.context); if (!dst_context) { pr_devel("Invalid context (ID=0x%x)\n", handle.context); return VMCI_ERROR_NOT_FOUND; } if (src_cid != handle.context) { u32 dst_priv_flags; if (VMCI_CONTEXT_IS_VM(src_cid) && VMCI_CONTEXT_IS_VM(handle.context)) { pr_devel("Doorbell notification from VM to VM not supported (src=0x%x, dst=0x%x)\n", src_cid, handle.context); result = VMCI_ERROR_DST_UNREACHABLE; goto out; } result = vmci_dbell_get_priv_flags(handle, &dst_priv_flags); if (result < VMCI_SUCCESS) { pr_warn("Failed to get privilege flags for destination (handle=0x%x:0x%x)\n", handle.context, handle.resource); goto out; } if (src_cid != VMCI_HOST_CONTEXT_ID || src_priv_flags == VMCI_NO_PRIVILEGE_FLAGS) { src_priv_flags = vmci_context_get_priv_flags(src_cid); } if (vmci_deny_interaction(src_priv_flags, dst_priv_flags)) { result = VMCI_ERROR_NO_ACCESS; goto out; } } if (handle.context == VMCI_HOST_CONTEXT_ID) { result = vmci_dbell_host_context_notify(src_cid, handle); } else { spin_lock(&dst_context->lock); if (!vmci_handle_arr_has_entry(dst_context->doorbell_array, handle)) { result = VMCI_ERROR_NOT_FOUND; } else { if (!vmci_handle_arr_has_entry( dst_context->pending_doorbell_array, handle)) { result = vmci_handle_arr_append_entry( &dst_context->pending_doorbell_array, handle); if (result == VMCI_SUCCESS) { ctx_signal_notify(dst_context); wake_up(&dst_context->host_context.wait_queue); } } else { result = VMCI_SUCCESS; } } spin_unlock(&dst_context->lock); } out: vmci_ctx_put(dst_context); return result; } bool vmci_ctx_supports_host_qp(struct vmci_ctx *context) { return context && context->user_version >= VMCI_VERSION_HOSTQP; } /* * Registers that a new queue pair handle has been allocated by * the context. */ int vmci_ctx_qp_create(struct vmci_ctx *context, struct vmci_handle handle) { int result; if (context == NULL || vmci_handle_is_invalid(handle)) return VMCI_ERROR_INVALID_ARGS; if (!vmci_handle_arr_has_entry(context->queue_pair_array, handle)) result = vmci_handle_arr_append_entry( &context->queue_pair_array, handle); else result = VMCI_ERROR_DUPLICATE_ENTRY; return result; } /* * Unregisters a queue pair handle that was previously registered * with vmci_ctx_qp_create. */ int vmci_ctx_qp_destroy(struct vmci_ctx *context, struct vmci_handle handle) { struct vmci_handle hndl; if (context == NULL || vmci_handle_is_invalid(handle)) return VMCI_ERROR_INVALID_ARGS; hndl = vmci_handle_arr_remove_entry(context->queue_pair_array, handle); return vmci_handle_is_invalid(hndl) ? VMCI_ERROR_NOT_FOUND : VMCI_SUCCESS; } /* * Determines whether a given queue pair handle is registered * with the given context. */ bool vmci_ctx_qp_exists(struct vmci_ctx *context, struct vmci_handle handle) { if (context == NULL || vmci_handle_is_invalid(handle)) return false; return vmci_handle_arr_has_entry(context->queue_pair_array, handle); } /* * vmci_context_get_priv_flags() - Retrieve privilege flags. * @context_id: The context ID of the VMCI context. * * Retrieves privilege flags of the given VMCI context ID. */ u32 vmci_context_get_priv_flags(u32 context_id) { if (vmci_host_code_active()) { u32 flags; struct vmci_ctx *context; context = vmci_ctx_get(context_id); if (!context) return VMCI_LEAST_PRIVILEGE_FLAGS; flags = context->priv_flags; vmci_ctx_put(context); return flags; } return VMCI_NO_PRIVILEGE_FLAGS; } EXPORT_SYMBOL_GPL(vmci_context_get_priv_flags); /* * vmci_is_context_owner() - Determimnes if user is the context owner * @context_id: The context ID of the VMCI context. * @uid: The host user id (real kernel value). * * Determines whether a given UID is the owner of given VMCI context. */ bool vmci_is_context_owner(u32 context_id, kuid_t uid) { bool is_owner = false; if (vmci_host_code_active()) { struct vmci_ctx *context = vmci_ctx_get(context_id); if (context) { if (context->cred) is_owner = uid_eq(context->cred->uid, uid); vmci_ctx_put(context); } } return is_owner; } EXPORT_SYMBOL_GPL(vmci_is_context_owner); |
| 11 11 11 2 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 | // SPDX-License-Identifier: GPL-2.0-or-later #include <linux/seq_file.h> #include <net/ip.h> #include <net/mptcp.h> #include <net/snmp.h> #include <net/net_namespace.h> #include "mib.h" static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("MPCapableSYNRX", MPTCP_MIB_MPCAPABLEPASSIVE), SNMP_MIB_ITEM("MPCapableSYNTX", MPTCP_MIB_MPCAPABLEACTIVE), SNMP_MIB_ITEM("MPCapableSYNACKRX", MPTCP_MIB_MPCAPABLEACTIVEACK), SNMP_MIB_ITEM("MPCapableACKRX", MPTCP_MIB_MPCAPABLEPASSIVEACK), SNMP_MIB_ITEM("MPCapableFallbackACK", MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK), SNMP_MIB_ITEM("MPCapableFallbackSYNACK", MPTCP_MIB_MPCAPABLEACTIVEFALLBACK), SNMP_MIB_ITEM("MPCapableSYNTXDrop", MPTCP_MIB_MPCAPABLEACTIVEDROP), SNMP_MIB_ITEM("MPCapableSYNTXDisabled", MPTCP_MIB_MPCAPABLEACTIVEDISABLED), SNMP_MIB_ITEM("MPCapableEndpAttempt", MPTCP_MIB_MPCAPABLEENDPATTEMPT), SNMP_MIB_ITEM("MPFallbackTokenInit", MPTCP_MIB_TOKENFALLBACKINIT), SNMP_MIB_ITEM("MPTCPRetrans", MPTCP_MIB_RETRANSSEGS), SNMP_MIB_ITEM("MPJoinNoTokenFound", MPTCP_MIB_JOINNOTOKEN), SNMP_MIB_ITEM("MPJoinSynRx", MPTCP_MIB_JOINSYNRX), SNMP_MIB_ITEM("MPJoinSynBackupRx", MPTCP_MIB_JOINSYNBACKUPRX), SNMP_MIB_ITEM("MPJoinSynAckRx", MPTCP_MIB_JOINSYNACKRX), SNMP_MIB_ITEM("MPJoinSynAckBackupRx", MPTCP_MIB_JOINSYNACKBACKUPRX), SNMP_MIB_ITEM("MPJoinSynAckHMacFailure", MPTCP_MIB_JOINSYNACKMAC), SNMP_MIB_ITEM("MPJoinAckRx", MPTCP_MIB_JOINACKRX), SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC), SNMP_MIB_ITEM("MPJoinRejected", MPTCP_MIB_JOINREJECTED), SNMP_MIB_ITEM("MPJoinSynTx", MPTCP_MIB_JOINSYNTX), SNMP_MIB_ITEM("MPJoinSynTxCreatSkErr", MPTCP_MIB_JOINSYNTXCREATSKERR), SNMP_MIB_ITEM("MPJoinSynTxBindErr", MPTCP_MIB_JOINSYNTXBINDERR), SNMP_MIB_ITEM("MPJoinSynTxConnectErr", MPTCP_MIB_JOINSYNTXCONNECTERR), SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH), SNMP_MIB_ITEM("DSSCorruptionFallback", MPTCP_MIB_DSSCORRUPTIONFALLBACK), SNMP_MIB_ITEM("DSSCorruptionReset", MPTCP_MIB_DSSCORRUPTIONRESET), SNMP_MIB_ITEM("InfiniteMapTx", MPTCP_MIB_INFINITEMAPTX), SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX), SNMP_MIB_ITEM("DSSNoMatchTCP", MPTCP_MIB_DSSTCPMISMATCH), SNMP_MIB_ITEM("DataCsumErr", MPTCP_MIB_DATACSUMERR), SNMP_MIB_ITEM("OFOQueueTail", MPTCP_MIB_OFOQUEUETAIL), SNMP_MIB_ITEM("OFOQueue", MPTCP_MIB_OFOQUEUE), SNMP_MIB_ITEM("OFOMerge", MPTCP_MIB_OFOMERGE), SNMP_MIB_ITEM("NoDSSInWindow", MPTCP_MIB_NODSSWINDOW), SNMP_MIB_ITEM("DuplicateData", MPTCP_MIB_DUPDATA), SNMP_MIB_ITEM("AddAddr", MPTCP_MIB_ADDADDR), SNMP_MIB_ITEM("AddAddrTx", MPTCP_MIB_ADDADDRTX), SNMP_MIB_ITEM("AddAddrTxDrop", MPTCP_MIB_ADDADDRTXDROP), SNMP_MIB_ITEM("EchoAdd", MPTCP_MIB_ECHOADD), SNMP_MIB_ITEM("EchoAddTx", MPTCP_MIB_ECHOADDTX), SNMP_MIB_ITEM("EchoAddTxDrop", MPTCP_MIB_ECHOADDTXDROP), SNMP_MIB_ITEM("PortAdd", MPTCP_MIB_PORTADD), SNMP_MIB_ITEM("AddAddrDrop", MPTCP_MIB_ADDADDRDROP), SNMP_MIB_ITEM("MPJoinPortSynRx", MPTCP_MIB_JOINPORTSYNRX), SNMP_MIB_ITEM("MPJoinPortSynAckRx", MPTCP_MIB_JOINPORTSYNACKRX), SNMP_MIB_ITEM("MPJoinPortAckRx", MPTCP_MIB_JOINPORTACKRX), SNMP_MIB_ITEM("MismatchPortSynRx", MPTCP_MIB_MISMATCHPORTSYNRX), SNMP_MIB_ITEM("MismatchPortAckRx", MPTCP_MIB_MISMATCHPORTACKRX), SNMP_MIB_ITEM("RmAddr", MPTCP_MIB_RMADDR), SNMP_MIB_ITEM("RmAddrDrop", MPTCP_MIB_RMADDRDROP), SNMP_MIB_ITEM("RmAddrTx", MPTCP_MIB_RMADDRTX), SNMP_MIB_ITEM("RmAddrTxDrop", MPTCP_MIB_RMADDRTXDROP), SNMP_MIB_ITEM("RmSubflow", MPTCP_MIB_RMSUBFLOW), SNMP_MIB_ITEM("MPPrioTx", MPTCP_MIB_MPPRIOTX), SNMP_MIB_ITEM("MPPrioRx", MPTCP_MIB_MPPRIORX), SNMP_MIB_ITEM("MPFailTx", MPTCP_MIB_MPFAILTX), SNMP_MIB_ITEM("MPFailRx", MPTCP_MIB_MPFAILRX), SNMP_MIB_ITEM("MPFastcloseTx", MPTCP_MIB_MPFASTCLOSETX), SNMP_MIB_ITEM("MPFastcloseRx", MPTCP_MIB_MPFASTCLOSERX), SNMP_MIB_ITEM("MPRstTx", MPTCP_MIB_MPRSTTX), SNMP_MIB_ITEM("MPRstRx", MPTCP_MIB_MPRSTRX), SNMP_MIB_ITEM("SubflowStale", MPTCP_MIB_SUBFLOWSTALE), SNMP_MIB_ITEM("SubflowRecover", MPTCP_MIB_SUBFLOWRECOVER), SNMP_MIB_ITEM("SndWndShared", MPTCP_MIB_SNDWNDSHARED), SNMP_MIB_ITEM("RcvWndShared", MPTCP_MIB_RCVWNDSHARED), SNMP_MIB_ITEM("RcvWndConflictUpdate", MPTCP_MIB_RCVWNDCONFLICTUPDATE), SNMP_MIB_ITEM("RcvWndConflict", MPTCP_MIB_RCVWNDCONFLICT), SNMP_MIB_ITEM("MPCurrEstab", MPTCP_MIB_CURRESTAB), SNMP_MIB_ITEM("Blackhole", MPTCP_MIB_BLACKHOLE), SNMP_MIB_ITEM("MPCapableDataFallback", MPTCP_MIB_MPCAPABLEDATAFALLBACK), SNMP_MIB_ITEM("MD5SigFallback", MPTCP_MIB_MD5SIGFALLBACK), SNMP_MIB_ITEM("DssFallback", MPTCP_MIB_DSSFALLBACK), SNMP_MIB_ITEM("SimultConnectFallback", MPTCP_MIB_SIMULTCONNFALLBACK), SNMP_MIB_ITEM("FallbackFailed", MPTCP_MIB_FALLBACKFAILED), SNMP_MIB_ITEM("WinProbe", MPTCP_MIB_WINPROBE), }; /* mptcp_mib_alloc - allocate percpu mib counters * * These are allocated when the first mptcp socket is created so * we do not waste percpu memory if mptcp isn't in use. */ bool mptcp_mib_alloc(struct net *net) { struct mptcp_mib __percpu *mib = alloc_percpu(struct mptcp_mib); if (!mib) return false; if (cmpxchg(&net->mib.mptcp_statistics, NULL, mib)) free_percpu(mib); return true; } void mptcp_seq_show(struct seq_file *seq) { unsigned long sum[ARRAY_SIZE(mptcp_snmp_list)]; const int cnt = ARRAY_SIZE(mptcp_snmp_list); struct net *net = seq->private; int i; seq_puts(seq, "MPTcpExt:"); for (i = 0; i < cnt; i++) seq_printf(seq, " %s", mptcp_snmp_list[i].name); seq_puts(seq, "\nMPTcpExt:"); memset(sum, 0, sizeof(sum)); if (net->mib.mptcp_statistics) snmp_get_cpu_field_batch_cnt(sum, mptcp_snmp_list, cnt, net->mib.mptcp_statistics); for (i = 0; i < cnt; i++) seq_printf(seq, " %lu", sum[i]); seq_putc(seq, '\n'); } |
| 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C), 2008-2021, OPPO Mobile Comm Corp., Ltd. * https://www.oppo.com/ */ #include <linux/sysfs.h> #include <linux/kobject.h> #include "internal.h" #include "compress.h" enum { attr_feature, attr_drop_caches, attr_pointer_ui, attr_pointer_bool, attr_accel, }; enum { struct_erofs_sb_info, struct_erofs_mount_opts, }; struct erofs_attr { struct attribute attr; short attr_id; int struct_type, offset; }; #define EROFS_ATTR(_name, _mode, _id) \ static struct erofs_attr erofs_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = _mode }, \ .attr_id = attr_##_id, \ } #define EROFS_ATTR_FUNC(_name, _mode) EROFS_ATTR(_name, _mode, _name) #define EROFS_ATTR_FEATURE(_name) EROFS_ATTR(_name, 0444, feature) #define EROFS_ATTR_OFFSET(_name, _mode, _id, _struct) \ static struct erofs_attr erofs_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = _mode }, \ .attr_id = attr_##_id, \ .struct_type = struct_##_struct, \ .offset = offsetof(struct _struct, _name),\ } #define EROFS_ATTR_RW(_name, _id, _struct) \ EROFS_ATTR_OFFSET(_name, 0644, _id, _struct) #define EROFS_RO_ATTR(_name, _id, _struct) \ EROFS_ATTR_OFFSET(_name, 0444, _id, _struct) #define EROFS_ATTR_RW_UI(_name, _struct) \ EROFS_ATTR_RW(_name, pointer_ui, _struct) #define EROFS_ATTR_RW_BOOL(_name, _struct) \ EROFS_ATTR_RW(_name, pointer_bool, _struct) #define ATTR_LIST(name) (&erofs_attr_##name.attr) #ifdef CONFIG_EROFS_FS_ZIP EROFS_ATTR_RW_UI(sync_decompress, erofs_sb_info); EROFS_ATTR_FUNC(drop_caches, 0200); #endif #ifdef CONFIG_EROFS_FS_ZIP_ACCEL EROFS_ATTR_FUNC(accel, 0644); #endif EROFS_ATTR_RW_UI(dir_ra_bytes, erofs_sb_info); static struct attribute *erofs_sb_attrs[] = { #ifdef CONFIG_EROFS_FS_ZIP ATTR_LIST(sync_decompress), ATTR_LIST(drop_caches), #endif ATTR_LIST(dir_ra_bytes), NULL, }; ATTRIBUTE_GROUPS(erofs_sb); static struct attribute *erofs_attrs[] = { #ifdef CONFIG_EROFS_FS_ZIP_ACCEL ATTR_LIST(accel), #endif NULL, }; ATTRIBUTE_GROUPS(erofs); /* Features this copy of erofs supports */ EROFS_ATTR_FEATURE(compr_cfgs); EROFS_ATTR_FEATURE(big_pcluster); EROFS_ATTR_FEATURE(chunked_file); EROFS_ATTR_FEATURE(device_table); EROFS_ATTR_FEATURE(compr_head2); EROFS_ATTR_FEATURE(sb_chksum); EROFS_ATTR_FEATURE(ztailpacking); EROFS_ATTR_FEATURE(fragments); EROFS_ATTR_FEATURE(dedupe); EROFS_ATTR_FEATURE(48bit); EROFS_ATTR_FEATURE(metabox); static struct attribute *erofs_feat_attrs[] = { ATTR_LIST(compr_cfgs), ATTR_LIST(big_pcluster), ATTR_LIST(chunked_file), ATTR_LIST(device_table), ATTR_LIST(compr_head2), ATTR_LIST(sb_chksum), ATTR_LIST(ztailpacking), ATTR_LIST(fragments), ATTR_LIST(dedupe), ATTR_LIST(48bit), ATTR_LIST(metabox), NULL, }; ATTRIBUTE_GROUPS(erofs_feat); static unsigned char *__struct_ptr(struct erofs_sb_info *sbi, int struct_type, int offset) { if (struct_type == struct_erofs_sb_info) return (unsigned char *)sbi + offset; if (struct_type == struct_erofs_mount_opts) return (unsigned char *)&sbi->opt + offset; return NULL; } static ssize_t erofs_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct erofs_sb_info *sbi = container_of(kobj, struct erofs_sb_info, s_kobj); struct erofs_attr *a = container_of(attr, struct erofs_attr, attr); unsigned char *ptr = __struct_ptr(sbi, a->struct_type, a->offset); switch (a->attr_id) { case attr_feature: return sysfs_emit(buf, "supported\n"); case attr_pointer_ui: if (!ptr) return 0; return sysfs_emit(buf, "%u\n", *(unsigned int *)ptr); case attr_pointer_bool: if (!ptr) return 0; return sysfs_emit(buf, "%d\n", *(bool *)ptr); case attr_accel: return z_erofs_crypto_show_engines(buf, PAGE_SIZE, '\n'); } return 0; } static ssize_t erofs_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t len) { struct erofs_sb_info *sbi = container_of(kobj, struct erofs_sb_info, s_kobj); struct erofs_attr *a = container_of(attr, struct erofs_attr, attr); unsigned char *ptr = __struct_ptr(sbi, a->struct_type, a->offset); unsigned long t; int ret; switch (a->attr_id) { case attr_pointer_ui: if (!ptr) return 0; ret = kstrtoul(skip_spaces(buf), 0, &t); if (ret) return ret; if (t != (unsigned int)t) return -ERANGE; if (IS_ENABLED(CONFIG_EROFS_FS_ZIP) && !strcmp(a->attr.name, "sync_decompress") && (t > EROFS_SYNC_DECOMPRESS_FORCE_OFF)) return -EINVAL; *(unsigned int *)ptr = t; return len; case attr_pointer_bool: if (!ptr) return 0; ret = kstrtoul(skip_spaces(buf), 0, &t); if (ret) return ret; if (t != 0 && t != 1) return -EINVAL; *(bool *)ptr = !!t; return len; #ifdef CONFIG_EROFS_FS_ZIP case attr_drop_caches: ret = kstrtoul(skip_spaces(buf), 0, &t); if (ret) return ret; if (t < 1 || t > 3) return -EINVAL; if (t & 2) z_erofs_shrink_scan(sbi, ~0UL); if (t & 1) invalidate_mapping_pages(MNGD_MAPPING(sbi), 0, -1); return len; #endif #ifdef CONFIG_EROFS_FS_ZIP_ACCEL case attr_accel: buf = skip_spaces(buf); z_erofs_crypto_disable_all_engines(); while (*buf) { t = strcspn(buf, "\n"); ret = z_erofs_crypto_enable_engine(buf, t); if (ret < 0) return ret; buf += buf[t] != '\0' ? t + 1 : t; } return len; #endif } return 0; } static void erofs_sb_release(struct kobject *kobj) { struct erofs_sb_info *sbi = container_of(kobj, struct erofs_sb_info, s_kobj); complete(&sbi->s_kobj_unregister); } static const struct sysfs_ops erofs_attr_ops = { .show = erofs_attr_show, .store = erofs_attr_store, }; static const struct kobj_type erofs_sb_ktype = { .default_groups = erofs_sb_groups, .sysfs_ops = &erofs_attr_ops, .release = erofs_sb_release, }; static const struct kobj_type erofs_ktype = { .default_groups = erofs_groups, .sysfs_ops = &erofs_attr_ops, }; static struct kset erofs_root = { .kobj = {.ktype = &erofs_ktype}, }; static const struct kobj_type erofs_feat_ktype = { .default_groups = erofs_feat_groups, .sysfs_ops = &erofs_attr_ops, }; static struct kobject erofs_feat = { .kset = &erofs_root, }; int erofs_register_sysfs(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); int err; sbi->s_kobj.kset = &erofs_root; init_completion(&sbi->s_kobj_unregister); err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s", sb->s_sysfs_name); if (err) { kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); } return err; } void erofs_unregister_sysfs(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); if (sbi->s_kobj.state_in_sysfs) { kobject_del(&sbi->s_kobj); kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); } } void erofs_exit_sysfs(void) { kobject_put(&erofs_feat); kset_unregister(&erofs_root); } int __init erofs_init_sysfs(void) { int ret; kobject_set_name(&erofs_root.kobj, "erofs"); erofs_root.kobj.parent = fs_kobj; ret = kset_register(&erofs_root); if (!ret) { ret = kobject_init_and_add(&erofs_feat, &erofs_feat_ktype, NULL, "features"); if (!ret) return 0; erofs_exit_sysfs(); } return ret; } |
| 89 12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __SEQ_FILE_NET_H__ #define __SEQ_FILE_NET_H__ #include <linux/seq_file.h> #include <net/net_trackers.h> struct net; extern struct net init_net; struct seq_net_private { #ifdef CONFIG_NET_NS struct net *net; netns_tracker ns_tracker; #endif }; static inline struct net *seq_file_net(struct seq_file *seq) { #ifdef CONFIG_NET_NS return ((struct seq_net_private *)seq->private)->net; #else return &init_net; #endif } /* * This one is needed for proc_create_net_single since net is stored directly * in private not as a struct i.e. seq_file_net can't be used. */ static inline struct net *seq_file_single_net(struct seq_file *seq) { #ifdef CONFIG_NET_NS return (struct net *)seq->private; #else return &init_net; #endif } #endif |
| 433 434 402 399 398 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright(c) 2017 Intel Corporation. All rights reserved. * * This code is based in part on work published here: * * https://github.com/IAIK/KAISER * * The original work was written by and signed off by for the Linux * kernel by: * * Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at> * Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at> * Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at> * Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at> * * Major changes to the original code by: Dave Hansen <dave.hansen@intel.com> * Mostly rewritten by Thomas Gleixner <tglx@kernel.org> and * Andy Lutomirsky <luto@amacapital.net> */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/types.h> #include <linux/bug.h> #include <linux/init.h> #include <linux/spinlock.h> #include <linux/mm.h> #include <linux/uaccess.h> #include <linux/cpu.h> #include <asm/cpufeature.h> #include <asm/hypervisor.h> #include <asm/vsyscall.h> #include <asm/cmdline.h> #include <asm/pti.h> #include <asm/tlbflush.h> #include <asm/desc.h> #include <asm/sections.h> #include <asm/set_memory.h> #include <asm/bugs.h> #undef pr_fmt #define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt /* Backporting helper */ #ifndef __GFP_NOTRACK #define __GFP_NOTRACK 0 #endif /* * Define the page-table levels we clone for user-space on 32 * and 64 bit. */ #ifdef CONFIG_X86_64 #define PTI_LEVEL_KERNEL_IMAGE PTI_CLONE_PMD #else #define PTI_LEVEL_KERNEL_IMAGE PTI_CLONE_PTE #endif static void __init pti_print_if_insecure(const char *reason) { if (boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) pr_info("%s\n", reason); } static void __init pti_print_if_secure(const char *reason) { if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) pr_info("%s\n", reason); } /* Assume mode is auto unless overridden via cmdline below. */ static enum pti_mode { PTI_AUTO = 0, PTI_FORCE_OFF, PTI_FORCE_ON } pti_mode; void __init pti_check_boottime_disable(void) { if (hypervisor_is_type(X86_HYPER_XEN_PV)) { pti_mode = PTI_FORCE_OFF; pti_print_if_insecure("disabled on XEN PV."); return; } if (pti_mode == PTI_AUTO && !cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL)) pti_mode = PTI_FORCE_OFF; if (pti_mode == PTI_FORCE_OFF) { pti_print_if_insecure("disabled on command line."); return; } if (pti_mode == PTI_FORCE_ON) pti_print_if_secure("force enabled on command line."); if (pti_mode == PTI_AUTO && !boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) return; setup_force_cpu_cap(X86_FEATURE_PTI); if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) { pr_debug("PTI enabled, disabling INVLPGB\n"); setup_clear_cpu_cap(X86_FEATURE_INVLPGB); } if (cpu_feature_enabled(X86_FEATURE_FRED)) { pr_debug("PTI enabled, disabling FRED\n"); setup_clear_cpu_cap(X86_FEATURE_FRED); } } static int __init pti_parse_cmdline(char *arg) { if (!strcmp(arg, "off")) pti_mode = PTI_FORCE_OFF; else if (!strcmp(arg, "on")) pti_mode = PTI_FORCE_ON; else if (!strcmp(arg, "auto")) pti_mode = PTI_AUTO; else return -EINVAL; return 0; } early_param("pti", pti_parse_cmdline); static int __init pti_parse_cmdline_nopti(char *arg) { pti_mode = PTI_FORCE_OFF; return 0; } early_param("nopti", pti_parse_cmdline_nopti); pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) { /* * Changes to the high (kernel) portion of the kernelmode page * tables are not automatically propagated to the usermode tables. * * Users should keep in mind that, unlike the kernelmode tables, * there is no vmalloc_fault equivalent for the usermode tables. * Top-level entries added to init_mm's usermode pgd after boot * will not be automatically propagated to other mms. */ if (!pgdp_maps_userspace(pgdp) || (pgd.pgd & _PAGE_NOPTISHADOW)) return pgd; /* * The user page tables get the full PGD, accessible from * userspace: */ kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd; /* * If this is normal user memory, make it NX in the kernel * pagetables so that, if we somehow screw up and return to * usermode with the kernel CR3 loaded, we'll get a page fault * instead of allowing user code to execute with the wrong CR3. * * As exceptions, we don't set NX if: * - _PAGE_USER is not set. This could be an executable * EFI runtime mapping or something similar, and the kernel * may execute from it * - we don't have NX support * - we're clearing the PGD (i.e. the new pgd is not present). */ if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) && (__supported_pte_mask & _PAGE_NX)) pgd.pgd |= _PAGE_NX; /* return the copy of the PGD we want the kernel to use: */ return pgd; } /* * Walk the user copy of the page tables (optionally) trying to allocate * page table pages on the way down. * * Returns a pointer to a P4D on success, or NULL on failure. */ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) { pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address)); gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); if (address < PAGE_OFFSET) { WARN_ONCE(1, "attempt to walk user address\n"); return NULL; } if (pgd_none(*pgd)) { unsigned long new_p4d_page = __get_free_page(gfp); if (WARN_ON_ONCE(!new_p4d_page)) return NULL; set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); } BUILD_BUG_ON(pgd_leaf(*pgd)); return p4d_offset(pgd, address); } /* * Walk the user copy of the page tables (optionally) trying to allocate * page table pages on the way down. * * Returns a pointer to a PMD on success, or NULL on failure. */ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) { gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); p4d_t *p4d; pud_t *pud; p4d = pti_user_pagetable_walk_p4d(address); if (!p4d) return NULL; BUILD_BUG_ON(p4d_leaf(*p4d)); if (p4d_none(*p4d)) { unsigned long new_pud_page = __get_free_page(gfp); if (WARN_ON_ONCE(!new_pud_page)) return NULL; set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page))); } pud = pud_offset(p4d, address); /* The user page tables do not use large mappings: */ if (pud_leaf(*pud)) { WARN_ON(1); return NULL; } if (pud_none(*pud)) { unsigned long new_pmd_page = __get_free_page(gfp); if (WARN_ON_ONCE(!new_pmd_page)) return NULL; set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); } return pmd_offset(pud, address); } /* * Walk the shadow copy of the page tables (optionally) trying to allocate * page table pages on the way down. Does not support large pages. * * Note: this is only used when mapping *new* kernel data into the * user/shadow page tables. It is never used for userspace data. * * Returns a pointer to a PTE on success, or NULL on failure. */ static pte_t *pti_user_pagetable_walk_pte(unsigned long address, bool late_text) { gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); pmd_t *pmd; pte_t *pte; pmd = pti_user_pagetable_walk_pmd(address); if (!pmd) return NULL; /* Large PMD mapping found */ if (pmd_leaf(*pmd)) { /* Clear the PMD if we hit a large mapping from the first round */ if (late_text) { set_pmd(pmd, __pmd(0)); } else { WARN_ON_ONCE(1); return NULL; } } if (pmd_none(*pmd)) { unsigned long new_pte_page = __get_free_page(gfp); if (!new_pte_page) return NULL; set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); } pte = pte_offset_kernel(pmd, address); if (pte_flags(*pte) & _PAGE_USER) { WARN_ONCE(1, "attempt to walk to user pte\n"); return NULL; } return pte; } #ifdef CONFIG_X86_VSYSCALL_EMULATION static void __init pti_setup_vsyscall(void) { pte_t *pte, *target_pte; unsigned int level; pte = lookup_address(VSYSCALL_ADDR, &level); if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte)) return; target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR, false); if (WARN_ON(!target_pte)) return; *target_pte = *pte; set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir)); } #else static void __init pti_setup_vsyscall(void) { } #endif enum pti_clone_level { PTI_CLONE_PMD, PTI_CLONE_PTE, }; static void pti_clone_pgtable(unsigned long start, unsigned long end, enum pti_clone_level level, bool late_text) { unsigned long addr; /* * Clone the populated PMDs which cover start to end. These PMD areas * can have holes. */ for (addr = start; addr < end;) { pte_t *pte, *target_pte; pmd_t *pmd, *target_pmd; pgd_t *pgd; p4d_t *p4d; pud_t *pud; /* Overflow check */ if (addr < start) break; pgd = pgd_offset_k(addr); if (WARN_ON(pgd_none(*pgd))) return; p4d = p4d_offset(pgd, addr); if (WARN_ON(p4d_none(*p4d))) return; pud = pud_offset(p4d, addr); if (pud_none(*pud)) { WARN_ON_ONCE(addr & ~PUD_MASK); addr = round_up(addr + 1, PUD_SIZE); continue; } pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) { WARN_ON_ONCE(addr & ~PMD_MASK); addr = round_up(addr + 1, PMD_SIZE); continue; } if (pmd_leaf(*pmd) || level == PTI_CLONE_PMD) { target_pmd = pti_user_pagetable_walk_pmd(addr); if (WARN_ON(!target_pmd)) return; /* * Only clone present PMDs. This ensures only setting * _PAGE_GLOBAL on present PMDs. This should only be * called on well-known addresses anyway, so a non- * present PMD would be a surprise. */ if (WARN_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT))) return; /* * Setting 'target_pmd' below creates a mapping in both * the user and kernel page tables. It is effectively * global, so set it as global in both copies. Note: * the X86_FEATURE_PGE check is not _required_ because * the CPU ignores _PAGE_GLOBAL when PGE is not * supported. The check keeps consistency with * code that only set this bit when supported. */ if (boot_cpu_has(X86_FEATURE_PGE)) *pmd = pmd_set_flags(*pmd, _PAGE_GLOBAL); /* * Copy the PMD. That is, the kernelmode and usermode * tables will share the last-level page tables of this * address range */ *target_pmd = *pmd; addr = round_up(addr + 1, PMD_SIZE); } else if (level == PTI_CLONE_PTE) { /* Walk the page-table down to the pte level */ pte = pte_offset_kernel(pmd, addr); if (pte_none(*pte)) { addr = round_up(addr + 1, PAGE_SIZE); continue; } /* Only clone present PTEs */ if (WARN_ON(!(pte_flags(*pte) & _PAGE_PRESENT))) return; /* Allocate PTE in the user page-table */ target_pte = pti_user_pagetable_walk_pte(addr, late_text); if (WARN_ON(!target_pte)) return; /* Set GLOBAL bit in both PTEs */ if (boot_cpu_has(X86_FEATURE_PGE)) *pte = pte_set_flags(*pte, _PAGE_GLOBAL); /* Clone the PTE */ *target_pte = *pte; addr = round_up(addr + 1, PAGE_SIZE); } else { BUG(); } } } #ifdef CONFIG_X86_64 /* * Clone a single p4d (i.e. a top-level entry on 4-level systems and a * next-level entry on 5-level systems. */ static void __init pti_clone_p4d(unsigned long addr) { p4d_t *kernel_p4d, *user_p4d; pgd_t *kernel_pgd; user_p4d = pti_user_pagetable_walk_p4d(addr); if (!user_p4d) return; kernel_pgd = pgd_offset_k(addr); kernel_p4d = p4d_offset(kernel_pgd, addr); *user_p4d = *kernel_p4d; } /* * Clone the CPU_ENTRY_AREA and associated data into the user space visible * page table. */ static void __init pti_clone_user_shared(void) { unsigned int cpu; pti_clone_p4d(CPU_ENTRY_AREA_BASE); for_each_possible_cpu(cpu) { /* * The SYSCALL64 entry code needs one word of scratch space * in which to spill a register. It lives in the sp2 slot * of the CPU's TSS. * * This is done for all possible CPUs during boot to ensure * that it's propagated to all mms. */ unsigned long va = (unsigned long)&per_cpu(cpu_tss_rw, cpu); phys_addr_t pa = per_cpu_ptr_to_phys((void *)va); pte_t *target_pte; target_pte = pti_user_pagetable_walk_pte(va, false); if (WARN_ON(!target_pte)) return; *target_pte = pfn_pte(pa >> PAGE_SHIFT, PAGE_KERNEL); } } #else /* CONFIG_X86_64 */ /* * On 32 bit PAE systems with 1GB of Kernel address space there is only * one pgd/p4d for the whole kernel. Cloning that would map the whole * address space into the user page-tables, making PTI useless. So clone * the page-table on the PMD level to prevent that. */ static void __init pti_clone_user_shared(void) { unsigned long start, end; start = CPU_ENTRY_AREA_BASE; end = start + (PAGE_SIZE * CPU_ENTRY_AREA_PAGES); pti_clone_pgtable(start, end, PTI_CLONE_PMD, false); } #endif /* CONFIG_X86_64 */ /* * Clone the ESPFIX P4D into the user space visible page table */ static void __init pti_setup_espfix64(void) { #ifdef CONFIG_X86_ESPFIX64 pti_clone_p4d(ESPFIX_BASE_ADDR); #endif } /* * Clone the populated PMDs of the entry text and force it RO. */ static void pti_clone_entry_text(bool late) { pti_clone_pgtable((unsigned long) __entry_text_start, (unsigned long) __entry_text_end, PTI_LEVEL_KERNEL_IMAGE, late); } /* * Global pages and PCIDs are both ways to make kernel TLB entries * live longer, reduce TLB misses and improve kernel performance. * But, leaving all kernel text Global makes it potentially accessible * to Meltdown-style attacks which make it trivial to find gadgets or * defeat KASLR. * * Only use global pages when it is really worth it. */ static inline bool pti_kernel_image_global_ok(void) { /* * Systems with PCIDs get little benefit from global * kernel text and are not worth the downsides. */ if (cpu_feature_enabled(X86_FEATURE_PCID)) return false; /* * Only do global kernel image for pti=auto. Do the most * secure thing (not global) if pti=on specified. */ if (pti_mode != PTI_AUTO) return false; /* * K8 may not tolerate the cleared _PAGE_RW on the userspace * global kernel image pages. Do the safe thing (disable * global kernel image). This is unlikely to ever be * noticed because PTI is disabled by default on AMD CPUs. */ if (boot_cpu_has(X86_FEATURE_K8)) return false; /* * RANDSTRUCT derives its hardening benefits from the * attacker's lack of knowledge about the layout of kernel * data structures. Keep the kernel image non-global in * cases where RANDSTRUCT is in use to help keep the layout a * secret. */ if (IS_ENABLED(CONFIG_RANDSTRUCT)) return false; return true; } /* * For some configurations, map all of kernel text into the user page * tables. This reduces TLB misses, especially on non-PCID systems. */ static void pti_clone_kernel_text(void) { /* * rodata is part of the kernel image and is normally * readable on the filesystem or on the web. But, do not * clone the areas past rodata, they might contain secrets. */ unsigned long start = PFN_ALIGN(_text); unsigned long end_clone = (unsigned long)__end_rodata_aligned; unsigned long end_global = PFN_ALIGN((unsigned long)_etext); if (!pti_kernel_image_global_ok()) return; pr_debug("mapping partial kernel image into user address space\n"); /* * Note that this will undo _some_ of the work that * pti_set_kernel_image_nonglobal() did to clear the * global bit. */ pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE, false); /* * pti_clone_pgtable() will set the global bit in any PMDs * that it clones, but we also need to get any PTEs in * the last level for areas that are not huge-page-aligned. */ /* Set the global bit for normal non-__init kernel text: */ set_memory_global(start, (end_global - start) >> PAGE_SHIFT); } static void pti_set_kernel_image_nonglobal(void) { /* * The identity map is created with PMDs, regardless of the * actual length of the kernel. We need to clear * _PAGE_GLOBAL up to a PMD boundary, not just to the end * of the image. */ unsigned long start = PFN_ALIGN(_text); unsigned long end = ALIGN((unsigned long)_end, PMD_SIZE); /* * This clears _PAGE_GLOBAL from the entire kernel image. * pti_clone_kernel_text() map put _PAGE_GLOBAL back for * areas that are mapped to userspace. */ set_memory_nonglobal(start, (end - start) >> PAGE_SHIFT); } /* * Initialize kernel page table isolation */ void __init pti_init(void) { if (!boot_cpu_has(X86_FEATURE_PTI)) return; pr_info("enabled\n"); #ifdef CONFIG_X86_32 /* * We check for X86_FEATURE_PCID here. But the init-code will * clear the feature flag on 32 bit because the feature is not * supported on 32 bit anyway. To print the warning we need to * check with cpuid directly again. */ if (cpuid_ecx(0x1) & BIT(17)) { /* Use printk to work around pr_fmt() */ printk(KERN_WARNING "\n"); printk(KERN_WARNING "************************************************************\n"); printk(KERN_WARNING "** WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! **\n"); printk(KERN_WARNING "** **\n"); printk(KERN_WARNING "** You are using 32-bit PTI on a 64-bit PCID-capable CPU. **\n"); printk(KERN_WARNING "** Your performance will increase dramatically if you **\n"); printk(KERN_WARNING "** switch to a 64-bit kernel! **\n"); printk(KERN_WARNING "** **\n"); printk(KERN_WARNING "** WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! **\n"); printk(KERN_WARNING "************************************************************\n"); } #endif pti_clone_user_shared(); /* Undo all global bits from the init pagetables in head_64.S: */ pti_set_kernel_image_nonglobal(); /* Replace some of the global bits just for shared entry text: */ /* * This is very early in boot. Device and Late initcalls can do * modprobe before free_initmem() and mark_readonly(). This * pti_clone_entry_text() allows those user-mode-helpers to function, * but notably the text is still RW. */ pti_clone_entry_text(false); pti_setup_espfix64(); pti_setup_vsyscall(); } /* * Finalize the kernel mappings in the userspace page-table. Some of the * mappings for the kernel image might have changed since pti_init() * cloned them. This is because parts of the kernel image have been * mapped RO and/or NX. These changes need to be cloned again to the * userspace page-table. */ void pti_finalize(void) { if (!boot_cpu_has(X86_FEATURE_PTI)) return; /* * This is after free_initmem() (all initcalls are done) and we've done * mark_readonly(). Text is now NX which might've split some PMDs * relative to the early clone. */ pti_clone_entry_text(true); pti_clone_kernel_text(); debug_checkwx_user(); } |
| 2 3 3 3 2 3 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 | // SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 /****************************************************************************** * * Module Name: utcopy - Internal to external object translation utilities * * Copyright (C) 2000 - 2025, Intel Corp. * *****************************************************************************/ #include <acpi/acpi.h> #include "accommon.h" #include "acnamesp.h" #define _COMPONENT ACPI_UTILITIES ACPI_MODULE_NAME("utcopy") /* Local prototypes */ static acpi_status acpi_ut_copy_isimple_to_esimple(union acpi_operand_object *internal_object, union acpi_object *external_object, u8 *data_space, acpi_size *buffer_space_used); static acpi_status acpi_ut_copy_ielement_to_ielement(u8 object_type, union acpi_operand_object *source_object, union acpi_generic_state *state, void *context); static acpi_status acpi_ut_copy_ipackage_to_epackage(union acpi_operand_object *internal_object, u8 *buffer, acpi_size *space_used); static acpi_status acpi_ut_copy_esimple_to_isimple(union acpi_object *user_obj, union acpi_operand_object **return_obj); static acpi_status acpi_ut_copy_epackage_to_ipackage(union acpi_object *external_object, union acpi_operand_object **internal_object); static acpi_status acpi_ut_copy_simple_object(union acpi_operand_object *source_desc, union acpi_operand_object *dest_desc); static acpi_status acpi_ut_copy_ielement_to_eelement(u8 object_type, union acpi_operand_object *source_object, union acpi_generic_state *state, void *context); static acpi_status acpi_ut_copy_ipackage_to_ipackage(union acpi_operand_object *source_obj, union acpi_operand_object *dest_obj, struct acpi_walk_state *walk_state); /******************************************************************************* * * FUNCTION: acpi_ut_copy_isimple_to_esimple * * PARAMETERS: internal_object - Source object to be copied * external_object - Where to return the copied object * data_space - Where object data is returned (such as * buffer and string data) * buffer_space_used - Length of data_space that was used * * RETURN: Status * * DESCRIPTION: This function is called to copy a simple internal object to * an external object. * * The data_space buffer is assumed to have sufficient space for * the object. * ******************************************************************************/ static acpi_status acpi_ut_copy_isimple_to_esimple(union acpi_operand_object *internal_object, union acpi_object *external_object, u8 *data_space, acpi_size *buffer_space_used) { acpi_status status = AE_OK; ACPI_FUNCTION_TRACE(ut_copy_isimple_to_esimple); *buffer_space_used = 0; /* * Check for NULL object case (could be an uninitialized * package element) */ if (!internal_object) { return_ACPI_STATUS(AE_OK); } /* Always clear the external object */ memset(external_object, 0, sizeof(union acpi_object)); /* * In general, the external object will be the same type as * the internal object */ external_object->type = internal_object->common.type; /* However, only a limited number of external types are supported */ switch (internal_object->common.type) { case ACPI_TYPE_STRING: external_object->string.pointer = (char *)data_space; external_object->string.length = internal_object->string.length; *buffer_space_used = ACPI_ROUND_UP_TO_NATIVE_WORD((acpi_size) internal_object-> string. length + 1); memcpy((void *)data_space, (void *)internal_object->string.pointer, (acpi_size)internal_object->string.length + 1); break; case ACPI_TYPE_BUFFER: external_object->buffer.pointer = data_space; external_object->buffer.length = internal_object->buffer.length; *buffer_space_used = ACPI_ROUND_UP_TO_NATIVE_WORD(internal_object->string. length); memcpy((void *)data_space, (void *)internal_object->buffer.pointer, internal_object->buffer.length); break; case ACPI_TYPE_INTEGER: external_object->integer.value = internal_object->integer.value; break; case ACPI_TYPE_LOCAL_REFERENCE: /* This is an object reference. */ switch (internal_object->reference.class) { case ACPI_REFCLASS_NAME: /* * For namepath, return the object handle ("reference") * We are referring to the namespace node */ external_object->reference.handle = internal_object->reference.node; external_object->reference.actual_type = acpi_ns_get_type(internal_object->reference.node); break; default: /* All other reference types are unsupported */ return_ACPI_STATUS(AE_TYPE); } break; case ACPI_TYPE_PROCESSOR: external_object->processor.proc_id = internal_object->processor.proc_id; external_object->processor.pblk_address = internal_object->processor.address; external_object->processor.pblk_length = internal_object->processor.length; break; case ACPI_TYPE_POWER: external_object->power_resource.system_level = internal_object->power_resource.system_level; external_object->power_resource.resource_order = internal_object->power_resource.resource_order; break; default: /* * There is no corresponding external object type */ ACPI_ERROR((AE_INFO, "Unsupported object type, cannot convert to external object: %s", acpi_ut_get_type_name(internal_object->common. type))); return_ACPI_STATUS(AE_SUPPORT); } return_ACPI_STATUS(status); } /******************************************************************************* * * FUNCTION: acpi_ut_copy_ielement_to_eelement * * PARAMETERS: acpi_pkg_callback * * RETURN: Status * * DESCRIPTION: Copy one package element to another package element * ******************************************************************************/ static acpi_status acpi_ut_copy_ielement_to_eelement(u8 object_type, union acpi_operand_object *source_object, union acpi_generic_state *state, void *context) { acpi_status status = AE_OK; struct acpi_pkg_info *info = (struct acpi_pkg_info *)context; acpi_size object_space; u32 this_index; union acpi_object *target_object; ACPI_FUNCTION_ENTRY(); this_index = state->pkg.index; target_object = (union acpi_object *)&((union acpi_object *) (state->pkg.dest_object))-> package.elements[this_index]; switch (object_type) { case ACPI_COPY_TYPE_SIMPLE: /* * This is a simple or null object */ status = acpi_ut_copy_isimple_to_esimple(source_object, target_object, info->free_space, &object_space); if (ACPI_FAILURE(status)) { return (status); } break; case ACPI_COPY_TYPE_PACKAGE: /* * Build the package object */ target_object->type = ACPI_TYPE_PACKAGE; target_object->package.count = source_object->package.count; target_object->package.elements = ACPI_CAST_PTR(union acpi_object, info->free_space); /* * Pass the new package object back to the package walk routine */ state->pkg.this_target_obj = target_object; /* * Save space for the array of objects (Package elements) * update the buffer length counter */ object_space = ACPI_ROUND_UP_TO_NATIVE_WORD((acpi_size) target_object-> package.count * sizeof(union acpi_object)); break; default: return (AE_BAD_PARAMETER); } info->free_space += object_space; info->length += object_space; return (status); } /******************************************************************************* * * FUNCTION: acpi_ut_copy_ipackage_to_epackage * * PARAMETERS: internal_object - Pointer to the object we are returning * buffer - Where the object is returned * space_used - Where the object length is returned * * RETURN: Status * * DESCRIPTION: This function is called to place a package object in a user * buffer. A package object by definition contains other objects. * * The buffer is assumed to have sufficient space for the object. * The caller must have verified the buffer length needed using * the acpi_ut_get_object_size function before calling this function. * ******************************************************************************/ static acpi_status acpi_ut_copy_ipackage_to_epackage(union acpi_operand_object *internal_object, u8 *buffer, acpi_size *space_used) { union acpi_object *external_object; acpi_status status; struct acpi_pkg_info info; ACPI_FUNCTION_TRACE(ut_copy_ipackage_to_epackage); /* * First package at head of the buffer */ external_object = ACPI_CAST_PTR(union acpi_object, buffer); /* * Free space begins right after the first package */ info.length = ACPI_ROUND_UP_TO_NATIVE_WORD(sizeof(union acpi_object)); info.free_space = buffer + ACPI_ROUND_UP_TO_NATIVE_WORD(sizeof(union acpi_object)); info.object_space = 0; info.num_packages = 1; external_object->type = internal_object->common.type; external_object->package.count = internal_object->package.count; external_object->package.elements = ACPI_CAST_PTR(union acpi_object, info.free_space); /* * Leave room for an array of ACPI_OBJECTS in the buffer * and move the free space past it */ info.length += (acpi_size)external_object->package.count * ACPI_ROUND_UP_TO_NATIVE_WORD(sizeof(union acpi_object)); info.free_space += external_object->package.count * ACPI_ROUND_UP_TO_NATIVE_WORD(sizeof(union acpi_object)); status = acpi_ut_walk_package_tree(internal_object, external_object, acpi_ut_copy_ielement_to_eelement, &info); *space_used = info.length; return_ACPI_STATUS(status); } /******************************************************************************* * * FUNCTION: acpi_ut_copy_iobject_to_eobject * * PARAMETERS: internal_object - The internal object to be converted * ret_buffer - Where the object is returned * * RETURN: Status * * DESCRIPTION: This function is called to build an API object to be returned * to the caller. * ******************************************************************************/ acpi_status acpi_ut_copy_iobject_to_eobject(union acpi_operand_object *internal_object, struct acpi_buffer *ret_buffer) { acpi_status status; ACPI_FUNCTION_TRACE(ut_copy_iobject_to_eobject); if (internal_object->common.type == ACPI_TYPE_PACKAGE) { /* * Package object: Copy all subobjects (including * nested packages) */ status = acpi_ut_copy_ipackage_to_epackage(internal_object, ret_buffer->pointer, &ret_buffer->length); } else { /* * Build a simple object (no nested objects) */ status = acpi_ut_copy_isimple_to_esimple(internal_object, ACPI_CAST_PTR(union acpi_object, ret_buffer-> pointer), ACPI_ADD_PTR(u8, ret_buffer-> pointer, ACPI_ROUND_UP_TO_NATIVE_WORD (sizeof (union acpi_object))), &ret_buffer->length); /* * build simple does not include the object size in the length * so we add it in here */ ret_buffer->length += sizeof(union acpi_object); } return_ACPI_STATUS(status); } /******************************************************************************* * * FUNCTION: acpi_ut_copy_esimple_to_isimple * * PARAMETERS: external_object - The external object to be converted * ret_internal_object - Where the internal object is returned * * RETURN: Status * * DESCRIPTION: This function copies an external object to an internal one. * NOTE: Pointers can be copied, we don't need to copy data. * (The pointers have to be valid in our address space no matter * what we do with them!) * ******************************************************************************/ static acpi_status acpi_ut_copy_esimple_to_isimple(union acpi_object *external_object, union acpi_operand_object **ret_internal_object) { union acpi_operand_object *internal_object; ACPI_FUNCTION_TRACE(ut_copy_esimple_to_isimple); /* * Simple types supported are: String, Buffer, Integer */ switch (external_object->type) { case ACPI_TYPE_STRING: case ACPI_TYPE_BUFFER: case ACPI_TYPE_INTEGER: case ACPI_TYPE_LOCAL_REFERENCE: internal_object = acpi_ut_create_internal_object((u8) external_object-> type); if (!internal_object) { return_ACPI_STATUS(AE_NO_MEMORY); } break; case ACPI_TYPE_ANY: /* This is the case for a NULL object */ *ret_internal_object = NULL; return_ACPI_STATUS(AE_OK); default: /* All other types are not supported */ ACPI_ERROR((AE_INFO, "Unsupported object type, cannot convert to internal object: %s", acpi_ut_get_type_name(external_object->type))); return_ACPI_STATUS(AE_SUPPORT); } /* Must COPY string and buffer contents */ switch (external_object->type) { case ACPI_TYPE_STRING: internal_object->string.pointer = ACPI_ALLOCATE_ZEROED((acpi_size) external_object->string.length + 1); if (!internal_object->string.pointer) { goto error_exit; } memcpy(internal_object->string.pointer, external_object->string.pointer, external_object->string.length); internal_object->string.length = external_object->string.length; break; case ACPI_TYPE_BUFFER: internal_object->buffer.pointer = ACPI_ALLOCATE_ZEROED(external_object->buffer.length); if (!internal_object->buffer.pointer) { goto error_exit; } memcpy(internal_object->buffer.pointer, external_object->buffer.pointer, external_object->buffer.length); internal_object->buffer.length = external_object->buffer.length; /* Mark buffer data valid */ internal_object->buffer.flags |= AOPOBJ_DATA_VALID; break; case ACPI_TYPE_INTEGER: internal_object->integer.value = external_object->integer.value; break; case ACPI_TYPE_LOCAL_REFERENCE: /* An incoming reference is defined to be a namespace node */ internal_object->reference.class = ACPI_REFCLASS_REFOF; internal_object->reference.object = external_object->reference.handle; break; default: /* Other types can't get here */ break; } *ret_internal_object = internal_object; return_ACPI_STATUS(AE_OK); error_exit: acpi_ut_remove_reference(internal_object); return_ACPI_STATUS(AE_NO_MEMORY); } /******************************************************************************* * * FUNCTION: acpi_ut_copy_epackage_to_ipackage * * PARAMETERS: external_object - The external object to be converted * internal_object - Where the internal object is returned * * RETURN: Status * * DESCRIPTION: Copy an external package object to an internal package. * Handles nested packages. * ******************************************************************************/ static acpi_status acpi_ut_copy_epackage_to_ipackage(union acpi_object *external_object, union acpi_operand_object **internal_object) { acpi_status status = AE_OK; union acpi_operand_object *package_object; union acpi_operand_object **package_elements; u32 i; ACPI_FUNCTION_TRACE(ut_copy_epackage_to_ipackage); /* Create the package object */ package_object = acpi_ut_create_package_object(external_object->package.count); if (!package_object) { return_ACPI_STATUS(AE_NO_MEMORY); } package_elements = package_object->package.elements; /* * Recursive implementation. Probably ok, since nested external * packages as parameters should be very rare. */ for (i = 0; i < external_object->package.count; i++) { status = acpi_ut_copy_eobject_to_iobject(&external_object->package. elements[i], &package_elements[i]); if (ACPI_FAILURE(status)) { /* Truncate package and delete it */ package_object->package.count = i; package_elements[i] = NULL; acpi_ut_remove_reference(package_object); return_ACPI_STATUS(status); } } /* Mark package data valid */ package_object->package.flags |= AOPOBJ_DATA_VALID; *internal_object = package_object; return_ACPI_STATUS(status); } /******************************************************************************* * * FUNCTION: acpi_ut_copy_eobject_to_iobject * * PARAMETERS: external_object - The external object to be converted * internal_object - Where the internal object is returned * * RETURN: Status * * DESCRIPTION: Converts an external object to an internal object. * ******************************************************************************/ acpi_status acpi_ut_copy_eobject_to_iobject(union acpi_object *external_object, union acpi_operand_object **internal_object) { acpi_status status; ACPI_FUNCTION_TRACE(ut_copy_eobject_to_iobject); if (external_object->type == ACPI_TYPE_PACKAGE) { status = acpi_ut_copy_epackage_to_ipackage(external_object, internal_object); } else { /* * Build a simple object (no nested objects) */ status = acpi_ut_copy_esimple_to_isimple(external_object, internal_object); } return_ACPI_STATUS(status); } /******************************************************************************* * * FUNCTION: acpi_ut_copy_simple_object * * PARAMETERS: source_desc - The internal object to be copied * dest_desc - New target object * * RETURN: Status * * DESCRIPTION: Simple copy of one internal object to another. Reference count * of the destination object is preserved. * ******************************************************************************/ static acpi_status acpi_ut_copy_simple_object(union acpi_operand_object *source_desc, union acpi_operand_object *dest_desc) { u16 reference_count; union acpi_operand_object *next_object; acpi_status status; acpi_size copy_size; /* Save fields from destination that we don't want to overwrite */ reference_count = dest_desc->common.reference_count; next_object = dest_desc->common.next_object; /* * Copy the entire source object over the destination object. * Note: Source can be either an operand object or namespace node. */ copy_size = sizeof(union acpi_operand_object); if (ACPI_GET_DESCRIPTOR_TYPE(source_desc) == ACPI_DESC_TYPE_NAMED) { copy_size = sizeof(struct acpi_namespace_node); } memcpy(ACPI_CAST_PTR(char, dest_desc), ACPI_CAST_PTR(char, source_desc), copy_size); /* Restore the saved fields */ dest_desc->common.reference_count = reference_count; dest_desc->common.next_object = next_object; /* New object is not static, regardless of source */ dest_desc->common.flags &= ~AOPOBJ_STATIC_POINTER; /* Handle the objects with extra data */ switch (dest_desc->common.type) { case ACPI_TYPE_BUFFER: /* * Allocate and copy the actual buffer if and only if: * 1) There is a valid buffer pointer * 2) The buffer has a length > 0 */ if ((source_desc->buffer.pointer) && (source_desc->buffer.length)) { dest_desc->buffer.pointer = ACPI_ALLOCATE(source_desc->buffer.length); if (!dest_desc->buffer.pointer) { return (AE_NO_MEMORY); } /* Copy the actual buffer data */ memcpy(dest_desc->buffer.pointer, source_desc->buffer.pointer, source_desc->buffer.length); } break; case ACPI_TYPE_STRING: /* * Allocate and copy the actual string if and only if: * 1) There is a valid string pointer * (Pointer to a NULL string is allowed) */ if (source_desc->string.pointer) { dest_desc->string.pointer = ACPI_ALLOCATE((acpi_size)source_desc->string. length + 1); if (!dest_desc->string.pointer) { return (AE_NO_MEMORY); } /* Copy the actual string data */ memcpy(dest_desc->string.pointer, source_desc->string.pointer, (acpi_size)source_desc->string.length + 1); } break; case ACPI_TYPE_LOCAL_REFERENCE: /* * We copied the reference object, so we now must add a reference * to the object pointed to by the reference * * DDBHandle reference (from Load/load_table) is a special reference, * it does not have a Reference.Object, so does not need to * increase the reference count */ if (source_desc->reference.class == ACPI_REFCLASS_TABLE) { break; } acpi_ut_add_reference(source_desc->reference.object); break; case ACPI_TYPE_REGION: /* * We copied the Region Handler, so we now must add a reference */ if (dest_desc->region.handler) { acpi_ut_add_reference(dest_desc->region.handler); } break; /* * For Mutex and Event objects, we cannot simply copy the underlying * OS object. We must create a new one. */ case ACPI_TYPE_MUTEX: status = acpi_os_create_mutex(&dest_desc->mutex.os_mutex); if (ACPI_FAILURE(status)) { return (status); } break; case ACPI_TYPE_EVENT: status = acpi_os_create_semaphore(ACPI_NO_UNIT_LIMIT, 0, &dest_desc->event. os_semaphore); if (ACPI_FAILURE(status)) { return (status); } break; default: /* Nothing to do for other simple objects */ break; } return (AE_OK); } /******************************************************************************* * * FUNCTION: acpi_ut_copy_ielement_to_ielement * * PARAMETERS: acpi_pkg_callback * * RETURN: Status * * DESCRIPTION: Copy one package element to another package element * ******************************************************************************/ static acpi_status acpi_ut_copy_ielement_to_ielement(u8 object_type, union acpi_operand_object *source_object, union acpi_generic_state *state, void *context) { acpi_status status = AE_OK; u32 this_index; union acpi_operand_object **this_target_ptr; union acpi_operand_object *target_object; ACPI_FUNCTION_ENTRY(); this_index = state->pkg.index; this_target_ptr = (union acpi_operand_object **) &state->pkg.dest_object->package.elements[this_index]; switch (object_type) { case ACPI_COPY_TYPE_SIMPLE: /* A null source object indicates a (legal) null package element */ if (source_object) { /* * This is a simple object, just copy it */ target_object = acpi_ut_create_internal_object(source_object-> common.type); if (!target_object) { return (AE_NO_MEMORY); } status = acpi_ut_copy_simple_object(source_object, target_object); if (ACPI_FAILURE(status)) { goto error_exit; } *this_target_ptr = target_object; } else { /* Pass through a null element */ *this_target_ptr = NULL; } break; case ACPI_COPY_TYPE_PACKAGE: /* * This object is a package - go down another nesting level * Create and build the package object */ target_object = acpi_ut_create_package_object(source_object->package.count); if (!target_object) { return (AE_NO_MEMORY); } target_object->common.flags = source_object->common.flags; /* Pass the new package object back to the package walk routine */ state->pkg.this_target_obj = target_object; /* Store the object pointer in the parent package object */ *this_target_ptr = target_object; break; default: return (AE_BAD_PARAMETER); } return (status); error_exit: acpi_ut_remove_reference(target_object); return (status); } /******************************************************************************* * * FUNCTION: acpi_ut_copy_ipackage_to_ipackage * * PARAMETERS: source_obj - Pointer to the source package object * dest_obj - Where the internal object is returned * walk_state - Current Walk state descriptor * * RETURN: Status * * DESCRIPTION: This function is called to copy an internal package object * into another internal package object. * ******************************************************************************/ static acpi_status acpi_ut_copy_ipackage_to_ipackage(union acpi_operand_object *source_obj, union acpi_operand_object *dest_obj, struct acpi_walk_state *walk_state) { acpi_status status = AE_OK; ACPI_FUNCTION_TRACE(ut_copy_ipackage_to_ipackage); dest_obj->common.type = source_obj->common.type; dest_obj->common.flags = source_obj->common.flags; dest_obj->package.count = source_obj->package.count; /* * Create the object array and walk the source package tree */ dest_obj->package.elements = ACPI_ALLOCATE_ZEROED(((acpi_size) source_obj->package. count + 1) * sizeof(void *)); if (!dest_obj->package.elements) { ACPI_ERROR((AE_INFO, "Package allocation failure")); return_ACPI_STATUS(AE_NO_MEMORY); } /* * Copy the package element-by-element by walking the package "tree". * This handles nested packages of arbitrary depth. */ status = acpi_ut_walk_package_tree(source_obj, dest_obj, acpi_ut_copy_ielement_to_ielement, walk_state); return_ACPI_STATUS(status); } /******************************************************************************* * * FUNCTION: acpi_ut_copy_iobject_to_iobject * * PARAMETERS: source_desc - The internal object to be copied * dest_desc - Where the copied object is returned * walk_state - Current walk state * * RETURN: Status * * DESCRIPTION: Copy an internal object to a new internal object * ******************************************************************************/ acpi_status acpi_ut_copy_iobject_to_iobject(union acpi_operand_object *source_desc, union acpi_operand_object **dest_desc, struct acpi_walk_state *walk_state) { acpi_status status = AE_OK; ACPI_FUNCTION_TRACE(ut_copy_iobject_to_iobject); /* Create the top level object */ *dest_desc = acpi_ut_create_internal_object(source_desc->common.type); if (!*dest_desc) { return_ACPI_STATUS(AE_NO_MEMORY); } /* Copy the object and possible subobjects */ if (source_desc->common.type == ACPI_TYPE_PACKAGE) { status = acpi_ut_copy_ipackage_to_ipackage(source_desc, *dest_desc, walk_state); } else { status = acpi_ut_copy_simple_object(source_desc, *dest_desc); } /* Delete the allocated object if copy failed */ if (ACPI_FAILURE(status)) { acpi_ut_remove_reference(*dest_desc); } return_ACPI_STATUS(status); } |
| 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 | // SPDX-License-Identifier: GPL-2.0-or-later /* * kernel/configs.c * Echo the kernel .config file used to build the kernel * * Copyright (C) 2002 Khalid Aziz <khalid_aziz@hp.com> * Copyright (C) 2002 Randy Dunlap <rdunlap@xenotime.net> * Copyright (C) 2002 Al Stone <ahs3@fc.hp.com> * Copyright (C) 2002 Hewlett-Packard Company */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/init.h> #include <linux/uaccess.h> /* * "IKCFG_ST" and "IKCFG_ED" are used to extract the config data from * a binary kernel image or a module. See scripts/extract-ikconfig. */ asm ( " .pushsection .rodata, \"a\" \n" " .ascii \"IKCFG_ST\" \n" " .global kernel_config_data \n" "kernel_config_data: \n" " .incbin \"kernel/config_data.gz\" \n" " .global kernel_config_data_end \n" "kernel_config_data_end: \n" " .ascii \"IKCFG_ED\" \n" " .popsection \n" ); #ifdef CONFIG_IKCONFIG_PROC extern char kernel_config_data; extern char kernel_config_data_end; static ssize_t ikconfig_read_current(struct file *file, char __user *buf, size_t len, loff_t * offset) { return simple_read_from_buffer(buf, len, offset, &kernel_config_data, &kernel_config_data_end - &kernel_config_data); } static const struct proc_ops config_gz_proc_ops = { .proc_read = ikconfig_read_current, .proc_lseek = default_llseek, }; static int __init ikconfig_init(void) { struct proc_dir_entry *entry; /* create the current config file */ entry = proc_create("config.gz", S_IFREG | S_IRUGO, NULL, &config_gz_proc_ops); if (!entry) return -ENOMEM; proc_set_size(entry, &kernel_config_data_end - &kernel_config_data); return 0; } static void __exit ikconfig_cleanup(void) { remove_proc_entry("config.gz", NULL); } module_init(ikconfig_init); module_exit(ikconfig_cleanup); #endif /* CONFIG_IKCONFIG_PROC */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Randy Dunlap"); MODULE_DESCRIPTION("Echo the kernel .config file used to build the kernel"); |
| 104 785 105 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/etherdevice.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/workqueue.h> #include <linux/spinlock.h> #include <linux/netfilter/nf_conntrack_common.h> #include <linux/netfilter/nf_tables.h> #include <net/ip.h> #include <net/flow.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_flow_table.h> struct nft_flow_offload { struct nft_flowtable *flowtable; }; static bool nft_flow_offload_skip(struct sk_buff *skb, int family) { if (skb_sec_path(skb)) return true; if (family == NFPROTO_IPV4) { const struct ip_options *opt; opt = &(IPCB(skb)->opt); if (unlikely(opt->optlen)) return true; } return false; } static void flow_offload_ct_tcp(struct nf_conn *ct) { /* conntrack will not see all packets, disable tcp window validation. */ spin_lock_bh(&ct->lock); ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; spin_unlock_bh(&ct->lock); } static void nft_flow_offload_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_flow_offload *priv = nft_expr_priv(expr); struct nf_flowtable *flowtable = &priv->flowtable->data; struct tcphdr _tcph, *tcph = NULL; struct nf_flow_route route = {}; enum ip_conntrack_info ctinfo; struct flow_offload *flow; enum ip_conntrack_dir dir; struct nf_conn *ct; int ret; if (nft_flow_offload_skip(pkt->skb, nft_pf(pkt))) goto out; ct = nf_ct_get(pkt->skb, &ctinfo); if (!ct) goto out; switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) { case IPPROTO_TCP: tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt), sizeof(_tcph), &_tcph); if (unlikely(!tcph || tcph->fin || tcph->rst || !nf_conntrack_tcp_established(ct))) goto out; break; case IPPROTO_UDP: break; #ifdef CONFIG_NF_CT_PROTO_GRE case IPPROTO_GRE: { struct nf_conntrack_tuple *tuple; if (ct->status & IPS_NAT_MASK) goto out; tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; /* No support for GRE v1 */ if (tuple->src.u.gre.key || tuple->dst.u.gre.key) goto out; break; } #endif default: goto out; } if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) || ct->status & (IPS_SEQ_ADJUST | IPS_NAT_CLASH)) goto out; if (!nf_ct_is_confirmed(ct)) goto out; if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status)) goto out; dir = CTINFO2DIR(ctinfo); if (nft_flow_route(pkt, ct, &route, dir, priv->flowtable) < 0) goto err_flow_route; flow = flow_offload_alloc(ct); if (!flow) goto err_flow_alloc; flow_offload_route_init(flow, &route); if (tcph) flow_offload_ct_tcp(ct); __set_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags); ret = flow_offload_add(flowtable, flow); if (ret < 0) goto err_flow_add; return; err_flow_add: flow_offload_free(flow); err_flow_alloc: dst_release(route.tuple[dir].dst); dst_release(route.tuple[!dir].dst); err_flow_route: clear_bit(IPS_OFFLOAD_BIT, &ct->status); out: regs->verdict.code = NFT_BREAK; } static int nft_flow_offload_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { unsigned int hook_mask = (1 << NF_INET_FORWARD); if (ctx->family != NFPROTO_IPV4 && ctx->family != NFPROTO_IPV6 && ctx->family != NFPROTO_INET) return -EOPNOTSUPP; return nft_chain_validate_hooks(ctx->chain, hook_mask); } static const struct nla_policy nft_flow_offload_policy[NFTA_FLOW_MAX + 1] = { [NFTA_FLOW_TABLE_NAME] = { .type = NLA_STRING, .len = NFT_NAME_MAXLEN - 1 }, }; static int nft_flow_offload_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_flow_offload *priv = nft_expr_priv(expr); u8 genmask = nft_genmask_next(ctx->net); struct nft_flowtable *flowtable; if (!tb[NFTA_FLOW_TABLE_NAME]) return -EINVAL; flowtable = nft_flowtable_lookup(ctx->net, ctx->table, tb[NFTA_FLOW_TABLE_NAME], genmask); if (IS_ERR(flowtable)) return PTR_ERR(flowtable); if (!nft_use_inc(&flowtable->use)) return -EMFILE; priv->flowtable = flowtable; return nf_ct_netns_get(ctx->net, ctx->family); } static void nft_flow_offload_deactivate(const struct nft_ctx *ctx, const struct nft_expr *expr, enum nft_trans_phase phase) { struct nft_flow_offload *priv = nft_expr_priv(expr); nf_tables_deactivate_flowtable(ctx, priv->flowtable, phase); } static void nft_flow_offload_activate(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_flow_offload *priv = nft_expr_priv(expr); nft_use_inc_restore(&priv->flowtable->use); } static void nft_flow_offload_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { nf_ct_netns_put(ctx->net, ctx->family); } static int nft_flow_offload_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { struct nft_flow_offload *priv = nft_expr_priv(expr); if (nla_put_string(skb, NFTA_FLOW_TABLE_NAME, priv->flowtable->name)) goto nla_put_failure; return 0; nla_put_failure: return -1; } static struct nft_expr_type nft_flow_offload_type; static const struct nft_expr_ops nft_flow_offload_ops = { .type = &nft_flow_offload_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_flow_offload)), .eval = nft_flow_offload_eval, .init = nft_flow_offload_init, .activate = nft_flow_offload_activate, .deactivate = nft_flow_offload_deactivate, .destroy = nft_flow_offload_destroy, .validate = nft_flow_offload_validate, .dump = nft_flow_offload_dump, }; static struct nft_expr_type nft_flow_offload_type __read_mostly = { .name = "flow_offload", .ops = &nft_flow_offload_ops, .policy = nft_flow_offload_policy, .maxattr = NFTA_FLOW_MAX, .owner = THIS_MODULE, }; static int flow_offload_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (event != NETDEV_DOWN) return NOTIFY_DONE; nf_flow_table_cleanup(dev); return NOTIFY_DONE; } static struct notifier_block flow_offload_netdev_notifier = { .notifier_call = flow_offload_netdev_event, }; static int __init nft_flow_offload_module_init(void) { int err; err = register_netdevice_notifier(&flow_offload_netdev_notifier); if (err) goto err; err = nft_register_expr(&nft_flow_offload_type); if (err < 0) goto register_expr; return 0; register_expr: unregister_netdevice_notifier(&flow_offload_netdev_notifier); err: return err; } static void __exit nft_flow_offload_module_exit(void) { nft_unregister_expr(&nft_flow_offload_type); unregister_netdevice_notifier(&flow_offload_netdev_notifier); } module_init(nft_flow_offload_module_init); module_exit(nft_flow_offload_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_ALIAS_NFT_EXPR("flow_offload"); MODULE_DESCRIPTION("nftables hardware flow offload module"); |
| 20 20 20 2 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 | /* mpiutil.ac - Utility functions for MPI * Copyright (C) 1998, 1999 Free Software Foundation, Inc. * * This file is part of GnuPG. * * GnuPG is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * GnuPG is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */ #include <linux/export.h> #include "mpi-internal.h" /**************** * Note: It was a bad idea to use the number of limbs to allocate * because on a alpha the limbs are large but we normally need * integers of n bits - So we should change this to bits (or bytes). * * But mpi_alloc is used in a lot of places :-) */ MPI mpi_alloc(unsigned nlimbs) { MPI a; a = kmalloc_obj(*a); if (!a) return a; if (nlimbs) { a->d = mpi_alloc_limb_space(nlimbs); if (!a->d) { kfree(a); return NULL; } } else { a->d = NULL; } a->alloced = nlimbs; a->nlimbs = 0; a->sign = 0; a->flags = 0; a->nbits = 0; return a; } EXPORT_SYMBOL_GPL(mpi_alloc); mpi_ptr_t mpi_alloc_limb_space(unsigned nlimbs) { size_t len = nlimbs * sizeof(mpi_limb_t); if (!len) return NULL; return kmalloc(len, GFP_KERNEL); } void mpi_free_limb_space(mpi_ptr_t a) { if (!a) return; kfree_sensitive(a); } void mpi_assign_limb_space(MPI a, mpi_ptr_t ap, unsigned nlimbs) { mpi_free_limb_space(a->d); a->d = ap; a->alloced = nlimbs; } /**************** * Resize the array of A to NLIMBS. the additional space is cleared * (set to 0) [done by m_realloc()] */ int mpi_resize(MPI a, unsigned nlimbs) { void *p; if (nlimbs <= a->alloced) return 0; /* no need to do it */ if (a->d) { p = kzalloc_objs(mpi_limb_t, nlimbs); if (!p) return -ENOMEM; memcpy(p, a->d, a->alloced * sizeof(mpi_limb_t)); kfree_sensitive(a->d); a->d = p; } else { a->d = kzalloc_objs(mpi_limb_t, nlimbs); if (!a->d) return -ENOMEM; } a->alloced = nlimbs; return 0; } void mpi_free(MPI a) { if (!a) return; if (a->flags & 4) kfree_sensitive(a->d); else mpi_free_limb_space(a->d); if (a->flags & ~7) pr_info("invalid flag value in mpi\n"); kfree(a); } EXPORT_SYMBOL_GPL(mpi_free); /**************** * Note: This copy function should not interpret the MPI * but copy it transparently. */ MPI mpi_copy(MPI a) { int i; MPI b; if (a) { b = mpi_alloc(a->nlimbs); if (!b) return NULL; b->nlimbs = a->nlimbs; b->sign = a->sign; b->flags = a->flags; b->flags &= ~(16|32); /* Reset the immutable and constant flags. */ for (i = 0; i < b->nlimbs; i++) b->d[i] = a->d[i]; } else b = NULL; return b; } MODULE_DESCRIPTION("Multiprecision maths library"); MODULE_LICENSE("GPL"); |
| 2 2 2 2 2 2 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 | // SPDX-License-Identifier: GPL-2.0 #include <linux/debugfs.h> #include <asm/apic.h> #include <asm/processor.h> #include "cpu.h" static int cpu_debug_show(struct seq_file *m, void *p) { unsigned long cpu = (unsigned long)m->private; struct cpuinfo_x86 *c = per_cpu_ptr(&cpu_info, cpu); seq_printf(m, "online: %d\n", cpu_online(cpu)); if (!c->initialized) return 0; seq_printf(m, "initial_apicid: 0x%x\n", c->topo.initial_apicid); seq_printf(m, "apicid: 0x%x\n", c->topo.apicid); seq_printf(m, "pkg_id: %u\n", c->topo.pkg_id); seq_printf(m, "die_id: %u\n", c->topo.die_id); seq_printf(m, "cu_id: %u\n", c->topo.cu_id); seq_printf(m, "core_id: %u\n", c->topo.core_id); seq_printf(m, "cpu_type: %s\n", get_topology_cpu_type_name(c)); seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id); seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id); seq_printf(m, "logical_core_id: %u\n", c->topo.logical_core_id); seq_printf(m, "llc_id: %u\n", c->topo.llc_id); seq_printf(m, "l2c_id: %u\n", c->topo.l2c_id); seq_printf(m, "amd_node_id: %u\n", c->topo.amd_node_id); seq_printf(m, "amd_nodes_per_pkg: %u\n", topology_amd_nodes_per_pkg()); seq_printf(m, "num_threads: %u\n", __num_threads_per_package); seq_printf(m, "num_cores: %u\n", __num_cores_per_package); seq_printf(m, "max_dies_per_pkg: %u\n", __max_dies_per_package); seq_printf(m, "max_threads_per_core:%u\n", __max_threads_per_core); return 0; } static int cpu_debug_open(struct inode *inode, struct file *file) { return single_open(file, cpu_debug_show, inode->i_private); } static const struct file_operations dfs_cpu_ops = { .open = cpu_debug_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; static int dom_debug_show(struct seq_file *m, void *p) { static const char *domain_names[TOPO_MAX_DOMAIN] = { [TOPO_SMT_DOMAIN] = "Thread", [TOPO_CORE_DOMAIN] = "Core", [TOPO_MODULE_DOMAIN] = "Module", [TOPO_TILE_DOMAIN] = "Tile", [TOPO_DIE_DOMAIN] = "Die", [TOPO_DIEGRP_DOMAIN] = "DieGrp", [TOPO_PKG_DOMAIN] = "Package", }; unsigned int dom, nthreads = 1; for (dom = 0; dom < TOPO_MAX_DOMAIN; dom++) { nthreads *= x86_topo_system.dom_size[dom]; seq_printf(m, "domain: %-10s shift: %u dom_size: %5u max_threads: %5u\n", domain_names[dom], x86_topo_system.dom_shifts[dom], x86_topo_system.dom_size[dom], nthreads); } return 0; } static int dom_debug_open(struct inode *inode, struct file *file) { return single_open(file, dom_debug_show, inode->i_private); } static const struct file_operations dfs_dom_ops = { .open = dom_debug_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; static __init int cpu_init_debugfs(void) { struct dentry *dir, *base = debugfs_create_dir("topo", arch_debugfs_dir); unsigned long id; char name[24]; debugfs_create_file("domains", 0444, base, NULL, &dfs_dom_ops); dir = debugfs_create_dir("cpus", base); for_each_possible_cpu(id) { sprintf(name, "%lu", id); debugfs_create_file(name, 0444, dir, (void *)id, &dfs_cpu_ops); } return 0; } late_initcall(cpu_init_debugfs); |
| 2 1 96 783 784 440 440 2 1 2 2 1 1 2 440 4 440 440 440 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 | // SPDX-License-Identifier: GPL-2.0-or-later /* * vrf.c: device driver to encapsulate a VRF space * * Copyright (c) 2015 Cumulus Networks. All rights reserved. * Copyright (c) 2015 Shrijeet Mukherjee <shm@cumulusnetworks.com> * Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com> * * Based on dummy, team and ipvlan drivers */ #include <linux/ethtool.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ip.h> #include <linux/init.h> #include <linux/moduleparam.h> #include <linux/netfilter.h> #include <linux/rtnetlink.h> #include <net/rtnetlink.h> #include <linux/u64_stats_sync.h> #include <linux/hashtable.h> #include <linux/spinlock_types.h> #include <linux/inetdevice.h> #include <net/arp.h> #include <net/flow.h> #include <net/ip.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> #include <net/route.h> #include <net/addrconf.h> #include <net/l3mdev.h> #include <net/fib_rules.h> #include <net/netdev_lock.h> #include <net/sch_generic.h> #include <net/netns/generic.h> #include <net/netfilter/nf_conntrack.h> #define DRV_NAME "vrf" #define DRV_VERSION "1.1" #define FIB_RULE_PREF 1000 /* default preference for FIB rules */ #define HT_MAP_BITS 4 #define HASH_INITVAL ((u32)0xcafef00d) struct vrf_map { DECLARE_HASHTABLE(ht, HT_MAP_BITS); spinlock_t vmap_lock; /* shared_tables: * count how many distinct tables do not comply with the strict mode * requirement. * shared_tables value must be 0 in order to enable the strict mode. * * example of the evolution of shared_tables: * | time * add vrf0 --> table 100 shared_tables = 0 | t0 * add vrf1 --> table 101 shared_tables = 0 | t1 * add vrf2 --> table 100 shared_tables = 1 | t2 * add vrf3 --> table 100 shared_tables = 1 | t3 * add vrf4 --> table 101 shared_tables = 2 v t4 * * shared_tables is a "step function" (or "staircase function") * and it is increased by one when the second vrf is associated to a * table. * * at t2, vrf0 and vrf2 are bound to table 100: shared_tables = 1. * * at t3, another dev (vrf3) is bound to the same table 100 but the * value of shared_tables is still 1. * This means that no matter how many new vrfs will register on the * table 100, the shared_tables will not increase (considering only * table 100). * * at t4, vrf4 is bound to table 101, and shared_tables = 2. * * Looking at the value of shared_tables we can immediately know if * the strict_mode can or cannot be enforced. Indeed, strict_mode * can be enforced iff shared_tables = 0. * * Conversely, shared_tables is decreased when a vrf is de-associated * from a table with exactly two associated vrfs. */ u32 shared_tables; bool strict_mode; }; struct vrf_map_elem { struct hlist_node hnode; struct list_head vrf_list; /* VRFs registered to this table */ u32 table_id; int users; int ifindex; }; static unsigned int vrf_net_id; /* per netns vrf data */ struct netns_vrf { /* protected by rtnl lock */ bool add_fib_rules; struct vrf_map vmap; struct ctl_table_header *ctl_hdr; }; struct net_vrf { struct rtable *rth; struct rt6_info *rt6; #if IS_ENABLED(CONFIG_IPV6) struct fib6_table *fib6_table; #endif u32 tb_id; struct list_head me_list; /* entry in vrf_map_elem */ int ifindex; }; static void vrf_tx_error(struct net_device *vrf_dev, struct sk_buff *skb) { vrf_dev->stats.tx_errors++; kfree_skb(skb); } static struct vrf_map *netns_vrf_map(struct net *net) { struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id); return &nn_vrf->vmap; } static struct vrf_map *netns_vrf_map_by_dev(struct net_device *dev) { return netns_vrf_map(dev_net(dev)); } static int vrf_map_elem_get_vrf_ifindex(struct vrf_map_elem *me) { struct list_head *me_head = &me->vrf_list; struct net_vrf *vrf; if (list_empty(me_head)) return -ENODEV; vrf = list_first_entry(me_head, struct net_vrf, me_list); return vrf->ifindex; } static struct vrf_map_elem *vrf_map_elem_alloc(gfp_t flags) { struct vrf_map_elem *me; me = kmalloc_obj(*me, flags); if (!me) return NULL; return me; } static void vrf_map_elem_free(struct vrf_map_elem *me) { kfree(me); } static void vrf_map_elem_init(struct vrf_map_elem *me, int table_id, int ifindex, int users) { me->table_id = table_id; me->ifindex = ifindex; me->users = users; INIT_LIST_HEAD(&me->vrf_list); } static struct vrf_map_elem *vrf_map_lookup_elem(struct vrf_map *vmap, u32 table_id) { struct vrf_map_elem *me; u32 key; key = jhash_1word(table_id, HASH_INITVAL); hash_for_each_possible(vmap->ht, me, hnode, key) { if (me->table_id == table_id) return me; } return NULL; } static void vrf_map_add_elem(struct vrf_map *vmap, struct vrf_map_elem *me) { u32 table_id = me->table_id; u32 key; key = jhash_1word(table_id, HASH_INITVAL); hash_add(vmap->ht, &me->hnode, key); } static void vrf_map_del_elem(struct vrf_map_elem *me) { hash_del(&me->hnode); } static void vrf_map_lock(struct vrf_map *vmap) __acquires(&vmap->vmap_lock) { spin_lock(&vmap->vmap_lock); } static void vrf_map_unlock(struct vrf_map *vmap) __releases(&vmap->vmap_lock) { spin_unlock(&vmap->vmap_lock); } /* called with rtnl lock held */ static int vrf_map_register_dev(struct net_device *dev, struct netlink_ext_ack *extack) { struct vrf_map *vmap = netns_vrf_map_by_dev(dev); struct net_vrf *vrf = netdev_priv(dev); struct vrf_map_elem *new_me, *me; u32 table_id = vrf->tb_id; bool free_new_me = false; int users; int res; /* we pre-allocate elements used in the spin-locked section (so that we * keep the spinlock as short as possible). */ new_me = vrf_map_elem_alloc(GFP_KERNEL); if (!new_me) return -ENOMEM; vrf_map_elem_init(new_me, table_id, dev->ifindex, 0); vrf_map_lock(vmap); me = vrf_map_lookup_elem(vmap, table_id); if (!me) { me = new_me; vrf_map_add_elem(vmap, me); goto link_vrf; } /* we already have an entry in the vrf_map, so it means there is (at * least) a vrf registered on the specific table. */ free_new_me = true; if (vmap->strict_mode) { /* vrfs cannot share the same table */ NL_SET_ERR_MSG(extack, "Table is used by another VRF"); res = -EBUSY; goto unlock; } link_vrf: users = ++me->users; if (users == 2) ++vmap->shared_tables; list_add(&vrf->me_list, &me->vrf_list); res = 0; unlock: vrf_map_unlock(vmap); /* clean-up, if needed */ if (free_new_me) vrf_map_elem_free(new_me); return res; } /* called with rtnl lock held */ static void vrf_map_unregister_dev(struct net_device *dev) { struct vrf_map *vmap = netns_vrf_map_by_dev(dev); struct net_vrf *vrf = netdev_priv(dev); u32 table_id = vrf->tb_id; struct vrf_map_elem *me; int users; vrf_map_lock(vmap); me = vrf_map_lookup_elem(vmap, table_id); if (!me) goto unlock; list_del(&vrf->me_list); users = --me->users; if (users == 1) { --vmap->shared_tables; } else if (users == 0) { vrf_map_del_elem(me); /* no one will refer to this element anymore */ vrf_map_elem_free(me); } unlock: vrf_map_unlock(vmap); } /* return the vrf device index associated with the table_id */ static int vrf_ifindex_lookup_by_table_id(struct net *net, u32 table_id) { struct vrf_map *vmap = netns_vrf_map(net); struct vrf_map_elem *me; int ifindex; vrf_map_lock(vmap); if (!vmap->strict_mode) { ifindex = -EPERM; goto unlock; } me = vrf_map_lookup_elem(vmap, table_id); if (!me) { ifindex = -ENODEV; goto unlock; } ifindex = vrf_map_elem_get_vrf_ifindex(me); unlock: vrf_map_unlock(vmap); return ifindex; } /* by default VRF devices do not have a qdisc and are expected * to be created with only a single queue. */ static bool qdisc_tx_is_default(const struct net_device *dev) { struct netdev_queue *txq; if (dev->num_tx_queues > 1) return false; txq = netdev_get_tx_queue(dev, 0); return qdisc_txq_has_no_queue(txq); } /* Local traffic destined to local address. Reinsert the packet to rx * path, similar to loopback handling. */ static int vrf_local_xmit(struct sk_buff *skb, struct net_device *dev, struct dst_entry *dst) { unsigned int len = skb->len; skb_orphan(skb); skb_dst_set(skb, dst); /* set pkt_type to avoid skb hitting packet taps twice - * once on Tx and again in Rx processing */ skb->pkt_type = PACKET_LOOPBACK; skb->protocol = eth_type_trans(skb, dev); if (likely(__netif_rx(skb) == NET_RX_SUCCESS)) dev_dstats_rx_add(dev, len); else dev_dstats_rx_dropped(dev); return NETDEV_TX_OK; } static void vrf_nf_set_untracked(struct sk_buff *skb) { if (skb_get_nfct(skb) == 0) nf_ct_set(skb, NULL, IP_CT_UNTRACKED); } static void vrf_nf_reset_ct(struct sk_buff *skb) { if (skb_get_nfct(skb) == IP_CT_UNTRACKED) nf_reset_ct(skb); } #if IS_ENABLED(CONFIG_IPV6) static int vrf_ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; vrf_nf_reset_ct(skb); err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); if (likely(err == 1)) err = dst_output(net, sk, skb); return err; } static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, struct net_device *dev) { const struct ipv6hdr *iph; struct net *net = dev_net(skb->dev); struct flowi6 fl6; int ret = NET_XMIT_DROP; struct dst_entry *dst; struct dst_entry *dst_null = &net->ipv6.ip6_null_entry->dst; if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr))) goto err; iph = ipv6_hdr(skb); memset(&fl6, 0, sizeof(fl6)); /* needed to match OIF rule */ fl6.flowi6_l3mdev = dev->ifindex; fl6.flowi6_iif = LOOPBACK_IFINDEX; fl6.daddr = iph->daddr; fl6.saddr = iph->saddr; fl6.flowlabel = ip6_flowinfo(iph); fl6.flowi6_mark = skb->mark; fl6.flowi6_proto = iph->nexthdr; dst = ip6_dst_lookup_flow(net, NULL, &fl6, NULL); if (IS_ERR(dst) || dst == dst_null) goto err; skb_dst_drop(skb); /* if dst.dev is the VRF device again this is locally originated traffic * destined to a local address. Short circuit to Rx path. */ if (dst->dev == dev) return vrf_local_xmit(skb, dev, dst); skb_dst_set(skb, dst); /* strip the ethernet header added for pass through VRF device */ __skb_pull(skb, skb_network_offset(skb)); memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); ret = vrf_ip6_local_out(net, skb->sk, skb); if (unlikely(net_xmit_eval(ret))) dev->stats.tx_errors++; else ret = NET_XMIT_SUCCESS; return ret; err: vrf_tx_error(dev, skb); return NET_XMIT_DROP; } #else static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, struct net_device *dev) { vrf_tx_error(dev, skb); return NET_XMIT_DROP; } #endif /* based on ip_local_out; can't use it b/c the dst is switched pointing to us */ static int vrf_ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; vrf_nf_reset_ct(skb); err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); if (likely(err == 1)) err = dst_output(net, sk, skb); return err; } static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb, struct net_device *vrf_dev) { struct iphdr *ip4h; int ret = NET_XMIT_DROP; struct flowi4 fl4; struct net *net = dev_net(vrf_dev); struct rtable *rt; if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr))) goto err; ip4h = ip_hdr(skb); memset(&fl4, 0, sizeof(fl4)); /* needed to match OIF rule */ fl4.flowi4_l3mdev = vrf_dev->ifindex; fl4.flowi4_iif = LOOPBACK_IFINDEX; fl4.flowi4_dscp = ip4h_dscp(ip4h); fl4.flowi4_flags = FLOWI_FLAG_ANYSRC; fl4.flowi4_proto = ip4h->protocol; fl4.daddr = ip4h->daddr; fl4.saddr = ip4h->saddr; rt = ip_route_output_flow(net, &fl4, NULL); if (IS_ERR(rt)) goto err; skb_dst_drop(skb); /* if dst.dev is the VRF device again this is locally originated traffic * destined to a local address. Short circuit to Rx path. */ if (rt->dst.dev == vrf_dev) return vrf_local_xmit(skb, vrf_dev, &rt->dst); skb_dst_set(skb, &rt->dst); /* strip the ethernet header added for pass through VRF device */ __skb_pull(skb, skb_network_offset(skb)); if (!ip4h->saddr) { ip4h->saddr = inet_select_addr(skb_dst(skb)->dev, 0, RT_SCOPE_LINK); } memset(IPCB(skb), 0, sizeof(*IPCB(skb))); ret = vrf_ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); if (unlikely(net_xmit_eval(ret))) vrf_dev->stats.tx_errors++; else ret = NET_XMIT_SUCCESS; out: return ret; err: vrf_tx_error(vrf_dev, skb); goto out; } static netdev_tx_t is_ip_tx_frame(struct sk_buff *skb, struct net_device *dev) { switch (skb->protocol) { case htons(ETH_P_IP): return vrf_process_v4_outbound(skb, dev); case htons(ETH_P_IPV6): return vrf_process_v6_outbound(skb, dev); default: vrf_tx_error(dev, skb); return NET_XMIT_DROP; } } static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev) { unsigned int len = skb->len; netdev_tx_t ret; ret = is_ip_tx_frame(skb, dev); if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) dev_dstats_tx_add(dev, len); else dev_dstats_tx_dropped(dev); return ret; } static void vrf_finish_direct(struct sk_buff *skb) { struct net_device *vrf_dev = skb->dev; if (!list_empty(&vrf_dev->ptype_all) && likely(skb_headroom(skb) >= ETH_HLEN)) { struct ethhdr *eth = skb_push(skb, ETH_HLEN); ether_addr_copy(eth->h_source, vrf_dev->dev_addr); eth_zero_addr(eth->h_dest); eth->h_proto = skb->protocol; rcu_read_lock_bh(); dev_queue_xmit_nit(skb, vrf_dev); rcu_read_unlock_bh(); skb_pull(skb, ETH_HLEN); } vrf_nf_reset_ct(skb); } #if IS_ENABLED(CONFIG_IPV6) /* modelled after ip6_finish_output2 */ static int vrf_finish_output6(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct net_device *dev = dst->dev; const struct in6_addr *nexthop; struct neighbour *neigh; int ret; vrf_nf_reset_ct(skb); skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; rcu_read_lock(); nexthop = rt6_nexthop(dst_rt6_info(dst), &ipv6_hdr(skb)->daddr); neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop); if (unlikely(!neigh)) neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); if (!IS_ERR(neigh)) { sock_confirm_neigh(skb, neigh); ret = neigh_output(neigh, skb, false); rcu_read_unlock(); return ret; } rcu_read_unlock(); IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); kfree_skb(skb); return -EINVAL; } /* modelled after ip6_output */ static int vrf_output6(struct net *net, struct sock *sk, struct sk_buff *skb) { return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, skb, NULL, skb_dst(skb)->dev, vrf_finish_output6, !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } /* set dst on skb to send packet to us via dev_xmit path. Allows * packet to go through device based features such as qdisc, netfilter * hooks and packet sockets with skb->dev set to vrf device. */ static struct sk_buff *vrf_ip6_out_redirect(struct net_device *vrf_dev, struct sk_buff *skb) { struct net_vrf *vrf = netdev_priv(vrf_dev); struct rt6_info *rt6; rt6 = vrf->rt6; dst_hold(&rt6->dst); skb_dst_drop(skb); skb_dst_set(skb, &rt6->dst); return skb; } static int vrf_output6_direct_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { vrf_finish_direct(skb); return vrf_ip6_local_out(net, sk, skb); } static int vrf_output6_direct(struct net *net, struct sock *sk, struct sk_buff *skb) { int err = 1; skb->protocol = htons(ETH_P_IPV6); if (!(IPCB(skb)->flags & IPSKB_REROUTED)) err = nf_hook(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, skb, NULL, skb->dev, vrf_output6_direct_finish); if (likely(err == 1)) vrf_finish_direct(skb); return err; } static int vrf_ip6_out_direct_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; err = vrf_output6_direct(net, sk, skb); if (likely(err == 1)) err = vrf_ip6_local_out(net, sk, skb); return err; } static struct sk_buff *vrf_ip6_out_direct(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(vrf_dev); int err; skb->dev = vrf_dev; err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, vrf_dev, vrf_ip6_out_direct_finish); if (likely(err == 1)) err = vrf_output6_direct(net, sk, skb); if (likely(err == 1)) return skb; return NULL; } static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { /* don't divert link scope packets */ if (rt6_need_strict(&ipv6_hdr(skb)->daddr)) return skb; vrf_nf_set_untracked(skb); if (qdisc_tx_is_default(vrf_dev) || IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) return vrf_ip6_out_direct(vrf_dev, sk, skb); return vrf_ip6_out_redirect(vrf_dev, skb); } /* holding rtnl */ static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf) { struct rt6_info *rt6 = vrf->rt6; if (rt6) { dst_dev_put(&rt6->dst); dst_release(&rt6->dst); } } static int vrf_rt6_create(struct net_device *dev) { int flags = DST_NOPOLICY | DST_NOXFRM; struct net_vrf *vrf = netdev_priv(dev); struct net *net = dev_net(dev); struct rt6_info *rt6; int rc = -ENOMEM; /* IPv6 can be CONFIG enabled and then disabled runtime */ if (!ipv6_mod_enabled()) return 0; vrf->fib6_table = fib6_new_table(net, vrf->tb_id); if (!vrf->fib6_table) goto out; /* create a dst for routing packets out a VRF device */ rt6 = ip6_dst_alloc(net, dev, flags); if (!rt6) goto out; rt6->dst.output = vrf_output6; vrf->rt6 = rt6; rc = 0; out: return rc; } #else static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { return skb; } static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf) { } static int vrf_rt6_create(struct net_device *dev) { return 0; } #endif /* modelled after ip_finish_output2 */ static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct rtable *rt = dst_rtable(dst); struct net_device *dev = dst->dev; unsigned int hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; bool is_v6gw = false; vrf_nf_reset_ct(skb); /* Be paranoid, rather than too clever. */ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { skb = skb_expand_head(skb, hh_len); if (!skb) { dev->stats.tx_errors++; return -ENOMEM; } } rcu_read_lock(); neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); if (!IS_ERR(neigh)) { int ret; sock_confirm_neigh(skb, neigh); /* if crossing protocols, can not use the cached header */ ret = neigh_output(neigh, skb, is_v6gw); rcu_read_unlock(); return ret; } rcu_read_unlock(); vrf_tx_error(skb->dev, skb); return -EINVAL; } static int vrf_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); skb->dev = dev; skb->protocol = htons(ETH_P_IP); return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb, NULL, dev, vrf_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } /* set dst on skb to send packet to us via dev_xmit path. Allows * packet to go through device based features such as qdisc, netfilter * hooks and packet sockets with skb->dev set to vrf device. */ static struct sk_buff *vrf_ip_out_redirect(struct net_device *vrf_dev, struct sk_buff *skb) { struct net_vrf *vrf = netdev_priv(vrf_dev); struct rtable *rth; rth = vrf->rth; dst_hold(&rth->dst); skb_dst_drop(skb); skb_dst_set(skb, &rth->dst); return skb; } static int vrf_output_direct_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { vrf_finish_direct(skb); return vrf_ip_local_out(net, sk, skb); } static int vrf_output_direct(struct net *net, struct sock *sk, struct sk_buff *skb) { int err = 1; skb->protocol = htons(ETH_P_IP); if (!(IPCB(skb)->flags & IPSKB_REROUTED)) err = nf_hook(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb, NULL, skb->dev, vrf_output_direct_finish); if (likely(err == 1)) vrf_finish_direct(skb); return err; } static int vrf_ip_out_direct_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; err = vrf_output_direct(net, sk, skb); if (likely(err == 1)) err = vrf_ip_local_out(net, sk, skb); return err; } static struct sk_buff *vrf_ip_out_direct(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(vrf_dev); int err; skb->dev = vrf_dev; err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, vrf_dev, vrf_ip_out_direct_finish); if (likely(err == 1)) err = vrf_output_direct(net, sk, skb); if (likely(err == 1)) return skb; return NULL; } static struct sk_buff *vrf_ip_out(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { /* don't divert multicast or local broadcast */ if (ipv4_is_multicast(ip_hdr(skb)->daddr) || ipv4_is_lbcast(ip_hdr(skb)->daddr)) return skb; vrf_nf_set_untracked(skb); if (qdisc_tx_is_default(vrf_dev) || IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) return vrf_ip_out_direct(vrf_dev, sk, skb); return vrf_ip_out_redirect(vrf_dev, skb); } /* called with rcu lock held */ static struct sk_buff *vrf_l3_out(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb, u16 proto) { switch (proto) { case AF_INET: return vrf_ip_out(vrf_dev, sk, skb); case AF_INET6: return vrf_ip6_out(vrf_dev, sk, skb); } return skb; } /* holding rtnl */ static void vrf_rtable_release(struct net_device *dev, struct net_vrf *vrf) { struct rtable *rth = vrf->rth; dst_dev_put(&rth->dst); dst_release(&rth->dst); } static int vrf_rtable_create(struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); struct rtable *rth; if (!fib_new_table(dev_net(dev), vrf->tb_id)) return -ENOMEM; /* create a dst for routing packets out through a VRF device */ rth = rt_dst_alloc(dev, 0, RTN_UNICAST, 1); if (!rth) return -ENOMEM; rth->dst.output = vrf_output; vrf->rth = rth; return 0; } /**************************** device handling ********************/ /* cycle interface to flush neighbor cache and move routes across tables */ static void cycle_netdev(struct net_device *dev, struct netlink_ext_ack *extack) { unsigned int flags = dev->flags; int ret; if (!netif_running(dev)) return; ret = dev_change_flags(dev, flags & ~IFF_UP, extack); if (ret >= 0) ret = dev_change_flags(dev, flags, extack); if (ret < 0) { netdev_err(dev, "Failed to cycle device %s; route tables might be wrong!\n", dev->name); } } static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev, struct netlink_ext_ack *extack) { int ret; /* do not allow loopback device to be enslaved to a VRF. * The vrf device acts as the loopback for the vrf. */ if (port_dev == dev_net(dev)->loopback_dev) { NL_SET_ERR_MSG(extack, "Can not enslave loopback device to a VRF"); return -EOPNOTSUPP; } port_dev->priv_flags |= IFF_L3MDEV_SLAVE; ret = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL, extack); if (ret < 0) goto err; cycle_netdev(port_dev, extack); return 0; err: port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE; synchronize_net(); return ret; } static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev, struct netlink_ext_ack *extack) { if (netif_is_l3_master(port_dev)) { NL_SET_ERR_MSG(extack, "Can not enslave an L3 master device to a VRF"); return -EINVAL; } if (netif_is_l3_slave(port_dev)) return -EINVAL; return do_vrf_add_slave(dev, port_dev, extack); } /* inverse of do_vrf_add_slave */ static int do_vrf_del_slave(struct net_device *dev, struct net_device *port_dev, bool needs_sync) { netdev_upper_dev_unlink(port_dev, dev); port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE; /* Make sure that concurrent RCU readers that identified the device * as a VRF port see a VRF master or no master at all. */ if (needs_sync) synchronize_net(); cycle_netdev(port_dev, NULL); return 0; } static int vrf_del_slave(struct net_device *dev, struct net_device *port_dev) { return do_vrf_del_slave(dev, port_dev, true); } static void vrf_dev_uninit(struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); vrf_rtable_release(dev, vrf); vrf_rt6_release(dev, vrf); } static int vrf_dev_init(struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); /* create the default dst which points back to us */ if (vrf_rtable_create(dev) != 0) goto out_nomem; if (vrf_rt6_create(dev) != 0) goto out_rth; dev->flags = IFF_MASTER | IFF_NOARP; /* similarly, oper state is irrelevant; set to up to avoid confusion */ dev->operstate = IF_OPER_UP; netdev_lockdep_set_classes(dev); return 0; out_rth: vrf_rtable_release(dev, vrf); out_nomem: return -ENOMEM; } static const struct net_device_ops vrf_netdev_ops = { .ndo_init = vrf_dev_init, .ndo_uninit = vrf_dev_uninit, .ndo_start_xmit = vrf_xmit, .ndo_set_mac_address = eth_mac_addr, .ndo_add_slave = vrf_add_slave, .ndo_del_slave = vrf_del_slave, }; static u32 vrf_fib_table(const struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); return vrf->tb_id; } static int vrf_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { kfree_skb(skb); return 0; } static struct sk_buff *vrf_rcv_nfhook(u8 pf, unsigned int hook, struct sk_buff *skb, struct net_device *dev) { struct net *net = dev_net(dev); if (nf_hook(pf, hook, net, NULL, skb, dev, NULL, vrf_rcv_finish) != 1) skb = NULL; /* kfree_skb(skb) handled by nf code */ return skb; } static int vrf_prepare_mac_header(struct sk_buff *skb, struct net_device *vrf_dev, u16 proto) { struct ethhdr *eth; int err; /* in general, we do not know if there is enough space in the head of * the packet for hosting the mac header. */ err = skb_cow_head(skb, LL_RESERVED_SPACE(vrf_dev)); if (unlikely(err)) /* no space in the skb head */ return -ENOBUFS; __skb_push(skb, ETH_HLEN); eth = (struct ethhdr *)skb->data; skb_reset_mac_header(skb); skb_reset_mac_len(skb); /* we set the ethernet destination and the source addresses to the * address of the VRF device. */ ether_addr_copy(eth->h_dest, vrf_dev->dev_addr); ether_addr_copy(eth->h_source, vrf_dev->dev_addr); eth->h_proto = htons(proto); /* the destination address of the Ethernet frame corresponds to the * address set on the VRF interface; therefore, the packet is intended * to be processed locally. */ skb->protocol = eth->h_proto; skb->pkt_type = PACKET_HOST; skb_postpush_rcsum(skb, skb->data, ETH_HLEN); skb_pull_inline(skb, ETH_HLEN); return 0; } /* prepare and add the mac header to the packet if it was not set previously. * In this way, packet sniffers such as tcpdump can parse the packet correctly. * If the mac header was already set, the original mac header is left * untouched and the function returns immediately. */ static int vrf_add_mac_header_if_unset(struct sk_buff *skb, struct net_device *vrf_dev, u16 proto, struct net_device *orig_dev) { if (skb_mac_header_was_set(skb) && dev_has_header(orig_dev)) return 0; return vrf_prepare_mac_header(skb, vrf_dev, proto); } #if IS_ENABLED(CONFIG_IPV6) /* neighbor handling is done with actual device; do not want * to flip skb->dev for those ndisc packets. This really fails * for multiple next protocols (e.g., NEXTHDR_HOP). But it is * a start. */ static bool ipv6_ndisc_frame(const struct sk_buff *skb) { const struct ipv6hdr *iph = ipv6_hdr(skb); bool rc = false; if (iph->nexthdr == NEXTHDR_ICMP) { const struct icmp6hdr *icmph; struct icmp6hdr _icmph; icmph = skb_header_pointer(skb, sizeof(*iph), sizeof(_icmph), &_icmph); if (!icmph) goto out; switch (icmph->icmp6_type) { case NDISC_ROUTER_SOLICITATION: case NDISC_ROUTER_ADVERTISEMENT: case NDISC_NEIGHBOUR_SOLICITATION: case NDISC_NEIGHBOUR_ADVERTISEMENT: case NDISC_REDIRECT: rc = true; break; } } out: return rc; } static struct rt6_info *vrf_ip6_route_lookup(struct net *net, const struct net_device *dev, struct flowi6 *fl6, int ifindex, const struct sk_buff *skb, int flags) { struct net_vrf *vrf = netdev_priv(dev); return ip6_pol_route(net, vrf->fib6_table, ifindex, fl6, skb, flags); } static void vrf_ip6_input_dst(struct sk_buff *skb, struct net_device *vrf_dev, int ifindex) { const struct ipv6hdr *iph = ipv6_hdr(skb); struct flowi6 fl6 = { .flowi6_iif = ifindex, .flowi6_mark = skb->mark, .flowi6_proto = iph->nexthdr, .daddr = iph->daddr, .saddr = iph->saddr, .flowlabel = ip6_flowinfo(iph), }; struct net *net = dev_net(vrf_dev); struct rt6_info *rt6; skb_dst_drop(skb); rt6 = vrf_ip6_route_lookup(net, vrf_dev, &fl6, ifindex, skb, RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_IFACE); if (unlikely(!rt6)) return; if (unlikely(&rt6->dst == &net->ipv6.ip6_null_entry->dst)) return; skb_dst_set(skb, &rt6->dst); } static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, struct sk_buff *skb) { int orig_iif = skb->skb_iif; bool need_strict = rt6_need_strict(&ipv6_hdr(skb)->daddr); bool is_ndisc = ipv6_ndisc_frame(skb); /* loopback, multicast & non-ND link-local traffic; do not push through * packet taps again. Reset pkt_type for upper layers to process skb. * For non-loopback strict packets, determine the dst using the original * ifindex. */ if (skb->pkt_type == PACKET_LOOPBACK || (need_strict && !is_ndisc)) { skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; IP6CB(skb)->flags |= IP6SKB_L3SLAVE; if (skb->pkt_type == PACKET_LOOPBACK) skb->pkt_type = PACKET_HOST; else vrf_ip6_input_dst(skb, vrf_dev, orig_iif); goto out; } /* if packet is NDISC then keep the ingress interface */ if (!is_ndisc) { struct net_device *orig_dev = skb->dev; dev_dstats_rx_add(vrf_dev, skb->len); skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; if (!list_empty(&vrf_dev->ptype_all)) { int err; err = vrf_add_mac_header_if_unset(skb, vrf_dev, ETH_P_IPV6, orig_dev); if (likely(!err)) { skb_push(skb, skb->mac_len); dev_queue_xmit_nit(skb, vrf_dev); skb_pull(skb, skb->mac_len); } } IP6CB(skb)->flags |= IP6SKB_L3SLAVE; } if (need_strict) vrf_ip6_input_dst(skb, vrf_dev, orig_iif); skb = vrf_rcv_nfhook(NFPROTO_IPV6, NF_INET_PRE_ROUTING, skb, vrf_dev); out: return skb; } #else static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, struct sk_buff *skb) { return skb; } #endif static struct sk_buff *vrf_ip_rcv(struct net_device *vrf_dev, struct sk_buff *skb) { struct net_device *orig_dev = skb->dev; skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; IPCB(skb)->flags |= IPSKB_L3SLAVE; if (ipv4_is_multicast(ip_hdr(skb)->daddr)) goto out; /* loopback traffic; do not push through packet taps again. * Reset pkt_type for upper layers to process skb */ if (skb->pkt_type == PACKET_LOOPBACK) { skb->pkt_type = PACKET_HOST; goto out; } dev_dstats_rx_add(vrf_dev, skb->len); if (!list_empty(&vrf_dev->ptype_all)) { int err; err = vrf_add_mac_header_if_unset(skb, vrf_dev, ETH_P_IP, orig_dev); if (likely(!err)) { skb_push(skb, skb->mac_len); dev_queue_xmit_nit(skb, vrf_dev); skb_pull(skb, skb->mac_len); } } skb = vrf_rcv_nfhook(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, vrf_dev); out: return skb; } /* called with rcu lock held */ static struct sk_buff *vrf_l3_rcv(struct net_device *vrf_dev, struct sk_buff *skb, u16 proto) { switch (proto) { case AF_INET: return vrf_ip_rcv(vrf_dev, skb); case AF_INET6: return vrf_ip6_rcv(vrf_dev, skb); } return skb; } #if IS_ENABLED(CONFIG_IPV6) /* send to link-local or multicast address via interface enslaved to * VRF device. Force lookup to VRF table without changing flow struct * Note: Caller to this function must hold rcu_read_lock() and no refcnt * is taken on the dst by this function. */ static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev, struct flowi6 *fl6) { struct net *net = dev_net(dev); int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_DST_NOREF; struct dst_entry *dst = NULL; struct rt6_info *rt; /* VRF device does not have a link-local address and * sending packets to link-local or mcast addresses over * a VRF device does not make sense */ if (fl6->flowi6_oif == dev->ifindex) { dst = &net->ipv6.ip6_null_entry->dst; return dst; } if (!ipv6_addr_any(&fl6->saddr)) flags |= RT6_LOOKUP_F_HAS_SADDR; rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif, NULL, flags); if (rt) dst = &rt->dst; return dst; } #endif static const struct l3mdev_ops vrf_l3mdev_ops = { .l3mdev_fib_table = vrf_fib_table, .l3mdev_l3_rcv = vrf_l3_rcv, .l3mdev_l3_out = vrf_l3_out, #if IS_ENABLED(CONFIG_IPV6) .l3mdev_link_scope_lookup = vrf_link_scope_lookup, #endif }; static void vrf_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { strscpy(info->driver, DRV_NAME, sizeof(info->driver)); strscpy(info->version, DRV_VERSION, sizeof(info->version)); } static const struct ethtool_ops vrf_ethtool_ops = { .get_drvinfo = vrf_get_drvinfo, }; static inline size_t vrf_fib_rule_nl_size(void) { size_t sz; sz = NLMSG_ALIGN(sizeof(struct fib_rule_hdr)); sz += nla_total_size(sizeof(u8)); /* FRA_L3MDEV */ sz += nla_total_size(sizeof(u32)); /* FRA_PRIORITY */ sz += nla_total_size(sizeof(u8)); /* FRA_PROTOCOL */ return sz; } static int vrf_fib_rule(const struct net_device *dev, __u8 family, bool add_it) { struct fib_rule_hdr *frh; struct nlmsghdr *nlh; struct sk_buff *skb; int err; if ((family == AF_INET6 || family == RTNL_FAMILY_IP6MR) && !ipv6_mod_enabled()) return 0; skb = nlmsg_new(vrf_fib_rule_nl_size(), GFP_KERNEL); if (!skb) return -ENOMEM; nlh = nlmsg_put(skb, 0, 0, 0, sizeof(*frh), 0); if (!nlh) goto nla_put_failure; /* rule only needs to appear once */ nlh->nlmsg_flags |= NLM_F_EXCL; frh = nlmsg_data(nlh); memset(frh, 0, sizeof(*frh)); frh->family = family; frh->action = FR_ACT_TO_TBL; if (nla_put_u8(skb, FRA_PROTOCOL, RTPROT_KERNEL)) goto nla_put_failure; if (nla_put_u8(skb, FRA_L3MDEV, 1)) goto nla_put_failure; if (nla_put_u32(skb, FRA_PRIORITY, FIB_RULE_PREF)) goto nla_put_failure; nlmsg_end(skb, nlh); if (add_it) { err = fib_newrule(dev_net(dev), skb, nlh, NULL, true); if (err == -EEXIST) err = 0; } else { err = fib_delrule(dev_net(dev), skb, nlh, NULL, true); if (err == -ENOENT) err = 0; } nlmsg_free(skb); return err; nla_put_failure: nlmsg_free(skb); return -EMSGSIZE; } static int vrf_add_fib_rules(const struct net_device *dev) { int err; err = vrf_fib_rule(dev, AF_INET, true); if (err < 0) goto out_err; err = vrf_fib_rule(dev, AF_INET6, true); if (err < 0) goto ipv6_err; #if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES) err = vrf_fib_rule(dev, RTNL_FAMILY_IPMR, true); if (err < 0) goto ipmr_err; #endif #if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES) err = vrf_fib_rule(dev, RTNL_FAMILY_IP6MR, true); if (err < 0) goto ip6mr_err; #endif return 0; #if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES) ip6mr_err: vrf_fib_rule(dev, RTNL_FAMILY_IPMR, false); #endif #if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES) ipmr_err: vrf_fib_rule(dev, AF_INET6, false); #endif ipv6_err: vrf_fib_rule(dev, AF_INET, false); out_err: netdev_err(dev, "Failed to add FIB rules.\n"); return err; } static void vrf_setup(struct net_device *dev) { ether_setup(dev); /* Initialize the device structure. */ dev->netdev_ops = &vrf_netdev_ops; dev->l3mdev_ops = &vrf_l3mdev_ops; dev->ethtool_ops = &vrf_ethtool_ops; dev->needs_free_netdev = true; /* Fill in device structure with ethernet-generic values. */ eth_hw_addr_random(dev); /* don't acquire vrf device's netif_tx_lock when transmitting */ dev->lltx = true; /* don't allow vrf devices to change network namespaces. */ dev->netns_immutable = true; /* does not make sense for a VLAN to be added to a vrf device */ dev->features |= NETIF_F_VLAN_CHALLENGED; /* enable offload features */ dev->features |= NETIF_F_GSO_SOFTWARE; dev->features |= NETIF_F_RXCSUM | NETIF_F_HW_CSUM | NETIF_F_SCTP_CRC; dev->features |= NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA; dev->hw_features = dev->features; dev->hw_enc_features = dev->features; /* default to no qdisc; user can add if desired */ dev->priv_flags |= IFF_NO_QUEUE; dev->priv_flags |= IFF_NO_RX_HANDLER; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; /* VRF devices do not care about MTU, but if the MTU is set * too low then the ipv4 and ipv6 protocols are disabled * which breaks networking. */ dev->min_mtu = IPV6_MIN_MTU; dev->max_mtu = IP6_MAX_MTU; dev->mtu = dev->max_mtu; dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS; } static int vrf_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) { |